In [1]:
import os 

# Enter the project directory that all relative paths below are resolved
# against. Guarded so that re-running this cell (when the kernel is already
# inside app/) is a no-op instead of raising FileNotFoundError.
if os.path.isdir("app"):
    os.chdir("app/")
elif os.path.basename(os.getcwd()) != "app":
    # Neither an app/ child nor already inside app/: fail loudly, as the
    # unguarded os.chdir would have done.
    raise FileNotFoundError(
        f"Expected an 'app/' directory inside {os.getcwd()!r}"
    )

In [14]:
import pandas as pd
import numpy as np

import json
import glob
import hydra
from hydra import initialize, compose

import torch

from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset

from sklearn.model_selection import train_test_split

from utils.encode import encode_data
from utils.evaluation import bootstrap_eval

In [3]:
def evaluate_checkpoints(dataset):
    """Evaluate every checkpoint config found for ``dataset``.

    Reads ``data/preprocessed_new/{dataset}.parquet``, preprocesses it, and
    makes a seeded 80/10/10 train/val/test split. For each ``*.yaml`` config
    found (recursively) under ``checkpoints/{dataset}/`` the sequence encoder
    described by the config is instantiated, its weights are loaded from
    ``cfg["path_to_state_dict"]``, the train+val and test datasets are encoded
    and ``bootstrap_eval`` is run on the resulting features.

    Side effects:
        Creates the ``results`` directory if needed and writes one
        ``results/{config_name}.csv`` file per config.

    Args:
        dataset: Name of the dataset, either ``"churn"`` or ``"age"``.

    Returns:
        dict mapping each config name to
        ``results.agg(["mean", "std"]).to_dict()``, where ``results`` is the
        object returned by ``bootstrap_eval`` (used here via ``.to_csv`` and
        ``.agg``, so presumably a pandas DataFrame).

    Raises:
        AssertionError: if ``dataset`` is not one of the supported names.
    """
    assert dataset in ["churn", "age"]

    df = pd.read_parquet(f"data/preprocessed_new/{dataset}.parquet")

    preprocessor = PandasDataPreprocessor(
        col_id="user_id",
        col_event_time="timestamp",
        # Only the churn data gets the dt_to_timestamp conversion.
        event_time_transformation="dt_to_timestamp" if dataset == "churn" else "none",
        cols_category=["mcc_code"],
        cols_first_item=["global_target"]
    )

    data = preprocessor.fit_transform(df)

    val_size = 0.1
    test_size = 0.1

    # Two-step split: carve off val+test first, then split that part in two.
    train, val_test = train_test_split(data, test_size=test_size+val_size, random_state=42)
    val, test = train_test_split(val_test, test_size=test_size/(test_size+val_size), random_state=42)

    min_seq_len = 15 if dataset == "churn" else 25

    # Validation records are folded into the training set for the downstream model.
    train_ds = MemoryMapDataset(train + val, [SeqLenFilter(min_seq_len=min_seq_len)])
    test_ds = MemoryMapDataset(test, [SeqLenFilter(min_seq_len=min_seq_len)])

    # glob gives no ordering guarantee; sort so runs process configs in a
    # deterministic order.
    config_paths = sorted(glob.glob(f"checkpoints/{dataset}/**/*.yaml", recursive=True))

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs("results", exist_ok=True)

    results_dict = {}
    for path in config_paths:
        # Split into the config directory and the extension-less config name.
        # splitext only strips the trailing ".yaml", unlike str.replace which
        # would also remove ".yaml" occurring in the middle of the name.
        path_to_config, config_file = os.path.split(path)
        config_name, _ = os.path.splitext(config_file)

        with initialize(version_base=None, config_path=path_to_config):
            cfg = compose(config_name=config_name)

        seq_encoder = hydra.utils.instantiate(cfg["seq_encoder"])
        # map_location="cpu" lets checkpoints saved from a CUDA device load on
        # CPU-only machines; load_state_dict copies the values into the
        # module's own parameters, wherever those live.
        state_dict = torch.load(cfg["path_to_state_dict"], map_location="cpu")
        seq_encoder.load_state_dict(state_dict)

        X_train, y_train = encode_data(seq_encoder, train_ds)
        X_test, y_test = encode_data(seq_encoder, test_ds)

        results = bootstrap_eval(X_train, X_test, y_train, y_test)

        results.to_csv(f"results/{config_name}.csv")

        results_dict[config_name] = results.agg(["mean", "std"]).to_dict()

    return results_dict

# Churn dataset

In [4]:
# Evaluate every churn checkpoint config; the returned dict maps each config
# name to the mean/std aggregates of its evaluation results.
results_churn = evaluate_checkpoints("churn")

100%|██████████| 10/10 [00:40<00:00,  4.07s/it]
100%|██████████| 10/10 [00:40<00:00,  4.08s/it]
100%|██████████| 10/10 [00:39<00:00,  4.00s/it]
100%|██████████| 10/10 [00:38<00:00,  3.83s/it]


In [8]:
# Persist the aggregated churn metrics as JSON.
with open("results/churn.json", "w") as churn_json:
    json.dump(results_churn, churn_json)

# Age dataset

In [4]:
# Evaluate every age checkpoint config; the returned dict maps each config
# name to the mean/std aggregates of its evaluation results.
results_age = evaluate_checkpoints("age")

100%|██████████| 10/10 [06:46<00:00, 40.68s/it]
100%|██████████| 10/10 [07:22<00:00, 44.25s/it]
100%|██████████| 10/10 [06:58<00:00, 41.84s/it]
100%|██████████| 10/10 [06:45<00:00, 40.58s/it]


In [5]:
# Persist the aggregated age metrics as JSON.
with open("results/age.json", "w") as age_json:
    json.dump(results_age, age_json)

# Default dataset

In [25]:
# Load the preprocessed "default" dataset.
default_parquet_path = "data/preprocessed_new/default.parquet"
df = pd.read_parquet(default_parquet_path)

In [26]:
# Preprocessor settings for the default dataset, collected in one place so
# they are easy to compare with the other datasets.
preprocessor_kwargs = {
    "col_id": "user_id",
    "col_event_time": "timestamp",
    "event_time_transformation": "dt_to_timestamp",
    "cols_category": ["mcc_code"],
    "cols_first_item": ["global_target"],
}
preprocessor = PandasDataPreprocessor(**preprocessor_kwargs)

In [27]:
# Fit the preprocessor on the default dataframe and transform it in one call.
# Later cells index the result as a sequence of records that each carry a
# "global_target" entry.
data = preprocessor.fit_transform(df)

In [28]:
# Collect the target of every record, in record order, for stratified splitting.
targets = [record["global_target"] for record in data]
y = np.array(targets)

In [29]:
# Fractions of the full dataset reserved for validation and test.
val_size = 0.1
test_size = 0.1
holdout_size = test_size + val_size

# Step 1: carve the combined val+test hold-out off the training data,
# stratified on the target.
train, val_test, y_train, y_val_test = train_test_split(
    data,
    y,
    test_size=holdout_size,
    stratify=y,
    random_state=42,
)

# Step 2: divide the hold-out into validation and test parts, again stratified.
val, test, y_val, y_test = train_test_split(
    val_test,
    y_val_test,
    test_size=test_size / holdout_size,
    stratify=y_val_test,
    random_state=42,
)

In [30]:
# Validation records are merged into the training set; each dataset gets its
# own SeqLenFilter instance configured with the same minimum length.
train_val_records = train + val
train_ds = MemoryMapDataset(
    train_val_records,
    [SeqLenFilter(min_seq_len=15)],
)
test_ds = MemoryMapDataset(
    test,
    [SeqLenFilter(min_seq_len=15)],
)

In [31]:
# Directory holding both the hydra config and the trained checkpoints.
path_to_checkpoints = "checkpoints/default/ts2vec/"
default_config_name = "ts2vec_default_date"

# Compose the config inside a temporary hydra initialization context.
with initialize(version_base=None, config_path=path_to_checkpoints):
    cfg = compose(config_name=default_config_name)

In [32]:
from sklearn.metrics import roc_auc_score, average_precision_score
from lightgbm import LGBMClassifier

# Number of checkpoints to evaluate; files are expected to be named
# "<cfg name>_<i>.pth" for i in 0 .. n_checkpoints - 1.
n_checkpoints = 5

# Downstream classifier settings, identical for every checkpoint.
lgbm_params = dict(
    n_estimators=500,
    boosting_type="gbdt",
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.02,
    feature_fraction=0.75,
    max_depth=6,
    lambda_l1=1,
    lambda_l2=1,
    min_data_in_leaf=50,
    random_state=42,
    n_jobs=8,
    verbose=-1
)

# One {"ROC-AUC": ..., "PR-AUC": ...} record per evaluated checkpoint.
res = []

for i in range(n_checkpoints):
    # os.path.join keeps the path valid whether or not path_to_checkpoints
    # ends with a separator.
    path = os.path.join(path_to_checkpoints, f"{cfg['name']}_{i}.pth")

    seq_encoder = hydra.utils.instantiate(cfg["seq_encoder"])
    # map_location="cpu" lets checkpoints saved from a CUDA device load on
    # CPU-only machines; load_state_dict copies the values into the module's
    # own parameters.
    seq_encoder.load_state_dict(torch.load(path, map_location="cpu"))

    # NOTE: this rebinds y_train / y_test from the earlier split cell to the
    # labels returned by encode_data.
    X_train, y_train = encode_data(seq_encoder, train_ds)
    X_test, y_test = encode_data(seq_encoder, test_ds)

    # A fresh classifier per checkpoint so no fitted state carries over.
    lgbm = LGBMClassifier(**lgbm_params)

    lgbm.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the positive-class score.
    y_pred = lgbm.predict_proba(X_test)[:, 1]

    res.append(
        {
            "ROC-AUC": roc_auc_score(y_test, y_pred),
            "PR-AUC": average_precision_score(y_test, y_pred)
        }
    )

In [36]:
# Tabulate the per-checkpoint metrics and show their mean and std.
metrics_per_checkpoint = pd.DataFrame(res)
metrics_per_checkpoint.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC
mean,0.529326,0.053046
std,0.031451,0.010593
