In [1]:
import os

# Every relative path used later in this notebook ("data/", "checkpoints/",
# "results/") is resolved from the app/ directory.
# The guard keeps this cell idempotent: re-running it in a live kernel would
# otherwise try to descend into app/app and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app/")

In [2]:
import pandas as pd

import json
import glob
import hydra
from hydra import initialize, compose

import torch

from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset

from sklearn.model_selection import train_test_split

from utils.encode import encode_data
from utils.evaluation import bootstrap_eval

In [3]:
def evaluate_checkpoints(dataset):
    """Evaluate every saved sequence-encoder checkpoint of one dataset.

    Reads ``data/preprocessed_new/{dataset}.parquet``, runs it through
    ``PandasDataPreprocessor`` and splits the result 80/10/10 into
    train/val/test (``random_state=42``). Train and val are merged and used to
    fit the downstream evaluation; test is held out. For each ``*.yaml`` found
    (recursively) under ``checkpoints/{dataset}/`` the config is composed with
    Hydra, ``cfg["seq_encoder"]`` is instantiated, its weights are loaded from
    ``cfg["path_to_state_dict"]``, both dataset parts are encoded with
    ``encode_data`` and the embeddings are scored with ``bootstrap_eval``.

    Side effects:
        Creates the ``results`` directory if needed and writes one
        ``results/{config_name}.csv`` per evaluated config.

    Args:
        dataset: Name of the dataset, either ``"churn"`` or ``"age"``.

    Returns:
        dict mapping each config name to
        ``results.agg(["mean", "std"]).to_dict()``.

    Raises:
        AssertionError: If ``dataset`` is not a supported name.
    """
    assert dataset in ["churn", "age"]

    df = pd.read_parquet(f"data/preprocessed_new/{dataset}.parquet")

    preprocessor = PandasDataPreprocessor(
        col_id="user_id",
        col_event_time="timestamp",
        # Only the churn data gets the datetime -> timestamp transformation.
        event_time_transformation="dt_to_timestamp" if dataset == "churn" else "none",
        cols_category=["mcc_code"],
        cols_first_item=["global_target"]
    )

    data = preprocessor.fit_transform(df)

    val_size = 0.1
    test_size = 0.1

    # First carve off val+test together, then split that part between the two
    # so each ends up with its requested share of the full data.
    train, val_test = train_test_split(data, test_size=test_size+val_size, random_state=42)
    val, test = train_test_split(val_test, test_size=test_size/(test_size+val_size), random_state=42)

    min_seq_len = 15 if dataset == "churn" else 25

    # Validation data is folded back into train: no model selection happens here.
    train_ds = MemoryMapDataset(train + val, [SeqLenFilter(min_seq_len=min_seq_len)])
    test_ds = MemoryMapDataset(test, [SeqLenFilter(min_seq_len=min_seq_len)])

    # Sorted so that the evaluation order (and the key order of the returned
    # dict) does not depend on the filesystem's listing order.
    config_paths = sorted(glob.glob(f"checkpoints/{dataset}/**/*.yaml", recursive=True))

    # Created once, up front, so the directory exists even if no config matched.
    os.makedirs("results", exist_ok=True)

    results_dict = {}
    for path in config_paths:
        path_to_config, config_file = os.path.split(path)
        config_name = os.path.splitext(config_file)[0]

        with initialize(version_base=None, config_path=path_to_config):
            cfg = compose(config_name=config_name)

        seq_encoder = hydra.utils.instantiate(cfg["seq_encoder"])
        # map_location="cpu" lets checkpoints saved on a GPU load on machines
        # without one; load_state_dict copies the tensors into the freshly
        # built module either way.
        state_dict = torch.load(cfg["path_to_state_dict"], map_location="cpu")
        seq_encoder.load_state_dict(state_dict)
        # NOTE(review): the encoder is not switched to eval mode here;
        # presumably encode_data takes care of that - confirm.

        X_train, y_train = encode_data(seq_encoder, train_ds)
        X_test, y_test = encode_data(seq_encoder, test_ds)

        results = bootstrap_eval(X_train, X_test, y_train, y_test)

        # NOTE(review): outputs are keyed by the config file's base name only.
        # Two configs with the same file name (in different sub-folders or
        # datasets) would overwrite each other - confirm names are unique.
        results.to_csv(f"results/{config_name}.csv")

        results_dict[config_name] = results.agg(["mean", "std"]).to_dict()

    return results_dict

# Churn dataset

In [4]:
# Evaluate every checkpoint config found under checkpoints/churn/.
# Besides returning the per-config mean/std summary, the call writes one CSV
# per config into results/.
results_churn = evaluate_checkpoints("churn")

100%|██████████| 10/10 [00:40<00:00,  4.07s/it]
100%|██████████| 10/10 [00:40<00:00,  4.08s/it]
100%|██████████| 10/10 [00:39<00:00,  4.00s/it]
100%|██████████| 10/10 [00:38<00:00,  3.83s/it]


In [8]:
# Persist the per-config mean/std summary for the churn dataset.
# Make sure results/ exists so this write does not depend on an earlier step
# having created the directory.
os.makedirs("results", exist_ok=True)
with open("results/churn.json", "w") as f:
    json.dump(results_churn, f)

# Age dataset

In [4]:
# Evaluate every checkpoint config found under checkpoints/age/.
# Besides returning the per-config mean/std summary, the call writes one CSV
# per config into results/.
results_age = evaluate_checkpoints("age")

100%|██████████| 10/10 [06:46<00:00, 40.68s/it]
100%|██████████| 10/10 [07:22<00:00, 44.25s/it]
100%|██████████| 10/10 [06:58<00:00, 41.84s/it]
100%|██████████| 10/10 [06:45<00:00, 40.58s/it]


In [5]:
# Persist the per-config mean/std summary for the age dataset.
# Make sure results/ exists so this write does not depend on an earlier step
# having created the directory.
os.makedirs("results", exist_ok=True)
with open("results/age.json", "w") as f:
    json.dump(results_age, f)