In [1]:
import os
import time
import pickle
import itertools
import pandas as pd
from tqdm import tqdm
from iml_group_proj.config import RANDOM_STATE
from iml_group_proj.evaluation import evaluate_many
from iml_group_proj.data.utils import load_libofc_df
from iml_group_proj.data.preprocess import AverageEmbeddingsPerRecord, DataMode
from iml_group_proj.data.embeddings import EmbeddingsDataLoader, EmbeddingsType
from iml_group_proj.trainer import train_model, maybe_load_model

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier as OVC

%load_ext autoreload
%autoreload 2

In [2]:
# DATA_PATH is the location handed to EmbeddingsDataLoader.load;
# CACHE_DIR is the folder under which per-experiment model files are looked up.
DATA_PATH = "../github_data"
CACHE_DIR = "../_cache/"


def experiment_to_file_path(experiment, model_name):
    """Build the cache file path for one experiment/model combination.

    Args:
        experiment: Indexable whose items at positions 0 and 1 are formatted
            into the file name (the callers in this notebook pass an
            ``(embeddings_type, data_mode)`` tuple).
        model_name: Label of the model, used as the last part of the file name.

    Returns:
        ``CACHE_DIR`` joined with ``"<experiment[0]>_<experiment[1]>_<model_name>.pk"``.
    """
    file_name = f"{experiment[0]}_{experiment[1]}_{model_name}.pk"
    return os.path.join(CACHE_DIR, file_name)
        
def get_experiment_result(embeddings_type, data_mode, models):
    """Run a single experiment: load data, train every model, evaluate them.

    Args:
        embeddings_type: Embeddings identifier passed to the data loader and
            to the preprocessing step.
        data_mode: Data mode passed to the preprocessing step.
        models: Iterable of model entries. Each entry is handed to
            ``train_model`` as a whole, and its item at index 2 is used as the
            model name in the cache file path.

    Returns:
        A ``(result_df, trained_models)`` tuple: the value returned by
        ``evaluate_many`` and the list of values returned by ``train_model``,
        in the same order as ``models``.
    """
    print(f"Starting experiment with {embeddings_type} embeddings with {data_mode}")
    raw_data = EmbeddingsDataLoader.load(DATA_PATH, embeddings_type)
    X_train, y_train, X_test, y_test = AverageEmbeddingsPerRecord.prep(raw_data, embeddings_type, data_mode)

    # The same (embeddings_type, data_mode) pair keys the cache file of every model.
    experiment_key = (embeddings_type, data_mode)
    trained_models = [
        train_model(
            model_spec,
            X_train,
            y_train,
            {"embeddings_type": embeddings_type, "data": data_mode},
            experiment_to_file_path(experiment_key, model_spec[2]),
        )
        for model_spec in models
    ]

    print("Evaluating")
    return evaluate_many(trained_models, X_train, y_train, X_test, y_test), trained_models

In [3]:
# Experiment grid: every embeddings type is paired with every data mode.
embeddings_types = [
    EmbeddingsType.TFIDF,
    EmbeddingsType.W2V,
    EmbeddingsType.BERT_XS,
    EmbeddingsType.BERT_S,
]
data_types = [
    DataMode.title_only,
    DataMode.synopsis_only,
    DataMode.both_title_synopsis,
]

# Cartesian product of the two lists, with data_types varying fastest.
experiments = [
    (embeddings_type, data_type)
    for embeddings_type in embeddings_types
    for data_type in data_types
]

# Each entry is a 3-tuple: (estimator, None, name). The name at index 2 is what
# get_experiment_result puts into the cache file name; the middle slot is None
# for every entry and is interpreted by train_model (not visible here).
# NOTE(review): the MLP uses random_state=1 while SVC uses RANDOM_STATE -
# confirm whether that difference is intentional.
models = [
    (
        MLPClassifier(
            hidden_layer_sizes=(600, 600),
            max_iter=250,
            early_stopping=True,
            random_state=1,
        ),
        None,
        'MLP',
    ),
    (SVC(kernel="rbf", C=100, gamma=0.001, random_state=RANDOM_STATE), None, "SVC"),
    # (OVC(SVC(C=100, kernel="rbf", gamma=0.001, random_state=RANDOM_STATE)), None, "OVC-SVC"),
    (GaussianNB(), None, 'NaiveBayes'),
]

In [None]:
# Both accumulators are rebuilt from scratch each time this cell runs, so
# re-executing it never duplicates entries.
trained_models_list = []
result_dfs = []
for embeddings_type, data_type in tqdm(experiments):
    result_df, trained_models = get_experiment_result(embeddings_type, data_type, models)
    print(result_df)
    result_dfs.append(result_df)
    trained_models_list += trained_models

  0%|                                                                                                                                                                   | 0/12 [00:00<?, ?it/s]

Starting experiment with tfidf embeddings with title
Model found at ../_cache/tfidf_title_MLP.pk, skip training flow...
Model found at ../_cache/tfidf_title_SVC.pk, skip training flow...
Model found at ../_cache/tfidf_title_NaiveBayes.pk, skip training flow...
Evaluating


In [None]:
# Show the list of per-experiment result frames (one entry per experiment).
result_dfs

In [None]:
# Stack the per-experiment result frames into one table and write it to CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first; otherwise a fresh checkout fails here, after
# every experiment has already been run.
output_csv_path = "../_output/experiments_result.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
final_result_df = pd.concat(result_dfs)
final_result_df.to_csv(output_csv_path)

In [None]:
#final_result_df[~final_result_df["is_train"]]

In [None]:
# NOTE(review): prints a fixed marker string only; looks like a leftover
# debug/smoke-test cell - confirm whether it can be removed.
print("TEST")