In [32]:
import time
import itertools
import pandas as pd
from iml_group_proj.config import RANDOM_STATE
from iml_group_proj.evaluation import evaluate_many
from iml_group_proj.data.utils import load_libofc_df
from iml_group_proj.data.preprocess import AverageEmbeddingsPerRecord, DataMode
from iml_group_proj.data.embeddings import EmbeddingsDataLoader, EmbeddingsType
from iml_group_proj_back.train_models import train_models

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier as OVC

In [33]:
# %load_ext autoreload
# %autoreload 2


# DATA_PATH = "../github_data"
# # List of Embeddings or PCA-ed tf-idf. 
# embeddings_type = "tfidf"
# print(f"Starting experiment with {embeddings_type} embeddings")
# data = EmbeddingsDataLoader.load(DATA_PATH, embeddings_type)
# X_train, y_train, X_test, y_test = AverageEmbeddingsPerRecord.prep(data, embeddings_type, 0)

# print(X_train.shape)

In [30]:
# Location handed to EmbeddingsDataLoader.load for every experiment.
# NOTE(review): presumably holds the stored embeddings / PCA-ed tf-idf
# features -- confirm against the loader.
DATA_PATH = "../github_data"


def get_experiment_result(embeddings_type, data_mode, models):
    """Run a single experiment end to end and return its evaluation output.

    The steps are: load the data for ``embeddings_type`` from ``DATA_PATH``,
    derive the train/test split with ``AverageEmbeddingsPerRecord.prep`` for
    the requested ``data_mode``, fit ``models`` through ``train_models`` and
    score them with ``evaluate_many``.

    Args:
        embeddings_type: Embedding selector forwarded to the loader and to
            ``prep``; also recorded in the info dict given to ``train_models``.
        data_mode: Data-mode selector forwarded to ``prep``; recorded under
            the ``"data"`` key of the info dict.
        models: Model specification passed through unchanged to
            ``train_models``.

    Returns:
        Whatever ``evaluate_many`` returns for the trained models.
    """
    print(f"Starting experiment with {embeddings_type} embeddings")

    loaded = EmbeddingsDataLoader.load(DATA_PATH, embeddings_type)
    train_X, train_y, test_X, test_y = AverageEmbeddingsPerRecord.prep(
        loaded, embeddings_type, data_mode
    )

    # Metadata describing this run, handed to train_models with the models.
    run_info = {"embeddings_type": embeddings_type, "data": data_mode}
    fitted = train_models(models, train_X, train_y, run_info)

    # Evaluation lives here for now; it could be lifted out of this function
    # so the evaluation methods can be changed independently of training.
    return evaluate_many(fitted, train_X, train_y, test_X, test_y)

In [31]:
# Experiment grid: every embedding type is paired with every data mode.
embeddings_types = [
    EmbeddingsType.TFIDF,
    EmbeddingsType.W2V,
    EmbeddingsType.BERT,
]
data_types = [
    DataMode.title_only,
    DataMode.synopsis_only,
    DataMode.both_title_synopsis,
]

# Cartesian product, embedding type outermost (data mode varies fastest).
experiments = [
    (embeddings_type, data_type)
    for embeddings_type in embeddings_types
    for data_type in data_types
]

# One tuple per model: (estimator, None, label).
# NOTE(review): the middle slot is None for every entry; its meaning is
# defined by train_models -- confirm there.
# NOTE(review): the MLP uses a literal seed of 1 while the SVC uses
# RANDOM_STATE -- confirm the difference is intentional.
models = [
    (
        MLPClassifier(random_state=1, max_iter=100),
        None,
        'MLP_100',
    ),
    (
        SVC(C=100, kernel="rbf", gamma=0.001, random_state=RANDOM_STATE),
        None,
        "SVC",
    ),
    # Disabled one-vs-rest variant of the same SVC:
    # (OVC(SVC(C=100, kernel="rbf", gamma=0.001, random_state=RANDOM_STATE)), None, "OVC-SVC"),
    (
        GaussianNB(),
        None,
        'NaiveBayes',
    ),
]

In [None]:
# Run every (embeddings type, data mode) combination with the same model
# list, keeping each experiment's evaluation table and printing a preview
# (at most the first 100 rows) as it completes.
result_dfs = []
for experiment in experiments:
    embeddings_type, data_type = experiment
    result_df = get_experiment_result(embeddings_type, data_type, models)
    result_dfs.append(result_df)
    print(result_df.head(100))

In [None]:
# Stack the per-experiment tables row-wise into one frame; the bare name on
# the last line makes the combined frame the cell's displayed output.
final_result_df = pd.concat(result_dfs, axis=0)
final_result_df