# IMPACT paper experiments
### 1. Init
#### 1.1. Import libraries (necessary)

In [None]:
# NOTE(review): `interp` is not referenced anywhere else in the visible notebook, and
# `numpy.lib.function_base` is a private NumPy module path (believed to be unavailable in
# NumPy >= 2.0 — confirm). This looks like an accidental IDE auto-import; consider removing it.
from numpy.lib.function_base import interp
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../../")

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from IMPACT import utils
utils.set_seed(0)
from IMPACT import dataset
from IMPACT import model
import optuna
import logging
import gc
import json
import torch
import pandas as pd
from importlib import reload

#### 1.2. Start tensorboard (optional)

In [None]:
from tensorboard import notebook
%load_ext tensorboard
%tensorboard --reuse=False --logdir ../tensorboard --load_fast=false --reload_interval=1

# `notebook.list()` prints the known running TensorBoard instances itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the cell output.
notebook.list()
# access tensorboard at : http://localhost:6006

#### 1.3. Set up the loggers (recommended)

In [None]:
# Configure the IMPACT loggers: verbose mode, log name "IMPACT_postcovid".
utils.setuplogger(log_name="IMPACT_postcovid", verbose=True)

### 2. CDM Hyperparameter search

#### 2.1. Sequential version

In [None]:
# Re-import the IMPACT modules so that edits made to their source files since the
# last import are picked up without restarting the kernel.
reload(utils)
reload(model)
reload(dataset)

def load_dataset(config, i_fold=0):
    """Load the train/validation splits of one fold for the hyperparameter search.

    Args:
        config (dict): experiment configuration; only ``config["dataset_name"]`` is read here.
        i_fold (int): index of the fold whose CSV files are loaded. Defaults to 0, the
            fold this function previously hard-coded.

    Returns:
        tuple: ``(train_data, valid_data, concept_map, metadata)`` where the two datasets are
        ``dataset.LoaderDataset`` objects, ``concept_map`` maps int keys to lists of ints and
        ``metadata`` is the parsed content of ``metadata.json``.
    """
    # Release Python garbage and cached CUDA memory before loading new data.
    gc.collect()
    torch.cuda.empty_cache()

    name = config["dataset_name"]
    dataset_dir = f'../datasets/{name}'

    # Context managers guarantee the JSON file handles are closed (the previous
    # `json.load(open(...))` form left that to the garbage collector).
    with open(f'{dataset_dir}/concept_map.json', 'r') as concept_file:
        raw_concept_map = json.load(concept_file)
    # JSON object keys are always strings: convert keys and listed values to int.
    concept_map = {int(k): [int(x) for x in v] for k, v in raw_concept_map.items()}

    nb_modalities = torch.load(f'{dataset_dir}/nb_modalities.pkl', weights_only=True)

    with open(f'{dataset_dir}/metadata.json', 'r') as metadata_file:
        metadata = json.load(metadata_file)

    # Column dtypes applied when converting the CSV frames to record arrays
    # (the duplicated "dimension_id" entry of the original literal is listed once).
    column_dtypes = {'student_id': int, 'item_id': int, "dimension_id": int, "correct": float}

    def _read_quadruplets(split):
        # Read one split ("train" or "valid") of the requested fold as a record array.
        path = f'../datasets/2-preprocessed_data/{name}_{split}_quadruples_vert_{i_fold}.csv'
        return pd.read_csv(path, encoding='utf-8').to_records(index=False,
                                                              column_dtypes=column_dtypes)

    train_quadruplets = _read_quadruplets('train')
    valid_quadruplets = _read_quadruplets('valid')

    train_data = dataset.LoaderDataset(train_quadruplets, concept_map, metadata, nb_modalities)
    valid_data = dataset.LoaderDataset(valid_quadruplets, concept_map, metadata, nb_modalities)

    return train_data,valid_data,concept_map,metadata


def objective(trial):
    """Optuna objective: train IMPACT once with the hyperparameters sampled for ``trial``.

    ``learning_rate`` and ``lambda`` are sampled log-uniformly and written, together with a
    fixed ``num_responses`` of 12, into the notebook-level ``config`` dict. The model is then
    trained on the notebook-level ``train_data`` / ``valid_data``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``best_valid_metric`` reported by the trained model.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # Both values are sampled before `config` is touched (the dict literal is built first).
    #num_responses = trial.suggest_int('num_responses', 9,13)
    config.update({
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-2, log=True),
        'lambda': trial.suggest_float('lambda', 1e-7, 1e-5, log=True),
        'num_responses': 12,
    })

    # Build, initialise and train the model.
    impact = model.IMPACT(**config)
    impact.init_model(train_data, valid_data)
    impact.train(train_data, valid_data)

    score = impact.best_valid_metric

    report = "".join([
        "-------Trial number : ", str(trial.number),
        "\nBest epoch : ", str(impact.best_epoch),
        "\nValues : [", str(score),
        "]\nParams : ", str(trial.params),
    ])
    logging.info(report)

    # Drop the model explicitly so its memory can be reclaimed before the next trial.
    del impact.model
    del impact

    gc.collect()
    torch.cuda.empty_cache()

    return score

In [None]:
%%time
# Build the hyperparameter-search configuration for the 'postcovid' dataset,
# with RMSE as the validation metric.
config = utils.generate_hs_config(dataset_name='postcovid', esc = 'error', valid_metric= 'rmse',metrics = ['rmse'])

# Seed with the value carried by the config, then load the data used by every trial.
# These names (config, train_data, valid_data) are read as globals by `objective`.
utils.set_seed(config["seed"])
logging.info(config['dataset_name'])
train_data,valid_data,concept_map,metadata = load_dataset(config)

study = optuna.create_study(
    directions=["minimize"],  # Warning : specify directions for each objective (depends on the validation metric)
)
gc.collect()
torch.cuda.empty_cache()
# 100 trials, one at a time (n_jobs=1), with a garbage collection after each trial.
study.optimize(objective, n_trials=100, n_jobs=1, gc_after_trial=True)

# Analyze the results
## requirements : plotly, nbformat
pareto_trials = study.best_trials

# Log every best trial with its metric value(s) and sampled parameters.
logging.info(f"Best trial for {config['dataset_name']} : {study.best_trials}")
for trial in pareto_trials:
    logging.info(f"Trial #{trial.number}")
    logging.info(f"  Metric value: {trial.values}")
    #logging.info(f"  DOA: {trial.values[1]}")
    logging.info(f"  Params: {trial.params}")

#### 2.2. Parallelized version

In [None]:
## Launch the cluster with the following command at the root of the project:
#ipcluster start --n=3
## ...and stop it once the experiments are finished:
#ipcluster stop

In [None]:
# Re-import the IMPACT modules so that source edits are picked up.
reload(utils)
reload(model)
reload(dataset)
from ipyparallel import Client
import dill

# Absolute path of the directory two levels above the notebook; it is appended to
# `sys.path` on every engine so that `IMPACT` can be imported there.
cat_absolute_path = os.path.abspath('../../')

# Connect to the running cluster, switch serialization to dill and get a
# load-balanced view used to dispatch the trainings.
rc = Client()
rc[:].use_dill()
lview = rc.load_balanced_view()


# The path is embedded with `!r` (repr) so that it becomes a valid Python string literal
# on the engines even when it contains backslashes (Windows paths) or quote characters;
# plain concatenation between single quotes produced broken source in those cases.
rc[:].execute(f"import sys; sys.path.append({cat_absolute_path!r})")
logging.info("sys.path.append("+cat_absolute_path+")")
# Import on the engines the modules that the functions shipped to them rely on.
with rc[:].sync_imports():
    import json
    from IMPACT import utils, model, dataset
    import logging
    import gc
    import torch

def load_dataset(config) :
    """Load fold 0 of the dataset plus its concept map and metadata.

    Args:
        config (dict): experiment configuration; it is forwarded to ``utils.prepare_dataset``
            and ``config["dataset_name"]`` selects the dataset directory.

    Returns:
        tuple: ``(train_data, valid_data, concept_map, metadata)``.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # read datasets (the test split is not needed for the hyperparameter search)
    train_data, valid_data, _ = utils.prepare_dataset(config, i_fold=0)

    # `concept_map` and `metadata` used to be returned without ever being defined in this
    # function: that raised NameError on a fresh kernel and otherwise silently returned
    # whatever globals an earlier cell had left behind. They are now read from the dataset
    # directory, using the same files as the sequential `load_dataset` of section 2.1.
    dataset_dir = f'../datasets/{config["dataset_name"]}'
    with open(f'{dataset_dir}/concept_map.json', 'r') as concept_file:
        concept_map = {int(k): [int(x) for x in v] for k, v in json.load(concept_file).items()}
    with open(f'{dataset_dir}/metadata.json', 'r') as metadata_file:
        metadata = json.load(metadata_file)

    return train_data,valid_data,concept_map,metadata

def launch_test(trial, train_data, valid_data, config):
    """Train one IMPACT model for a hyperparameter trial and report its validation score.

    Args:
        trial: Optuna trial; only ``trial.number`` and ``trial.params`` are read, for logging.
        train_data: training dataset passed to ``init_model`` and ``train``.
        valid_data: validation dataset passed to ``init_model`` and ``train``.
        config (dict): keyword arguments used to build ``model.IMPACT``.

    Returns:
        The ``best_valid_metric`` of the trained model.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # Build, initialise and train the model.
    impact = model.IMPACT(**config)
    impact.init_model(train_data, valid_data)
    impact.train(train_data, valid_data)

    score = impact.best_valid_metric

    report = "".join([
        "-------Trial number : ", str(trial.number),
        "\nBest epoch : ", str(impact.best_epoch),
        "\nValues : [", str(score),
        "]\nParams : ", str(trial.params),
    ])
    logging.info(report)

    # Drop the model explicitly so its memory can be reclaimed before the next training.
    del impact.model
    del impact

    gc.collect()
    torch.cuda.empty_cache()

    return score


def objective(trial):
    """Optuna objective: sample hyperparameters and train on the ipyparallel cluster.

    ``learning_rate`` and ``lambda`` are sampled log-uniformly; ``num_responses`` is fixed
    to 12. The training itself is dispatched through the notebook-level load-balanced view
    ``lview`` and this call blocks until the result is available.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value returned by ``launch_test`` (the best validation metric of the training).
    """
    lr = trial.suggest_float('learning_rate', 1e-5, 5e-2, log=True)
    lambda_param = trial.suggest_float('lambda', 1e-7, 1e-5, log=True)
    #num_responses = trial.suggest_int('num_responses', 11,13)

    # Work on a per-trial copy: this objective is run with n_jobs > 1, so several calls
    # execute concurrently. Writing into the shared notebook-level `config` let one trial
    # overwrite another trial's hyperparameters before they were submitted to the cluster.
    # A shallow copy is enough because only top-level keys are set.
    trial_config = dict(config)
    trial_config['learning_rate'] = lr
    trial_config['lambda'] = lambda_param
    trial_config['num_responses'] = 12

    return lview.apply_async(launch_test, trial, train_data, valid_data, trial_config).get()



In [None]:
%%time
# Build the hyperparameter-search configuration for the 'postcovid' dataset,
# with RMSE as the validation metric.
config = utils.generate_hs_config(dataset_name='postcovid', esc = 'error', valid_metric= 'rmse',metrics = ['rmse'])

# Seed with the value carried by the config, then load the data used by every trial.
# These names (config, train_data, valid_data) are read as globals by `objective`.
utils.set_seed(config["seed"])
logging.info(config['dataset_name'])
train_data,valid_data,concept_map,metadata = load_dataset(config)

study = optuna.create_study(
    directions=["minimize"], # Warning : specify directions for each objective (depends on the validation metric)
)
gc.collect()
torch.cuda.empty_cache()
# 100 trials with n_jobs=3; each `objective` call dispatches its training to the
# ipyparallel cluster through `lview` and waits for the result.
study.optimize(objective, n_trials=100, n_jobs=3, gc_after_trial=True)

# Analyze the results
## requirements : plotly, nbformat
pareto_trials = study.best_trials

# Log every best trial with its metric value(s) and sampled parameters.
logging.info(f"Best trial for {config['dataset_name']} : {study.best_trials}")
for trial in pareto_trials:
    logging.info(f"Trial #{trial.number}")
    logging.info(f"  Metric value: {trial.values}")
    #logging.info(f"  DOA: {trial.values[1]}")
    logging.info(f"  Params: {trial.params}")

#### 2.3. Number of parameters computation

In [None]:
# Sizes used for this estimate; they are set by hand here, not read from `config`.
d_in = 5
num_responses = 13

# Sum of three products built from the dataset metadata counts.
# NOTE(review): presumably the item/response, user/dimension and dimension/dimension
# parameter blocks of the IMPACT model — confirm against the model definition.
(
    metadata['num_item_id'] * num_responses * d_in
    + metadata['num_user_id'] * metadata['num_dimension_id']
    + metadata['num_dimension_id'] * metadata['num_dimension_id'] * d_in
)

### 3. CDM Prediction

#### 3.1. Training and testing, sequential version

In [None]:
import warnings
import numpy as np

# Release Python garbage and cached CUDA memory left over from the previous sections.
gc.collect()
torch.cuda.empty_cache()

# Re-import the IMPACT modules so that edits made to their source files since the
# last import are picked up without restarting the kernel.
reload(utils)
reload(model)
reload(dataset)

def test(config : dict) :
    """Train and evaluate IMPACT over 5 folds x 3 seeds, sequentially.

    For every (fold, seed) pair a model is trained, evaluated on the fold's test split with
    ``evaluate_predictions`` and ``evaluate_profiles``, and its user embedding matrix is
    written to a CSV file under ``../embs/``. The mean and standard deviation of every
    metric over the 15 runs are logged.

    Side effects on ``config``: ``embs_path`` and ``params_path`` are set from the dataset
    name, and ``seed`` is overwritten by the seed of each run.

    Args:
        config (dict): experiment configuration; must provide ``dataset_name``,
            ``pred_metrics`` and ``profile_metrics``.

    Returns:
        tuple: ``(df_pred, df_interp)``, two DataFrames with one column per prediction /
        profile metric and one row per run.
    """
    logging.info(f'#### {config["dataset_name"]} ####')
    logging.info(f'#### config : {config} ####')
    # Output locations are derived from the dataset name (set after the config was logged).
    config['embs_path']='../embs/'+str(config["dataset_name"])
    config['params_path']='../ckpt/'+str(config["dataset_name"])

    pred_scores = {name: [] for name in config['pred_metrics']}
    profile_scores = {name: [] for name in config['profile_metrics']}

    def _summarise(scores):
        # Turn the collected scores into a DataFrame and log mean +- std of each metric.
        frame = pd.DataFrame(scores)
        for m in scores.keys():
            logging.info(f'{m} : {frame[m].mean()} +- {frame[m].std()}')
        return frame

    for i_fold in range(5):

        gc.collect()
        torch.cuda.empty_cache()

        # Silence the numerical warnings raised while computing the profile metrics.
        warnings.filterwarnings("ignore", message="invalid value encountered in divide")
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        train_data, valid_data, test_data = utils.prepare_dataset(config, i_fold=i_fold)

        for seed in range(3):

            # Seed both the libraries and the model configuration.
            utils.set_seed(seed)
            config['seed'] = seed

            algo = model.IMPACT(**config)
            algo.init_model(train_data, valid_data)
            algo.train(train_data, valid_data)

            # Prediction metrics on the test split.
            pred_eval = algo.evaluate_predictions(test_data)
            for name, bucket in pred_scores.items():
                bucket.append(pred_eval[name])

            # User embeddings are extracted before the profile evaluation, saved after it.
            user_emb = algo.model.users_emb.weight.detach().cpu().numpy()
            profile_eval = algo.evaluate_profiles(test_data)
            for name, bucket in profile_scores.items():
                bucket.append(profile_eval[name])

            emb_file = "../embs/"+config["dataset_name"]+"_IMPACT_cornac_Iter_fold"+str(i_fold)+"_seed_"+str(seed)+".csv"
            pd.DataFrame(user_emb).to_csv(emb_file, index=False, header=False)

    df_pred = _summarise(pred_scores)
    df_interp = _summarise(profile_scores)

    return df_pred,df_interp

In [None]:
%%time
config = utils.generate_eval_config(esc = 'error', valid_metric= 'rmse', pred_metrics = ['rmse', 'mae'], profile_metrics = ['doa', 'pc-er', 'rm'], save_params=False)
utils.set_seed(config["seed"])

# Hyperparameters per dataset, applied in this order on top of the same running `config`
# (a value that an entry does not set is inherited from the previous entries).
# NOTE(review): "assist0910" sets no 'd_in', so it runs with the value left by the
# preceding "postcovid" entry (4) — confirm this is intended.
dataset_hyperparams = [
    ("postcovid", {'learning_rate': 0.02026, 'lambda': 1.2e-5, 'd_in': 4, 'num_responses': 12}),
    ("assist0910", {'learning_rate': 0.02705, 'lambda': 1.0e-6, 'num_responses': 12}),
    ("movielens", {'learning_rate': 0.02515, 'lambda': 2e-7, 'd_in': 10, 'num_responses': 12}),
    ("portrait", {'learning_rate': 0.04568, 'lambda': 2e-7, 'd_in': 6, 'num_responses': 12}),
    ("promis", {'learning_rate': 0.01227, 'lambda': 1e-7, 'd_in': 6, 'num_responses': 13}),
]

for dataset_name, hyperparams in dataset_hyperparams:
    config["dataset_name"] = dataset_name
    logging.info(config["dataset_name"])
    config.update(hyperparams)
    # Only the results of the last dataset remain bound to these names after the loop.
    pred_metrics,df_interp = test(config)

#### 3.2. Training and testing, parallel version

In [None]:
## Launch the cluster at the root of the project with the following command:
# ipcluster start --n=3
## ...and stop it once the experiments are finished:
# ipcluster stop

In [None]:
# Re-import the IMPACT modules so that source edits are picked up.
reload(utils)
reload(model)
reload(dataset)

import warnings
import numpy as np
from ipyparallel import Client
from IMPACT import utils
import dill

# Absolute path of the directory two levels above the notebook; it is appended to
# `sys.path` on every engine so that `IMPACT` can be imported there.
cat_absolute_path = os.path.abspath('../../')

# Connect to the running cluster, switch serialization to dill and get a
# load-balanced view used to dispatch the trainings.
rc = Client()
rc[:].use_dill()
lview = rc.load_balanced_view()

# Synchronize imports with all engines:
# The path is embedded with `!r` (repr) so that it becomes a valid Python string literal
# on the engines even when it contains backslashes (Windows paths) or quote characters;
# plain concatenation between single quotes produced broken source in those cases.
rc[:].execute(f"import sys; sys.path.append({cat_absolute_path!r})")
# Move the engines from their start directory into the notebook directory.
rc[:].execute("import os; os.chdir('./experiments/notebook_examples')")

with rc[:].sync_imports():
    from IMPACT import utils, model, dataset

def launch_training(seed:int,config:dict,train_data,valid_data,test_data) :
    """Train one IMPACT model with the given seed and evaluate it on the test split.

    Args:
        seed (int): seed passed to ``utils.set_seed`` and stored in ``config['seed']``.
        config (dict): keyword arguments used to build ``model.IMPACT``; must provide
            ``pred_metrics`` and ``profile_metrics``. It is modified in place (``seed``).
        train_data: training dataset.
        valid_data: validation dataset.
        test_data: dataset given to ``evaluate_predictions`` and ``evaluate_profiles``.

    Returns:
        tuple: ``(pred_metrics, profile_metrics, emb)`` where the first two map each
        requested metric name to a one-element list and ``emb`` is the user embedding
        weight converted with ``.detach().cpu().numpy()``.
    """
    utils.set_seed(seed)
    config['seed'] = seed

    # Build, initialise and train the model.
    impact = model.IMPACT(**config)
    impact.init_model(train_data, valid_data)
    impact.train(train_data, valid_data)

    pred_scores = {name: [] for name in config['pred_metrics']}
    profile_scores = {name: [] for name in config['profile_metrics']}

    # Prediction metrics: keep only the requested ones.
    pred_eval = impact.evaluate_predictions(test_data)
    for name, bucket in pred_scores.items():
        bucket.append(pred_eval[name])

    # User embeddings are extracted before the profile evaluation.
    user_emb = impact.model.users_emb.weight.detach().cpu().numpy()
    profile_eval = impact.evaluate_profiles(test_data)
    for name, bucket in profile_scores.items():
        bucket.append(profile_eval[name])

    return (pred_scores, profile_scores, user_emb)

def fold_test(i_fold : int, config : dict) :
    """Train and evaluate IMPACT on one fold with seeds 0, 1 and 2 through the cluster.

    The three trainings are submitted to the load-balanced view ``lview`` before any result
    is awaited. For each seed the returned user embeddings are written to a CSV file under
    ``../embs/``.

    Args:
        i_fold (int): index of the fold passed to ``utils.prepare_dataset``.
        config (dict): experiment configuration; must provide ``dataset_name``,
            ``pred_metrics`` and ``profile_metrics``.

    Returns:
        tuple: ``(pred_metrics, profile_metrics)``, each mapping a metric name to the list of
        values collected over the three seeds.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # Silence the numerical warnings raised while computing the profile metrics.
    warnings.filterwarnings("ignore", message="invalid value encountered in divide")
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    train_data, valid_data, test_data = utils.prepare_dataset(config, i_fold=i_fold)

    pred_metrics = {name: [] for name in config['pred_metrics']}
    profile_metrics = {name: [] for name in config['profile_metrics']}

    # Submit every seed first; results are collected afterwards in seed order.
    pending = [
        (seed, lview.apply_async(launch_training, seed, config, train_data, valid_data, test_data))
        for seed in range(3)
    ]

    for seed, job in pending:
        seed_pred, seed_profile, user_emb = job.get()
        logging.info(f"Test done - seed : {seed}, i_fold : {i_fold}")

        for name, bucket in pred_metrics.items():
            bucket.extend(seed_pred[name])
        for name, bucket in profile_metrics.items():
            bucket.extend(seed_profile[name])

        emb_file = "../embs/"+config["dataset_name"]+"_IMPACT_cornac_Iter_fold"+str(i_fold)+"_seed_"+str(seed)+".csv"
        pd.DataFrame(user_emb).to_csv(emb_file, index=False, header=False)

    return pred_metrics,profile_metrics


def test(config : dict) :
    """Evaluate IMPACT over 5 folds (3 seeds each, run by ``fold_test``) and log a summary.

    Args:
        config (dict): experiment configuration; must provide ``dataset_name``,
            ``pred_metrics`` and ``profile_metrics``.

    Returns:
        tuple: ``(pred_metrics, df_interp)`` where ``pred_metrics`` maps each prediction
        metric to the list of values over all runs and ``df_interp`` is a DataFrame with one
        column per profile metric and one row per run.
    """
    logging.info(f'#### {config["dataset_name"]} ####')
    logging.info(f'#### config : {config} ####')

    pred_metrics = {m:[] for m in config['pred_metrics']}
    profile_metrics = {m:[] for m in config['profile_metrics']}

    for i_fold in range(5):
        fold_pred, fold_profile = fold_test(i_fold, config)

        # Accumulate the fold results under distinct names. The previous loop reused
        # `profile_metrics` as its loop variable, which shadowed the accumulator: each
        # fold's list was extended with itself and the folds before the last one were
        # lost from the profile statistics.
        for k in pred_metrics.keys():
            pred_metrics[k].extend(fold_pred[k])
        for k in profile_metrics.keys():
            profile_metrics[k].extend(fold_profile[k])

    # Mean and standard deviation of every metric over all (fold, seed) runs.
    df_pred = pd.DataFrame(pred_metrics)
    for m in pred_metrics.keys():
        logging.info(f'{m} : {df_pred[m].mean()} +- {df_pred[m].std()}')

    df_interp = pd.DataFrame(profile_metrics)
    for m in profile_metrics.keys():
        logging.info(f'{m} : {df_interp[m].mean()} +- {df_interp[m].std()}')

    return pred_metrics,df_interp

In [None]:
%%time
config = utils.generate_eval_config(esc = 'error', valid_metric= 'rmse', pred_metrics = ['rmse', 'mae'], profile_metrics = ['doa', 'pc-er', 'rm'], save_params=False)
utils.set_seed(config["seed"])

# Hyperparameters per dataset, applied in this order on top of the same running `config`
# (a value that an entry does not set is inherited from the previous entries).
# NOTE(review): "assist0910" sets no 'd_in', so it runs with the value left by the
# preceding "postcovid" entry (4) — confirm this is intended.
dataset_hyperparams = [
    ("postcovid", {'learning_rate': 0.02026, 'lambda': 1.2e-5, 'd_in': 4, 'num_responses': 12}),
    ("assist0910", {'learning_rate': 0.02705, 'lambda': 1.0e-6, 'num_responses': 12}),
    ("movielens", {'learning_rate': 0.02515, 'lambda': 2e-7, 'd_in': 10, 'num_responses': 12}),
    ("portrait", {'learning_rate': 0.04568, 'lambda': 2e-7, 'd_in': 6, 'num_responses': 12}),
    ("promis", {'learning_rate': 0.01227, 'lambda': 1e-7, 'd_in': 6, 'num_responses': 13}),
]

for dataset_name, hyperparams in dataset_hyperparams:
    config["dataset_name"] = dataset_name
    logging.info(config["dataset_name"])
    config.update(hyperparams)
    # Only the results of the last dataset remain bound to these names after the loop.
    pred_metrics,df_interp = test(config)