# IMPACT experiments
### 1. Init
#### 1.1. Import libraries

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../../")

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from IMPACT import utils
utils.set_seed(0)
from IMPACT import dataset
from IMPACT import model
import optuna
import logging
import gc
import json
import torch
import pandas as pd
from importlib import reload

#### 1.2. Start tensorboard

In [None]:
# Start an in-notebook TensorBoard instance pointed at the experiment log directory.
from tensorboard import notebook
%load_ext tensorboard
# NOTE(review): --logdir is an absolute, machine-specific path; adjust it before running elsewhere.
%tensorboard --reuse=False --logdir /home/arthurb/Programmation/liriscat/experiments/tensorboard --load_fast=false --reload_interval=1 

# Presumably reports the TensorBoard instances known to the notebook helper - TODO confirm.
print(notebook.list())
# access tensorboard at : http://localhost:6006

#### 1.3. Set up the loggers

In [None]:
# Configure logging through the project helper. The log name is the fixed string
# "IMPACT_postcovid", whatever dataset is selected in later cells.
utils.setuplogger(verbose = True, log_name="IMPACT_postcovid")

#### 1.4. Parametrize the datasets

In [None]:
# Base configuration cell: selects the dataset and device, builds the shared
# `config` dict used by every later cell, and loads the dataset's concept map
# and metadata.

# choose dataset here
dataset_name = 'postcovid'
version= ""#"_small"
# modify config here

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

config = {
    
    # General params
    'seed' : 0,
    
    # Saving params
    'load_params': False,
    'save_params': False,
    'embs_path' : '../embs/'+str(dataset_name),
    'params_path' :'../ckpt/'+str(dataset_name),
    
    # training mode
    'early_stopping' : True, 
    'fast_training' : True, # (Only taken into account if early_stopping == True) If True, doesn't compute valid rmse
    'low_mem': False,
    'metrics' : ['rmse'], # 'rmse', 'mae', 'r2', 'ma_acc'
    'valid_metric': 'rmse', # 'rmse', 'mae', 'r2', 'ma_acc'
    
    # Learning params
    'learning_rate': 0.001,
    'batch_size': 2048,
    'num_epochs': 200,
    'num_dim': 10,
    'eval_freq' : 1,
    'patience' : 30,
    'device': device,
    'lambda' : 7.7e-6,
    'tensorboard': False,
    'flush_freq' : True,

}
# Read the JSON files through context managers so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(f'../datasets/{dataset_name}/concept_map.json', 'r') as concept_map_file:
    concept_map = json.load(concept_map_file)
# JSON object keys are strings; convert keys and listed values to int.
concept_map = {int(k):[int(x) for x in v] for k,v in concept_map.items()}
with open(f'../datasets/{dataset_name}/metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)
utils.set_seed(config['seed'])
# The version suffix is appended only after the JSON files were read with the bare name.
dataset_name += version
logging.info(f'#### {dataset_name} ####')
logging.info(f'#### config : {config} ####')

### 2. CDM Hyperparameter search

#### 2.1. Sequential

In [None]:
# Re-import the project modules so that local source edits are picked up.
reload(utils)
reload(model)
reload(dataset)

# Fix the seed used for the hyperparameter search.
seed = 0
utils.set_seed(0)

# Overrides applied on top of the base configuration for the search.
config.update({
    'seed': seed,
    'early_stopping': True,
    'esc': 'error',  # other values noted by the author: 'objectives', 'loss', 'delta_objectives'
    'num_epochs': 200,
    'eval_freq': 1,
    'patience': 30,
    # Keep the search quiet: no early-stopping logs, no tensorboard, no saved parameters, no progress bars.
    'verbose_early_stopping': False,
    'tensorboard': False,
    'flush_freq': False,
    'save_params': False,
    'disable_tqdm': True,
})


    
def load_dataset(dataset_name : str, i_fold : int = 0) :
    """Load the train/validation splits of one fold of a dataset.

    Reads ``concept_map.json`` and ``metadata.json`` from
    ``../datasets/<dataset_name>/`` and the train/valid quadruple CSV files of
    the requested fold from ``../datasets/2-preprocessed_data/``, then wraps
    each split in a ``dataset.LoaderDataset``.

    Args:
        dataset_name: Name of the dataset directory / CSV file prefix.
        i_fold: Index of the fold to read (defaults to 0, the value that was
            previously hard-coded).

    Returns:
        Tuple ``(train_data, valid_data, concept_map, metadata)``.
    """
    # Release memory held by previous runs before loading new data.
    gc.collect()
    torch.cuda.empty_cache()

    # Context managers close the JSON files as soon as they are parsed.
    with open(f'../datasets/{dataset_name}/concept_map.json', 'r') as concept_map_file:
        raw_concept_map = json.load(concept_map_file)
    # JSON object keys are strings; convert keys and listed values to int.
    concept_map = {int(k): [int(x) for x in v] for k, v in raw_concept_map.items()}
    with open(f'../datasets/{dataset_name}/metadata.json', 'r') as metadata_file:
        metadata = json.load(metadata_file)

    # Each column is listed once (the original literal repeated "dimension_id").
    column_dtypes = {'student_id': int, 'item_id': int, "dimension_id": int, "correct": float}

    def read_split(split):
        # Read one split ('train' or 'valid') of the fold as a NumPy record array.
        path = f'../datasets/2-preprocessed_data/{dataset_name}_{split}_quadruples_vert_{i_fold}.csv'
        return pd.read_csv(path, encoding='utf-8').to_records(index=False, column_dtypes=column_dtypes)

    train_data = dataset.LoaderDataset(read_split('train'), concept_map, metadata)
    valid_data = dataset.LoaderDataset(read_split('valid'), concept_map, metadata)

    return train_data,valid_data,concept_map,metadata


def objective(trial):
    """Optuna objective: train IMPACT once with sampled hyperparameters.

    Samples ``learning_rate`` and ``lambda`` on a log scale, writes them (and a
    fixed ``num_responses`` of 12) into the global ``config``, trains on the
    global ``train_data`` / ``valid_data`` and returns the best validation
    metric reported by the model.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # Both suggestions are drawn before the shared config is touched.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-2, log=True),
        'lambda': trial.suggest_float('lambda', 1e-7, 5e-4, log=True),
        #'num_responses': trial.suggest_int('num_responses', 9,13),
        'num_responses': 12,
    }
    config.update(sampled)

    trainer = model.IMPACT(**config)

    # Initialise, then fit the model.
    trainer.init_model(train_data, valid_data)
    trainer.train(train_data, valid_data)

    score = trainer.best_valid_metric

    logging.info("".join((
        "-------Trial number : ", str(trial.number),
        "\nBest epoch : ", str(trainer.best_epoch),
        "\nValues : [", str(score),
        "]\nParams : ", str(trial.params),
    )))

    # Drop the model explicitly so its memory can be reclaimed before the next trial.
    del trainer.model
    del trainer

    gc.collect()
    torch.cuda.empty_cache()

    return score

In [None]:
# Run the sequential hyperparameter search on the "postcovid" dataset.
dataset_name = "postcovid"
logging.info(dataset_name)
# These globals are read by `objective` during the study.
train_data,valid_data,concept_map,metadata = load_dataset(dataset_name)

study = optuna.create_study(
    directions=["minimize"],  # Single objective: the returned validation metric is minimized
)
gc.collect()
torch.cuda.empty_cache()
# 6 trials, one at a time, with garbage collection after each trial.
study.optimize(objective, n_trials=6, n_jobs=1, gc_after_trial=True)

# Analyze the results
## requirements : plotly, nbformat
pareto_trials = study.best_trials

# Log the best trial(s) with their metric values and sampled parameters.
logging.info(f"Best trial for {dataset_name} : {study.best_trials}")
for trial in pareto_trials:
    logging.info(f"Trial #{trial.number}")
    logging.info(f"  Metric value: {trial.values}")
    #logging.info(f"  DOA: {trial.values[1]}")
    logging.info(f"  Params: {trial.params}")

#### 2.2. Parallelized

In [None]:
#ipcluster start --n=3
#ipcluster stop

# Parallel hyperparameter search setup: connect to the ipyparallel cluster,
# make the project importable on every engine and prepare the search config.
reload(utils)
reload(model)
reload(dataset)
from ipyparallel import Client
import dill

cat_absolute_path = os.path.abspath('../../')

# Connect to the running cluster; dill is used to serialise objects sent to the engines.
rc = Client()
rc[:].use_dill()
lview = rc.load_balanced_view()


# Embed the path with repr() so the generated statement stays valid Python even
# when the path contains quotes or backslashes (e.g. Windows paths).
rc[:].execute(f"import sys; sys.path.append({cat_absolute_path!r})")
print("sys.path.append("+cat_absolute_path+")")
# Import the modules used by `launch_test` locally and on every engine.
with rc[:].sync_imports():
    import json
    from IMPACT import utils, model, dataset
    import logging
    import gc
    import torch

seed = 0
utils.set_seed(0)

# Overrides applied on top of the base configuration for the search.
config['seed'] = seed
config['early_stopping'] = True
config['esc'] = 'error'#'objectives' #'loss' 'delta_objectives'
config['num_epochs']=200
config['eval_freq']=1
config['patience']=30

# Keep the search quiet: no early-stopping logs, no tensorboard, no saved parameters, no progress bars.
config['verbose_early_stopping'] = False
config["tensorboard"] = False
config['flush_freq'] = False
config['save_params']= False
config['disable_tqdm'] = True



def load_dataset(dataset_name : str, i_fold : int = 0) :
    """Load the train/validation splits of one fold of a dataset.

    Reads ``concept_map.json`` and ``metadata.json`` from
    ``../datasets/<dataset_name>/`` and the train/valid quadruple CSV files of
    the requested fold from ``../datasets/2-preprocessed_data/``, then wraps
    each split in a ``dataset.LoaderDataset``.

    Args:
        dataset_name: Name of the dataset directory / CSV file prefix.
        i_fold: Index of the fold to read (defaults to 0, the value that was
            previously hard-coded).

    Returns:
        Tuple ``(train_data, valid_data, concept_map, metadata)``.
    """
    # Release memory held by previous runs before loading new data.
    gc.collect()
    torch.cuda.empty_cache()

    # Context managers close the JSON files as soon as they are parsed.
    with open(f'../datasets/{dataset_name}/concept_map.json', 'r') as concept_map_file:
        raw_concept_map = json.load(concept_map_file)
    # JSON object keys are strings; convert keys and listed values to int.
    concept_map = {int(k): [int(x) for x in v] for k, v in raw_concept_map.items()}
    with open(f'../datasets/{dataset_name}/metadata.json', 'r') as metadata_file:
        metadata = json.load(metadata_file)

    # Each column is listed once (the original literal repeated "dimension_id").
    column_dtypes = {'student_id': int, 'item_id': int, "dimension_id": int, "correct": float}

    def read_split(split):
        # Read one split ('train' or 'valid') of the fold as a NumPy record array.
        path = f'../datasets/2-preprocessed_data/{dataset_name}_{split}_quadruples_vert_{i_fold}.csv'
        return pd.read_csv(path, encoding='utf-8').to_records(index=False, column_dtypes=column_dtypes)

    train_data = dataset.LoaderDataset(read_split('train'), concept_map, metadata)
    valid_data = dataset.LoaderDataset(read_split('valid'), concept_map, metadata)

    return train_data,valid_data,concept_map,metadata

def launch_test(trial,train_data,valid_data,config) :
    """Train one IMPACT model with ``config`` and return its best validation metric.

    Meant to be submitted to an engine by the parallel ``objective``; ``trial``
    is only read for logging (its number and sampled parameters).
    """
    gc.collect()
    torch.cuda.empty_cache()

    trainer = model.IMPACT(**config)

    # Initialise, then fit the model.
    trainer.init_model(train_data, valid_data)
    trainer.train(train_data, valid_data)

    score = trainer.best_valid_metric

    logging.info("".join((
        "-------Trial number : ", str(trial.number),
        "\nBest epoch : ", str(trainer.best_epoch),
        "\nValues : [", str(score),
        "]\nParams : ", str(trial.params),
    )))

    # Drop the model explicitly so its memory can be reclaimed.
    del trainer.model
    del trainer

    gc.collect()
    torch.cuda.empty_cache()

    return score


def objective(trial):
    """Optuna objective for the parallel search: run one trial on an engine.

    Samples ``learning_rate`` and ``lambda`` on a log scale, submits
    ``launch_test`` to the load-balanced view and blocks until the engine
    returns the best validation metric.

    The hyperparameters are placed in a per-trial copy of the global
    ``config`` rather than written into it: the study is run with several
    threads (``n_jobs=3``), and mutating the shared dict would let one trial
    overwrite another trial's values between assignment and submission.
    """
    lr = trial.suggest_float('learning_rate', 0.001, 0.01, log=True)
    lambda_param = trial.suggest_float('lambda', 1.2e-6, 1.6e-6, log=True)
    #num_responses = trial.suggest_int('num_responses', 11,13)

    # Shallow copy with the trial-specific values; the shared config is left untouched.
    trial_config = {
        **config,
        'learning_rate': lr,
        'lambda': lambda_param,
        'num_responses': 12,
    }

    return lview.apply_async(launch_test,trial,train_data,valid_data, trial_config).get()



In [None]:
%%time
# Run the parallel hyperparameter search on the "movielens" dataset.
dataset_name = "movielens"
logging.info(dataset_name)
# These globals are read by `objective` and sent to the engines with each trial.
train_data,valid_data,concept_map,metadata = load_dataset(dataset_name)

study = optuna.create_study(
    directions=["minimize"],  # Single objective: the returned validation metric is minimized
)
gc.collect()
torch.cuda.empty_cache()
# 100 trials, up to 3 running concurrently, with garbage collection after each trial.
study.optimize(objective, n_trials=100, n_jobs=3, gc_after_trial=True)

# Analyze the results
## requirements : plotly, nbformat
pareto_trials = study.best_trials

# Log the best trial(s) with their metric values and sampled parameters.
logging.info(f"Best trial for {dataset_name} : {study.best_trials}")
for trial in pareto_trials:
    logging.info(f"Trial #{trial.number}")
    logging.info(f"  Metric value: {trial.values}")
    #logging.info(f"  DOA: {trial.values[1]}")
    logging.info(f"  Params: {trial.params}")

#### 2.3. Number of parameters computation

In [None]:
# Parameter-count estimate, displayed as the cell output. It uses whichever
# `metadata` dict is currently bound in the notebook.
d_in=5
num_responses=13
# Sum of three terms: items * responses * d_in  +  users * dimensions  +  dimensions^2 * d_in.
# NOTE(review): presumably these correspond to the model's embedding tables - confirm against IMPACT.model.
metadata['num_item_id']*num_responses*d_in+metadata['num_user_id']*metadata['num_dimension_id']+metadata['num_dimension_id']*metadata['num_dimension_id']*d_in

### 3. CDM Prediction
#### 3.1. Parallel training and testing

In [None]:
#ipcluster start --n=4
#ipcluster stop

In [None]:
# Parallel training/testing setup: connect to the ipyparallel cluster, make the
# project importable on every engine and prepare the evaluation config.
reload(utils)
reload(model)
reload(dataset)

import warnings
import numpy as np
from ipyparallel import Client
from IMPACT import utils
import dill

cat_absolute_path = os.path.abspath('../../')

# Connect to the running cluster; dill is used to serialise objects sent to the engines.
rc = Client()
rc[:].use_dill()
lview = rc.load_balanced_view()

# Synchronize imports with all engines:
# Embed the path with repr() so the generated statement stays valid Python even
# when the path contains quotes or backslashes (e.g. Windows paths).
rc[:].execute(f"import sys; sys.path.append({cat_absolute_path!r})")
# NOTE(review): this relative path is hard-coded, so it only resolves when the engines
# were started from the directory that contains `liriscat/`.
rc[:].execute("import os; os.chdir('./liriscat/experiments/notebook_examples')")

with rc[:].sync_imports():
    from IMPACT import utils, model, dataset

config["disable_tqdm"] = True
config["tensorboard"] = False
config['flush_freq'] = False
config['early_stopping'] = True
config['save_params']=True # Save all model parameters and save an array of the embeddings
config['verbose_early_stopping'] = False
config['esc'] = 'error'

def launch_training(seed:int,config:dict,train_data,valid_data,test_data, concept_map) :
    """Train one IMPACT model for a given seed and evaluate it on the test split.

    Returns:
        Tuple ``(metrics, emb)`` where ``metrics`` maps "mae", "rmse" and
        "pc-er" to one-element lists and ``emb`` is the user embedding weight
        matrix as a NumPy array.
    """
    utils.set_seed(seed)
    config['seed'] = seed

    trainer = model.IMPACT(**config)

    # Initialise, then fit the model.
    trainer.init_model(train_data, valid_data)
    trainer.train(train_data, valid_data)

    # User embeddings, detached from the graph and moved to host memory.
    emb = trainer.model.users_emb.weight.detach().cpu().numpy()

    pc_er = trainer.evaluate_emb(test_data,concept_map)['pc-er']
    test_scores = trainer.evaluate_test(test_data)

    # Convert the scores to plain Python values before returning them.
    metrics = {
        "mae": [test_scores["mae"].cpu().numpy().tolist()],
        "rmse": [test_scores["rmse"].cpu().numpy().tolist()],
        "pc-er": [pc_er],
    }

    return (metrics,emb)

def fold_test(i_fold : int, dataset_name:str, config : dict) :
    """Train and evaluate IMPACT on one fold, one engine task per seed.

    Loads the concept map, metadata and the train/valid/test quadruple CSV
    files of fold ``i_fold``, submits ``launch_training`` for seeds 0..2 to the
    load-balanced view, then computes the "doa" and "rm" metrics locally from
    the returned user embeddings and writes each embedding matrix to
    ``../embs/``.

    Args:
        i_fold: Index of the fold to evaluate.
        dataset_name: Name of the dataset directory / CSV file prefix.
        config: Keyword arguments forwarded to ``model.IMPACT``.

    Returns:
        Dict mapping "mae", "rmse", "pc-er", "doa" and "rm" to lists with one
        entry per seed.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # Silence the warnings raised while computing doa and rm.
    warnings.filterwarnings("ignore", message="invalid value encountered in divide")
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    # Context managers close the JSON files as soon as they are parsed.
    with open(f'../datasets/{dataset_name}/concept_map.json', 'r') as concept_map_file:
        raw_concept_map = json.load(concept_map_file)
    # JSON object keys are strings; convert keys and listed values to int.
    concept_map = {int(k): [int(x) for x in v] for k, v in raw_concept_map.items()}
    with open(f'../datasets/{dataset_name}/metadata.json', 'r') as metadata_file:
        metadata = json.load(metadata_file)
    concept_array, concept_lens=utils.preprocess_concept_map(concept_map)

    column_dtypes = {'student_id': int, 'item_id': int, "correct": float, "dimension_id": int}

    def read_split(split):
        # Read one split ('train', 'valid' or 'test') of the fold as a NumPy record array.
        path = f'../datasets/2-preprocessed_data/{dataset_name}_{split}_quadruples_vert_{i_fold}.csv'
        return pd.read_csv(path, encoding='utf-8').to_records(index=False, column_dtypes=column_dtypes)

    # read datasets
    train_quadruplets = read_split('train')
    valid_quadruplets = read_split('valid')
    test_quadruplets = read_split('test')

    train_data = dataset.LoaderDataset(train_quadruplets, concept_map, metadata)
    valid_data = dataset.LoaderDataset(valid_quadruplets, concept_map, metadata)
    test_data = dataset.LoaderDataset(test_quadruplets, concept_map, metadata)

    # Submit every seed first so the engines can work concurrently.
    seeds_combinations = [
        (seed, lview.apply_async(launch_training, seed, config, train_data, valid_data, test_data, concept_map))
        for seed in range(3)
    ]

    metrics = {"mae":[],"rmse":[], "pc-er" : [], "doa": [], 'rm' : []}

    for seed,async_result in seeds_combinations:
        # Block until this seed's training has finished.
        metric, emb = async_result.get()
        logging.info(f"Test done - seed : {seed}, i_fold : {i_fold}")
        for k in metric.keys():
            metrics[k].extend(metric[k])

        # Embedding-based metrics are computed locally from the returned embeddings.
        metrics["doa"].append(np.mean(utils.evaluate_doa(emb,test_data.log_tensor.cpu().numpy(),metadata,concept_map)))
        metrics["rm"].append(np.mean(utils.compute_rm_fold(emb,test_quadruplets, concept_array, concept_lens)))
        pd.DataFrame(emb).to_csv("../embs/"+dataset_name+"_IMPACT_cornac_Iter_fold"+str(i_fold)+"_seed_"+str(seed)+".csv",index=False,header=False)

    return metrics


def test(dataset_name:str, config : dict) :
    """Evaluate IMPACT on the 5 folds of a dataset and log mean +- std per metric.

    Args:
        dataset_name: Name of the dataset directory / CSV file prefix.
        config: Keyword arguments forwarded to ``model.IMPACT``. Its
            ``embs_path`` and ``params_path`` entries are overwritten so they
            point at ``dataset_name``.

    Returns:
        Dict mapping "mae", "rmse", "pc-er", "doa" and "rm" to lists with one
        entry per (fold, seed) run.
    """
    logging.info(f'#### {dataset_name} ####')
    logging.info(f'#### config : {config} ####')
    # Point the output paths at the dataset being evaluated, as the sequential
    # `test` does; otherwise every dataset reuses the paths of the initial
    # config (presumably where IMPACT writes when `save_params` is set - confirm).
    config['embs_path']='../embs/'+str(dataset_name)
    config['params_path']='../ckpt/'+str(dataset_name)

    metrics = {"mae":[],"rmse":[], "pc-er" : [], "doa": [], 'rm' : []}

    # Folds are processed one after another; the seeds of a fold run in parallel inside fold_test.
    for i_fold in range(5):
        fold_metrics = fold_test(i_fold,dataset_name, config)
        for k in metrics:
            metrics[k].extend(fold_metrics[k])

    df = pd.DataFrame(metrics)
    for name in ('rmse', 'mae', 'pc-er', 'doa', 'rm'):
        logging.info('{} : {:.4f} +- {:.4f}'.format(name, df[name].mean(), df[name].std()))

    return metrics

In [None]:
%%time

# Parallel evaluation of four datasets, each with its own hyperparameters.
# `metrics` is overwritten by every call, so only the last dataset's results remain bound.
dataset_name = "postcovid"
logging.info(dataset_name)
config['learning_rate'] = 0.02026
config['lambda'] = 1.2e-5
config['d_in'] = 4
config['num_responses'] = 12
metrics = test(dataset_name,config)

dataset_name = "movielens"
logging.info(dataset_name)
config['learning_rate'] = 0.02515
config['lambda'] = 2e-7
config['d_in'] = 10
config['num_responses'] = 12
metrics = test(dataset_name,config)

dataset_name = "portrait"
logging.info(dataset_name)
config['learning_rate'] = 0.04568
config['lambda'] = 2e-7
config['d_in'] = 6
config['num_responses'] = 12
metrics = test(dataset_name,config)

dataset_name = "promis"
logging.info(dataset_name)
config['learning_rate'] = 0.01227
config['lambda'] = 1e-7
config['d_in'] = 6
config['num_responses'] = 13
metrics = test(dataset_name,config)

#### 3.2. Sequential training and testing

In [None]:
import warnings
import numpy as np

# Free memory left over from earlier cells.
gc.collect()
torch.cuda.empty_cache()

# Re-import the project modules so that local source edits are picked up.
reload(utils)
reload(model)
reload(dataset)

# Overrides for the sequential train/test run.
config.update({
    "disable_tqdm": True,
    "tensorboard": False,
    'flush_freq': False,
    'early_stopping': True,
    'save_params': False,  # Save all model parameters and save an array of the embeddings
    'num_epochs': 5,
    'verbose_early_stopping': False,
    'esc': 'error',
})

def test(dataset_name:str, config : dict, n_folds : int = 1, n_seeds : int = 1) :
    """Sequentially train and evaluate IMPACT on a dataset.

    For every fold the concept map, metadata and train/valid/test quadruple
    CSV files are loaded; for every seed a model is trained, evaluated on the
    test split and its user embeddings are written to ``../embs/``. The mean
    and standard deviation of each metric are logged at the end.

    Args:
        dataset_name: Name of the dataset directory / CSV file prefix.
        config: Keyword arguments forwarded to ``model.IMPACT``. Its ``seed``,
            ``embs_path`` and ``params_path`` entries are overwritten.
        n_folds: Number of folds to evaluate, starting at fold 0 (defaults to
            1, the value that was previously hard-coded).
        n_seeds: Number of seeds per fold, starting at seed 0 (defaults to 1,
            the value that was previously hard-coded).

    Returns:
        Dict mapping "mae", "rmse", "pc-er", "doa" and "rm" to lists with one
        entry per (fold, seed) run.
    """
    logging.info(f'#### {dataset_name} ####')
    logging.info(f'#### config : {config} ####')
    config['embs_path']='../embs/'+str(dataset_name)
    config['params_path']='../ckpt/'+str(dataset_name)

    metrics = {"mae":[],"rmse":[], "pc-er" : [], "doa": [], 'rm' : []}

    # Silence the warnings raised while computing doa and rm (set once, not per fold).
    warnings.filterwarnings("ignore", message="invalid value encountered in divide")
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    column_dtypes = {'student_id': int, 'item_id': int, "correct": float, "dimension_id": int}

    for i_fold in range(n_folds):

        gc.collect()
        torch.cuda.empty_cache()

        # Context managers close the JSON files as soon as they are parsed.
        with open(f'../datasets/{dataset_name}/concept_map.json', 'r') as concept_map_file:
            raw_concept_map = json.load(concept_map_file)
        # JSON object keys are strings; convert keys and listed values to int.
        concept_map = {int(k): [int(x) for x in v] for k, v in raw_concept_map.items()}
        with open(f'../datasets/{dataset_name}/metadata.json', 'r') as metadata_file:
            metadata = json.load(metadata_file)
        concept_array, concept_lens=utils.preprocess_concept_map(concept_map)

        def read_split(split):
            # Read one split ('train', 'valid' or 'test') of the current fold as a NumPy record array.
            path = f'../datasets/2-preprocessed_data/{dataset_name}_{split}_quadruples_vert_{i_fold}.csv'
            return pd.read_csv(path, encoding='utf-8').to_records(index=False, column_dtypes=column_dtypes)

        # read datasets
        train_quadruplets = read_split('train')
        valid_quadruplets = read_split('valid')
        test_quadruplets = read_split('test')

        train_data = dataset.LoaderDataset(train_quadruplets, concept_map, metadata)
        valid_data = dataset.LoaderDataset(valid_quadruplets, concept_map, metadata)
        test_data = dataset.LoaderDataset(test_quadruplets, concept_map, metadata)

        for seed in range(n_seeds):

            # Set the seed
            utils.set_seed(seed)
            config['seed'] = seed

            algo = model.IMPACT(**config)

            # Init model
            algo.init_model(train_data, valid_data)

            # train model ----
            algo.train(train_data, valid_data)

            # test model ----
            metrics["pc-er"].append(algo.evaluate_emb(test_data,concept_map)['pc-er'])
            test_scores = algo.evaluate_test(test_data)
            # Store plain Python values, as the parallel `launch_training` does.
            metrics["rmse"].append(test_scores["rmse"].cpu().numpy().tolist())
            metrics["mae"].append(test_scores["mae"].cpu().numpy().tolist())
            emb = algo.model.users_emb.weight.detach().cpu().numpy()
            metrics["doa"].append(np.mean(utils.evaluate_doa(emb,test_data.log_tensor.cpu().numpy(),metadata,concept_map)))
            metrics["rm"].append(np.mean(utils.compute_rm_fold(emb,test_quadruplets, concept_array, concept_lens)))

            pd.DataFrame(emb).to_csv("../embs/"+dataset_name+"_IMPACT_cornac_Iter_fold"+str(i_fold)+"_seed_"+str(seed)+".csv",index=False,header=False)

    df = pd.DataFrame(metrics)
    # With a single run the standard deviation is NaN.
    for name in ('rmse', 'mae', 'pc-er', 'doa', 'rm'):
        logging.info('{} : {:.4f} +- {:.4f}'.format(name, df[name].mean(), df[name].std()))

    return metrics

In [None]:
%%time
# Sequential evaluation of "postcovid" with its dataset-specific hyperparameters.
config['low_mem'] = False

dataset_name = "postcovid"
logging.info(dataset_name)
config['learning_rate'] = 0.02026
config['lambda'] = 1.2e-5
config['d_in'] = 4
config['num_responses'] = 12
metrics = test(dataset_name,config)

In [None]:
%%time

# Sequential evaluation of "assist0910".
# NOTE(review): the hyperparameter values are identical to the ones used for "postcovid" - confirm this is intended.
dataset_name = "assist0910"
logging.info(dataset_name)
config['learning_rate'] = 0.02026
config['lambda'] = 1.2e-5
config['d_in'] = 4
config['num_responses'] = 12
metrics = test(dataset_name,config)

In [None]:

# Sequential evaluation of three more datasets, each with its own hyperparameters.
# `metrics` is overwritten by every call, so only the last dataset's results remain bound.
dataset_name = "movielens"
logging.info(dataset_name)
config['learning_rate'] = 0.02515
config['lambda'] = 2e-7
config['d_in'] = 10
config['num_responses'] = 12
metrics = test(dataset_name,config)

dataset_name = "portrait"
logging.info(dataset_name)
config['learning_rate'] = 0.04568
config['lambda'] = 2e-7
config['d_in'] = 6
config['num_responses'] = 12
metrics = test(dataset_name,config)

dataset_name = "promis"
logging.info(dataset_name)
config['learning_rate'] = 0.01227
config['lambda'] = 1e-7
config['d_in'] = 6
config['num_responses'] = 13
metrics = test(dataset_name,config)