# Experiments for CDM exclusively

In [6]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

from importlib import reload


import DBPR
from DBPR.dataset import *
from DBPR import utils
from DBPR import model
import sys
import json
import logging
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'CAT'

In [14]:
def setuplogger():
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("[%(levelname)s %(asctime)s] %(message)s")
    formatter.default_time_format = "%M:%S"
    formatter.default_msec_format = ""
    handler.setFormatter(formatter)
    for handler in root.handlers[:]:
        root.removeHandler(handler)
    root.addHandler(handler)    
setuplogger()

In [15]:
# choose dataset here
dataset_name = 'portrait'
# modify config here
config = {
    'learning_rate': 0.001,
    'batch_size': 2048,
    'num_epochs': 1,
    'num_dim': 10, # for IRT or MIRT todo : is it necessary as we use concepts knowledge number as embedding dimension ?
    'eval_freq' : 1,
    'patience' : 6,
    'device': 'cpu',
    'lambda' : 1e-7,
    # for NeuralCD
    'prednet_len1': 128,
    'prednet_len2': 64,
    'best_params_path':'../ckpt/',
    #For GCCD
    'num_layers': 0,
    'version': 'pair',
    'p_dropout': 0,
    'low_mem_mode' : True,
    'user_nbrs_n' : 10,
    'item_nbrs_n' : 5
}
concept_map = json.load(open(f'../datasets/{dataset_name}/concept_map.json', 'r'))
concept_map = {int(k):[int(x) for x in v] for k,v in concept_map.items()}
metadata = json.load(open(f'../datasets/{dataset_name}/metadata.json', 'r'))

## CDM Training

In [45]:
# grid_search
for p_dropout in [0]:
    config['p_dropout'] = p_dropout
    for num_layers in [1]:
        config['num_layers'] = num_layers
        # read datasets
        algo = model.GCCD(**config)
        metrics = []
            
        for i_fold in range(1) :
                           
            train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
            valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
            test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
            
            
            train_data = dataset.LoaderDataset(train_triplets, concept_map, metadata)
            valid_data = dataset.LoaderDataset(valid_triplets, concept_map, metadata) 
            test_data = dataset.LoaderDataset(test_triplets, concept_map,metadata) 
            # define model here
        
            for seed in range(1) : 
                # Set the seed
                utils.set_seed(seed)
                
                # train model
                algo.init_model(train_data, None)
                algo.train(train_data, valid_data,test_data)
                test = algo.evaluate_test(test_data)
                print(f'test : {test}')
                metrics.append(test)
                
        df = pd.DataFrame(metrics)
        print(f'dropout : {config['p_dropout']}; num_layers : {config['num_layers']}')
        print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(),df['rmse'].std()))

Low memory mode activated
[INFO 02:35] train on cpu
[INFO 02:35] -- START Training --


 71%|███████   | 54/76 [00:02<00:01, 21.63it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x798b7a36ac90>>
Traceback (most recent call last):
  File "/home/arthurb/anaconda3/envs/liriscat/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
100%|██████████| 76/76 [00:03<00:00, 19.52it/s]
 29%|██▉       | 22/76 [00:01<00:02, 20.24it/s]


KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(metrics)
print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(),df['rmse'].std()))

In [None]:
loss_list = [7192,7149,7139,7133,7130,7128,7127,7126,7126,7126,7126,7126,7126,7126]
rmse_list = [2226,2039,1991,1964,1947,1940,1934,1930,1928,1927,1926,1926,1926,1926]
U_mean_dist_list = [0.007051741238683462,0.010280357673764229,0.012986190617084503,0.016543250530958176,0.016660312190651894,0.018504280596971512,0.0225063469260931,0.024717843160033226,0.028801996260881424,0.026518404483795166,0.028305459767580032,0.02960900403559208,0.02976769208908081,0.030901530757546425]
I_mean_dist_list = [0.004827531985938549,0.010112187825143337,0.015812134370207787,0.01486632227897644,0.023593515157699585,0.036251962184906006,0.03952261433005333,0.05163084343075752,0.061826325953006744,0.07495086640119553,0.07987217605113983,0.08609910309314728,0.08502209931612015,0.09476266801357269]
import matplotlib.pyplot as plt

plt.title("RMSE, Loss, and embeddings ave distances of neigbors over training epochs")
plt.plot(range(len(loss_list)), loss_list, label='loss',c ="orange")
plt.twinx()
plt.plot(range(len(rmse_list)), rmse_list, label='rmse',c='b')
plt.twinx()
plt.plot(range(len(U_mean_dist_list)), U_mean_dist_list, label='U',c='red')
plt.twinx()
plt.plot(range(len(I_mean_dist_list)), I_mean_dist_list, label='I',c='yellow')

plt.legend()


In [None]:
!pip install tensorboard

In [59]:
from tensorboardX import SummaryWriter
import tensorboardX
%load_ext tensorboard

%tensorboard --logdir /home/arthurb/Programmation/liriscat/scripts/logs

# Connect to 

Non-negative Matrix Factorization

In [4]:
from sklearn.decomposition import NMF
from DBPR.model import abstract_model as am 
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.sparse import csr_matrix

metrics = []

for i_fold in range(5) :  
    
    # read datasets
    train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv', encoding='utf-8')
    valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv', encoding='utf-8')
    test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False)

    train_valid_triplets = pd.concat([train_triplets, valid_triplets]).to_records(index=False)   
       
    train_valid_data = dataset.LoaderDataset(train_valid_triplets, concept_map, metadata)
    test_data = dataset.LoaderDataset(test_triplets, concept_map,metadata)
    # define model here

    for seed in range(1) : 
        algo = NMF(n_components=metadata["num_dimension_id"], init='random',max_iter=10000, random_state=seed)
        
        # Set the seed
        utils.set_seed(seed)
        imp = IterativeImputer(max_iter=20, random_state=seed, missing_values=0.0,min_value=1.0, max_value=float(train_valid_data.log_tensor.max())) 
        
        # train algo ----
        # Impute missing data
        X_train = csr_matrix(train_valid_data.log_tensor.numpy())
        X_train = train_valid_data.log_tensor.numpy()
        imp.fit(X_train)
        X_train = imp.transform(X_train)
        algo.fit(X_train)
        
        # test algo ----
        X_test = test_data.log_tensor.numpy()
        X_test = imp.transform(X_test)
        users_emb = algo.transform(X_test)
        items_emb = algo.components_
        
        y_true = test_data.log_tensor.reshape(-1).numpy()
        y_pred = (users_emb @ items_emb).reshape(-1)
        metrics.append(am.root_mean_squared_error(y_pred[y_true !=0],y_true[y_true !=0]))

[0.20041929]
[0.20041929, 0.20112045]
[0.20041929, 0.20112045, 0.20127916]


KeyboardInterrupt: 

In [22]:
print('rmse : {:.4f} +- {:.4f}'.format(np.mean(metrics),np.std(metrics)))

rmse : 0.2017 +- 0.0010


SVD

In [25]:
from sklearn.decomposition import TruncatedSVD
from DBPR.model import abstract_model as am 
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

metrics = []

for i_fold in range(5) :  
    
    # read datasets
    train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv', encoding='utf-8')
    valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv', encoding='utf-8')
    test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False)

    train_valid_triplets = pd.concat([train_triplets, valid_triplets]).to_records(index=False)   
       
    train_valid_data = dataset.LoaderDataset(train_valid_triplets, concept_map, metadata)
    test_data = dataset.LoaderDataset(test_triplets, concept_map,metadata)
    # define algo here

    for seed in range(1) : 
        algo = TruncatedSVD(n_components=metadata["num_dimension_id"], n_iter=10000, random_state=seed)
        
        # Set the seed
        utils.set_seed(seed)
        imp = IterativeImputer(max_iter=20, random_state=seed, missing_values=0.0,min_value=1.0, max_value=float(train_valid_data.log_tensor.max())) # todo : attention min value is 1
        
        # Impute missing data
        #X_train = csr_matrix(train_valid_data.log_tensor.numpy())
        X_train = train_valid_data.log_tensor.numpy()
        imp.fit(X_train)
        X_train = imp.transform(X_train)
        # train algo
        algo.fit(X_train)
        
        X_test = test_data.log_tensor.numpy()
        X_test = imp.transform(X_test)
        users_emb = algo.transform(X_test)
        items_emb = algo.components_
        
        y_true = test_data.log_tensor.reshape(-1).numpy()
        y_pred = (users_emb @ items_emb).reshape(-1)
        metrics.append(am.root_mean_squared_error(y_pred[y_true !=0],y_true[y_true !=0]))

In [26]:
print('rmse : {:.4f} +- {:.4f}'.format(np.mean(metrics),np.std(metrics)))

rmse : 2.426 +- 0.008


Average response prediction

In [73]:
# read datasets
algo = model.Averaging(**config)
metrics = []
    
for i_fold in range(5) : 
                   
    train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
    valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
    test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
    
    
    train_data = dataset.LoaderDataset(train_triplets, concept_map, metadata)
    valid_data = dataset.LoaderDataset(valid_triplets, concept_map, metadata) # todo : unused for the moment
    test_data = dataset.LoaderDataset(test_triplets, concept_map,metadata) 
    # define model here

    for seed in range(1) : 
        # Set the seed
        utils.set_seed(seed)
        
        # train model
        algo.init_model(train_data, None)
        metrics.append(algo.evaluate_test(test_data))

In [74]:
df = pd.DataFrame(metrics)
print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(),df['rmse'].std()))

rmse : 0.4126 +- 0.0004


# Neural Net

In [25]:
# read datasets
from importlib import reload

metrics = []
embeddings = []
config['learning_rate'] = 0.001

for i_fold in range(5) : 
    algo = model.NN(**config)
                   
    train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
    valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
    test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
    
    
    train_data = dataset.LoaderDataset(train_triplets, concept_map, metadata)
    valid_data = dataset.LoaderDataset(valid_triplets, concept_map, metadata) # todo : unused for the moment
    test_data = dataset.LoaderDataset(test_triplets, concept_map,metadata) 
    # define model here

    for seed in range(1) : 
        # Set the seed
        utils.set_seed(seed)
        
        # train model
        embeddings.append(algo.get_user_emb())

        algo.init_model(train_data, None)
        algo.train(train_data, valid_data)
        
        
        metrics.append(algo.evaluate_test(test_data))
        
df = pd.DataFrame(metrics)
print(f'lr : {lr}; num_layers : {num_layers}')
print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(),df['rmse'].std()))

AttributeError: 'NoneType' object has no attribute 'users_emb'

In [43]:
df = pd.DataFrame(metrics)
print(f'lr : {lr}; num_layers : {num_layers}')
print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(),df['rmse'].std()))

lr : 0.01; num_layers : 1
rmse : 2.521 +- nan


## DotProduct

In [9]:
from importlib import reload
reload(DBPR.utils)
reload(DBPR.model)
# read datasets

metrics = []

for i_fold in range(5):
    config['learning_rate'] = 0.001
    algo = model.DotProduct(**config)

    train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv',
                                 encoding='utf-8').to_records(index=False,
                                                              column_dtypes={'student_id': int, 'item_id': int,
                                                                             "correct": float})
    valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv',
                                 encoding='utf-8').to_records(index=False,
                                                              column_dtypes={'student_id': int, 'item_id': int,
                                                                             "correct": float})
    test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv',
                                encoding='utf-8').to_records(index=False,
                                                             column_dtypes={'student_id': int, 'item_id': int,
                                                                            "correct": float})

    train_data = dataset.LoaderDataset(train_triplets, concept_map, metadata)
    valid_data = dataset.LoaderDataset(valid_triplets, concept_map, metadata)  # todo : unused for the moment
    test_data = dataset.LoaderDataset(test_triplets, concept_map, metadata)
    # define model here

    for seed in range(1):
        # Set the seed
        utils.set_seed(seed)

        # train model
        algo.init_model(train_data, None)
        algo.train(train_data, valid_data)
        metrics.append(algo.evaluate_test(test_data))


df = pd.DataFrame(metrics)
print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(), df['rmse'].std()))

[INFO 13:21] train on cpu
[INFO 13:21] -- START Training --


100%|██████████| 76/76 [00:01<00:00, 51.94it/s]
100%|██████████| 76/76 [00:01<00:00, 53.45it/s]
100%|██████████| 76/76 [00:01<00:00, 56.47it/s]


[INFO 13:26] Epoch [2] 
- Losses : train=nan, valid=nan, best_valid=9223372036854775808.0000 
- RMSE   :       -       valid=nan,  valid_4b_loss=9223372036854775808.0000,


100%|██████████| 76/76 [00:01<00:00, 58.39it/s]
100%|██████████| 76/76 [00:01<00:00, 48.05it/s]
100%|██████████| 76/76 [00:01<00:00, 47.94it/s]


[INFO 13:30] Epoch [5] 
- Losses : train=nan, valid=nan, best_valid=9223372036854775808.0000 
- RMSE   :       -       valid=nan,  valid_4b_loss=9223372036854775808.0000,


100%|██████████| 76/76 [00:01<00:00, 47.52it/s]
 14%|█▍        | 11/76 [00:00<00:01, 42.20it/s]


KeyboardInterrupt: 

### Binary CDMs experiments

In [None]:
# grid_search
for lr in [0.010]:
    config['learning_rate'] = lr
    for num_layers in [0]:
        config['num_layers'] = num_layers
        # read datasets
        algo = model.GCCD(**config)
        metrics = []
            
        for i_fold in range(5) : 
                           
            train_triplets = pd.read_csv(f'../datasets/{dataset_name}/train_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
            valid_triplets = pd.read_csv(f'../datasets/{dataset_name}/valid_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
            test_triplets = pd.read_csv(f'../datasets/{dataset_name}/test_triples_vert_{i_fold}.csv', encoding='utf-8').to_records(index=False,column_dtypes={'student_id': int, 'item_id': int, "correct": float})
            
            
            train_data = dataset.LoaderDataset(train_triplets, concept_map, metadata)
            valid_data = dataset.LoaderDataset(valid_triplets, concept_map, metadata)
            test_data = dataset.LoaderDataset(test_triplets, concept_map,metadata) 
            
            
            # Data transformation for MIRT
            # Transform data to torch Dataloader (i.e., batchify)

            
            import torch
            from torch.utils.data import TensorDataset, DataLoader
            
            def transform(x, y, z, batch_size, **params):
                dataset = TensorDataset(
                    torch.tensor(x, dtype=torch.int64),
                    torch.tensor(y, dtype=torch.int64),
                    torch.tensor(z, dtype=torch.float)
                )
                return DataLoader(dataset, batch_size=batch_size, **params)
            
            train_data, valid_data, test_data = [
                transform(data["student_id"], data["item_id"], data["correct_binary"],config['batch_size'] )
                for data in [train_data, valid_data, test_data]
            ]

            # define model here
        
            for seed in range(1) : 
                # Set the seed
                utils.set_seed(seed)
                
                # train model
                cdm = MIRT(4164, 17747, 123)
                cdm.train(train, valid, epoch=2)
                cdm.save("mirt.params")

                algo.init_model(train_data, None)
                algo.train(train_data, valid_data)
                metrics.append(algo.evaluate_test(test_data))
                
        df = pd.DataFrame(metrics)
        print(f'lr : {lr}; num_layers : {num_layers}')
        print('rmse : {:.4f} +- {:.4f}'.format(df['rmse'].mean(),df['rmse'].std()))