# CDM pretraining
### Import

## Train IMPACT model

In [None]:
from IMPACT.utils import generate_eval_config
from IMPACT.dataset import LoaderDataset as IMPACT_dataset
from IMPACT import model
from micat.dataset import preprocessing_utilities as pu

In [None]:
# Experiment scope used by the IMPACT training cells.
dataset_name = "math2"
folds_nb = 1

# Run configuration for IMPACT, one keyword per line for readability.
IMPACT_config = generate_eval_config(
    save_params=True,
    dataset_name=dataset_name,
    embs_path="../embs/",
    params_path="../ckpt/",
    learning_rate=0.016848380924625605,
    lambda_=9.972254466547545e-06,
    batch_size=2048,
    num_epochs=200,
    valid_metric='mi_acc',
    pred_metrics=["mi_acc"],
    profile_metrics=['doa'],
)

# Dataset-level resources that are passed to every dataset built below.
concept_map, metadata, nb_modalities = pu.load_dataset_resources(IMPACT_config)

In [None]:
# Train one IMPACT model per (seed, fold) pair and print its prediction metrics.
for seed in range(3):
    IMPACT_config['seed'] = seed
    for i_fold in range(folds_nb):
        IMPACT_config['i_fold'] = i_fold

        # Train/validation split for the current fold.
        vertical_train, vertical_valid = pu.vertical_data(IMPACT_config, i_fold)

        # Wrap both splits (train first, then validation) in IMPACT datasets.
        impact_train_data, impact_valid_data = (
            IMPACT_dataset(split, concept_map, metadata, nb_modalities)
            for split in (vertical_train, vertical_valid)
        )

        # A fresh model is built for every (seed, fold) pair.
        algo = model.IMPACT(**IMPACT_config)
        algo.init_model(impact_train_data, impact_valid_data)
        algo.train(impact_train_data, impact_valid_data)

        # Validation metrics are printed first, training metrics second.
        print(algo.evaluate_predictions(impact_valid_data))
        print(algo.evaluate_predictions(impact_train_data))

In [None]:
# Print the profile metrics of `algo` on `impact_valid_data`. Both names are
# whatever the last iteration of the preceding (seed, fold) loop left in scope.
print(algo.evaluate_profiles(impact_valid_data))

## Train NCDM model

In [None]:
# IPython magics: reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

import os
# Environment variables are set before `torch` is imported further down.
# NOTE(review): presumably they must be in place before CUDA initialises — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["PYTHONHASHSEED"] = "0"
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import micat
micat.utils.set_seed(0)

# NOTE(review): logging, gc and json are not used in the visible cells.
import logging
import gc
import json
import torch
# NOTE(review): second set_seed(0) call, this time after importing torch; it
# looks redundant with the call above — confirm whether both are needed.
micat.utils.set_seed(0)
import pandas as pd

In [None]:
from IMPACT.utils import generate_eval_config
from IMPACT.dataset import LoaderDataset as IMPACT_dataset
from micat.CDM.NCDM import CATNCDM
from micat.dataset import preprocessing_utilities as pu

In [None]:
# Experiment scope used by the NCDM cells.
dataset_name = "math2"
folds_nb = 1

# Run configuration for the NCDM cells, one keyword per line. Unlike the
# IMPACT configuration it sets load_params=True / save_params=False and a
# short schedule (patience=5, num_epochs=5).
IMPACT_config = generate_eval_config(
    save_params=False,
    load_params=True,
    patience=5,
    num_epochs=5,
    dataset_name=dataset_name,
    embs_path="../embs/",
    params_path="../ckpt/",
    learning_rate=0.016848380924625605,
    lambda_=9.972254466547545e-06,
    batch_size=2048,
    valid_metric='mi_acc',
    pred_metrics=["mi_acc"],
    profile_metrics=['doa'],
)

# Dataset-level resources that are passed to every dataset built below.
concept_map, metadata, nb_modalities = pu.load_dataset_resources(IMPACT_config)


In [None]:
# Build a CATNCDM for each fold and print its validation metrics.
# NOTE(review): iterates over 5 folds although `folds_nb` is set to 1 in the
# configuration cell — confirm which is intended. After the loop, `i_fold`,
# `train_set`, `valid_set` and `cdm` keep the values of the last fold and are
# reused by later cells.
for i_fold in range(5):
    IMPACT_config['i_fold'] = i_fold

    # Train/validation split for the current fold.
    vertical_train, vertical_valid = pu.vertical_data(IMPACT_config, i_fold)

    # Wrap both splits (train first, then validation) in IMPACT datasets.
    train_set, valid_set = (
        IMPACT_dataset(split, concept_map, metadata, nb_modalities)
        for split in (vertical_train, vertical_valid)
    )

    cdm = CATNCDM(**IMPACT_config)
    cdm.init_CDM_model(train_set, valid_set)
    # Training is intentionally disabled in this cell; the call is kept for reference:
    #cdm.train(train_set, valid_set, epoch=IMPACT_config['num_epochs'])

    print(cdm.eval(valid_set))

In [None]:
# Add the evaluation settings in one call; `valid_batch_size` is read when the
# QueryEnv is built in the next cell.
IMPACT_config.update(n_query=16, valid_batch_size=2048)

In [None]:
from micat.dataset import UserCollate, QueryEnv

# Load the preprocessed test interactions of the fold currently held in
# `i_fold` (the value left by the most recent loop over folds).
test_df = pd.read_csv(
    f'../datasets/2-preprocessed_data/{IMPACT_config["dataset_name"]}_test_{i_fold}.csv',
    encoding='utf-8',
    dtype={
        'student_id': int,
        'item_id': int,
        "correct": float,
        "dimension_id": int,
    },
)

# Wrap the test frame in an evaluation dataset, split it using the configured
# seed, and build the query environment on top of it.
test_data = micat.dataset.EvalDataset(test_df, concept_map, metadata, IMPACT_config, nb_modalities)
test_data.split_query_meta(IMPACT_config['seed'])
test_query_env = QueryEnv(test_data, IMPACT_config['valid_batch_size'])

In [None]:
# Notebook inspection only: display the entries of the private `_meta_mask`
# selected by the ids in `test_data.users_id`. The result is not stored.
test_data._meta_mask[torch.tensor(list(test_data.users_id))]

In [None]:
# Train the `cdm` currently in scope on `train_set` / `valid_set`, passing the
# configured `num_epochs` as the `epoch` argument.
cdm.train(train_set, valid_set, epoch=IMPACT_config['num_epochs'])

In [None]:
# Rebuild the model from the current configuration, train it on the datasets
# currently in scope, then report its validation metrics.
cdm = CATNCDM(**IMPACT_config)
cdm.init_CDM_model(train_set, valid_set)
cdm.train(
    train_set,
    valid_set,
    epoch=IMPACT_config['num_epochs'],
)

print(cdm.eval(valid_set))

In [None]:
# Force `load_params` on, rebuild the model without training it, and print
# its validation metrics.
IMPACT_config['load_params'] = True

cdm = CATNCDM(**IMPACT_config)
cdm.init_CDM_model(train_set, valid_set)

print(cdm.eval(valid_set))

In [None]:
# Evaluation set-up for the "assist0910" dataset, fold 0.
# NOTE(review): this cell uses `convert_config_to_EduCAT`, `NCDM` and
# `valid_data`, none of which is imported or assigned in the visible cells —
# it will raise NameError unless they are defined elsewhere.
dataset_name = "assist0910"
i_fold = 0

# NOTE(review): here the embeddings/checkpoint paths are "../embs/" and
# "../ckpt/" concatenated directly with the dataset name, whereas the earlier
# configuration cells pass the bare directories — confirm the intended layout.
IMPACT_config = generate_eval_config(num_epochs=200, patience=30, save_params=True, dataset_name=dataset_name,
                                         embs_path="../embs/" + dataset_name, params_path="../ckpt/" + dataset_name,
                                         learning_rate=7.380681029927064e-05, lambda_=2.2656270501845414e-06, batch_size=2048,valid_metric='rmse', pred_metrics=["mi_acc", 'rmse'],profile_metrics=['doa'])

concept_map, metadata, nb_modalities = pu.load_dataset_resources(IMPACT_config)

IMPACT_config = convert_config_to_EduCAT(IMPACT_config, metadata)

IMPACT_config['i_fold'] = i_fold
vertical_train, vertical_valid = pu.vertical_data(IMPACT_config, i_fold)

impact_train_data = IMPACT_dataset(vertical_train, concept_map, metadata, nb_modalities)
impact_valid_data = IMPACT_dataset(vertical_valid, concept_map, metadata, nb_modalities)


IMPACT_config['load_params'] = True
# NOTE(review): the key written here is 'save', while the configuration above
# is created with `save_params` — confirm which key the model actually reads.
IMPACT_config['save'] = False
cdm = NCDM(metadata['num_dimension_id'], metadata['num_item_id'], metadata['num_user_id'], IMPACT_config)
print(cdm.eval(valid_data))

In [None]:
# Notebook inspection of `train_data`.
# NOTE(review): `train_data` is not assigned in any visible cell; this raises
# NameError unless it is defined elsewhere.
train_data

In [None]:
# Rebuild the IMPACT datasets for seed 1 and convert them with `pu.transform`.
for seed in range(1, 2):
    IMPACT_config['seed'] = seed
    for i_fold in range(folds_nb):
        IMPACT_config['i_fold'] = i_fold

        # Train/validation split for the current fold.
        vertical_train, vertical_valid = pu.vertical_data(IMPACT_config, i_fold)

        impact_train_data = IMPACT_dataset(vertical_train, concept_map, metadata, nb_modalities)
        impact_valid_data = IMPACT_dataset(vertical_valid, concept_map, metadata, nb_modalities)

        # Columns 0 and 1 of `raw_data_array` are cast to long, column 2 is
        # passed as-is (presumably user id, item id and response — TODO confirm).
        # Both splits use the category count of the *training* dataset.
        train_set, valid_set = [
            pu.transform(
                split.raw_data_array[:, 0].long(),
                split.raw_data_array[:, 1].long(),
                concept_map,
                split.raw_data_array[:, 2],
                IMPACT_config['batch_size'],
                impact_train_data.n_categories,
            )
            for split in (impact_train_data, impact_valid_data)
        ]

In [None]:
# Temporarily switch `load_params` on, build an NCDM from the metadata sizes
# and the configuration, print its metrics on `valid_set`, then switch
# `load_params` back off.
# NOTE(review): `NCDM` is not imported in the visible cells (only `CATNCDM`
# is) — confirm where it comes from.
IMPACT_config['load_params'] = True
cdm = NCDM(metadata['num_dimension_id'], metadata['num_item_id'], metadata['num_user_id'], IMPACT_config)
print(cdm.eval(valid_set))
IMPACT_config['load_params'] = False

In [None]:
# Notebook inspection: display the current contents of the configuration.
IMPACT_config