# IMPACT paper experiments
### 1. Init
#### 1.1. Import libraries (necessary)

In [None]:
%load_ext autoreload
%autoreload 2

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from liriscat import utils
utils.set_seed(0)
from liriscat import dataset
from liriscat import selectionStrategy
from liriscat import CDM

import logging
import gc
import json
import torch
import pandas as pd
from importlib import reload
import IMPACT

#### 1.2. Set up the loggers (recommended)

In [None]:
# Configure logging through the project helper for the "liriscat" log name,
# with verbose output requested.
utils.setuplogger(log_name="liriscat", verbose=True)

### 2. CDM prediction
#### 2.1. Training and testing, sequential version

In [None]:
import warnings
import numpy as np

# Collect unreachable Python objects first, then hand PyTorch's cached
# (unused) CUDA memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

# Re-import the project modules so that edits made to their source files
# since the last import take effect. The order matches the original cell.
for _module in (utils, selectionStrategy, CDM, dataset):
    reload(_module)

In [None]:
# Build the evaluation configuration through the project helper and seed
# the random number generators with the seed stored in that configuration.
config = utils.generate_eval_config(
    esc='error',
    valid_metric='mi_acc',
    pred_metrics=["mi_acc"],
    profile_metrics=['doa'],
    save_params=False,
    n_query=4,
    num_epochs=4,
    batch_size=512,
)
utils.set_seed(config["seed"])

# Select the dataset and log its name.
config["dataset_name"] = "math2"
logging.info(config["dataset_name"])

# Run-specific overrides written on top of the generated configuration.
for _key, _value in {
    'learning_rate': 0.02026,
    'lambda': 1.2e-5,
    'd_in': 4,
    'num_responses': 12,
}.items():
    config[_key] = _value
#pred_metrics,df_interp = test(config)

In [None]:
# Log the dataset name and the configuration as it stands *before* the
# embedding/checkpoint paths are added below.
logging.info(f'#### {config["dataset_name"]} ####')
logging.info(f'#### config : {config} ####')
config['embs_path']='../embs/'+str(config["dataset_name"])
config['params_path']='../ckpt/'+str(config["dataset_name"])

# One empty result list per configured prediction / profile metric.
pred_metrics = {m:[] for m in config['pred_metrics']}
profile_metrics = {m:[] for m in config['profile_metrics']}

gc.collect()
torch.cuda.empty_cache()

# Silence the "invalid value encountered in divide" message and, more
# broadly, every RuntimeWarning for the rest of the session.
warnings.filterwarnings("ignore", message="invalid value encountered in divide")
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Concept map format : {question_id : [category_id1, category_id2, ...]}
# Read inside a `with` block so the file handle is closed deterministically;
# the encoding is explicit so the result does not depend on the platform default.
with open(f'../datasets/2-preprocessed_data/{config["dataset_name"]}_concept_map.json', 'r', encoding='utf-8') as concept_map_file:
    concept_map = json.load(concept_map_file)
# JSON object keys are always strings: convert keys and values back to int.
concept_map = {int(k): [int(x) for x in v] for k, v in concept_map.items()}

## Metadata map format : {"num_user_id": ..., "num_item_id": ..., "num_dimension_id": ...}
with open(f'../datasets/2-preprocessed_data/{config["dataset_name"]}_metadata.json', 'r', encoding='utf-8') as metadata_file:
    metadata = json.load(metadata_file)

In [None]:
# Index of the cross-validation fold whose CSV files are loaded.
i_fold = 0

# Column dtypes enforced while parsing; the same mapping is used for the
# train, validation and test files.
_split_dtypes = {
    'student_id': int,
    'item_id': int,
    "correct": float,
    "dimension_id": int,
}

# Load the three splits of the selected fold, in train / valid / test order.
train_df, valid_df, test_df = (
    pd.read_csv(
        f'../datasets/2-preprocessed_data/{config["dataset_name"]}_{_split}_{i_fold}.csv',
        encoding='utf-8',
        dtype=_split_dtypes,
    )
    for _split in ('train', 'valid', 'test')
)

In [None]:
# Pick up any edits to the dataset module before instantiating its classes.
reload(dataset)

# Positional arguments shared by the three dataset constructors.
_shared_args = (concept_map, metadata, config)

# The training split uses CATDataset; validation and test use EvalDataset.
train_data = dataset.CATDataset(train_df, *_shared_args)
valid_data = dataset.EvalDataset(valid_df, *_shared_args)
test_data = dataset.EvalDataset(test_df, *_shared_args)

In [None]:
# Refresh the project modules (same order as the original cell) so the
# strategy below is built from the current source.
for _module in (utils, selectionStrategy, dataset, CDM):
    reload(_module)

# Random selection strategy, constructed from the configuration entries
# passed as keyword arguments.
S = selectionStrategy.Random(**config)


In [None]:

# Run the strategy's training routine with the training and validation datasets.
S.train(train_data, valid_data)

In [None]:
# Iterate the dataset one sample at a time, in its stored order.
# NOTE(review): `impact_valid_data` is not defined in any cell visible in this
# notebook — confirm it exists in the session (or whether `valid_data` was
# intended) before running this cell.
data_loader = torch.utils.data.DataLoader(impact_valid_data, batch_size=1, shuffle=False)

In [None]:
# Per-(user, concept) accumulators, allocated directly on the strategy's device:
#   U_resp_sum -- sum of the response labels seen for each pair,
#   U_resp_nb  -- number of responses seen for each pair.
_grid_shape = (S.CDM.model.user_n, S.CDM.model.concept_n)
U_resp_sum = torch.zeros(size=_grid_shape, device=S.device)
U_resp_nb = torch.zeros(size=_grid_shape, device=S.device)

S.CDM.model.eval()
with torch.no_grad(), torch.amp.autocast('cuda'):
    # NOTE(review): `impact_valid_data` is not defined in any cell visible in
    # this notebook — confirm it exists in the session before running.
    data_loader = torch.utils.data.DataLoader(impact_valid_data, batch_size=1, shuffle=False)
    for data_batch in data_loader:
        # Move the batch next to the accumulators so the indexed adds below
        # never mix tensors from different devices.
        data_batch = data_batch.to(S.device)

        # Columns read here: 0 = user id, 1 = item id, 2 = label, 3 = dimension id.
        user_ids = data_batch[:, 0].long()
        item_ids = data_batch[:, 1].long()
        # Cast to the accumulator dtype; index_put_ requires matching dtypes.
        labels = data_batch[:, 2].to(U_resp_sum.dtype)
        dim_ids = data_batch[:, 3].long()

        # accumulate=True adds every occurrence of a repeated (user, dim)
        # pair, so the totals stay correct even if batch_size is raised.
        U_resp_sum.index_put_((user_ids, dim_ids), labels, accumulate=True)
        U_resp_nb.index_put_((user_ids, dim_ids), torch.ones_like(labels), accumulate=True)

In [None]:
# Re-initialise the strategy's CDM, move its model to the strategy's device,
# then train it.
# NOTE(review): `impact_train_data` / `impact_valid_data` are not defined in any
# cell visible in this notebook, and the model is initialised with them but
# trained with `train_data` / `valid_data` — confirm this is intended.
S.CDM.init_model(impact_train_data, impact_valid_data)
S.CDM.model.to(S.device, non_blocking=True)
S.CDM.train(train_data, valid_data)


In [None]:
# Prepare the test set with split_query_meta(2), then evaluate the strategy on it.
# NOTE(review): the meaning of the argument `2` is defined by
# dataset.EvalDataset.split_query_meta — presumably a seed or split index; confirm.
test_data.split_query_meta(2)
S.evaluate_test(test_data)

In [None]:
# Hard-coded metric values plotted in the next cell, one entry per x-axis
# position (presumably recorded from an earlier run — TODO confirm).
saved_mi_acc = [
    0.643,
    0.6379999999999999,
    0.645,
    0.661,
    0.6679999999999999,
    0.675,
    0.6890000000000001,
    0.7010000000000001,
]

saved_doa = [
    0.506,
    0.564,
    0.576,
    0.588,
    0.604,
    0.612,
    0.685,
    0.702,
]

In [None]:
import matplotlib.pyplot as plt

# One figure with two y-axes sharing the same x-axis: accuracy on the left
# axis (blue), DOA on the right axis (red).
fig, ax1 = plt.subplots()

# Left axis: micro-averaged accuracy.
color1 = 'b'
ax1.set_xlabel('Number of submitted questions')
ax1.set_ylabel('Micro averaged accuracy', color=color1)
ax1.tick_params(axis='y', labelcolor=color1)
ln1 = ax1.plot(saved_mi_acc, color1, label='mi_acc')

# Right axis: DOA, drawn on a twin of the left axis.
ax2 = ax1.twinx()
color2 = 'r'
ax2.set_ylabel('DOA', color=color2)
ax2.tick_params(axis='y', labelcolor=color2)
ln2 = ax2.plot(saved_doa, color2, label='doa')

# A single legend listing the curves of both axes.
lns = [*ln1, *ln2]
labels = [line.get_label() for line in lns]
ax1.legend(lns, labels, loc='best')

plt.title('ACC et DOA of IMPACT on the Meta test set\nover the number of Randomly submitted questions')
plt.show()
