# Math2 dataset preprocessing
### Import

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import pandas as pd
import numpy as np
from micat.dataset.preprocessing_utilities import *
from micat.utils import utils
import torch
utils.set_seed(0)

### Load and merge dataset

In [None]:
# Both raw Eedi-2 CSV exports are parsed with the same reader options.
csv_read_options = {'encoding': "ISO-8859-15", 'low_memory': False}
raw_data_train = pd.read_csv('../1-raw_data/Eedi-2_train.csv', **csv_read_options)
raw_data_test = pd.read_csv('../1-raw_data/Eedi-2_test.csv', **csv_read_options)

# Stack train and test into one frame with a fresh 0..n-1 index, then discard
# the columns that are not referenced anywhere later in this notebook.
unused_columns = ['CorrectAnswer', 'AnswerValue', 'IsTarget']
raw_data = (
    pd.concat([raw_data_train, raw_data_test], ignore_index=True)
    .drop(columns=unused_columns)
)

raw_data.head()

### Split rows with several dimensions

In [None]:
# Not needed for this dataset: per the original author's note, no column holds
# lists of values, so there is nothing for explode() to split.
# NOTE(review): df_expanded is not read again in the visible notebook.
df_expanded = raw_data.explode('AnswerId').reset_index(drop=True)
df_expanded.head()

### Early stats

In [None]:
print(f"Total number of data : {len(raw_data)}")

# Per the original author's note this step is unnecessary on this dataset:
# there are no rows to drop. The row count is printed for comparison.
required_columns = ['QuestionId', 'UserId', 'AnswerId', 'IsCorrect']
all_data = raw_data.dropna(subset=required_columns)
print(len(all_data))

In [None]:
# Uniqueness statistics on the unfiltered data, requested for: no column
# selection (None), the (QuestionId, UserId) pair, and each id column alone.
# NOTE(review): what stat_unique reports for None is decided by the helper — confirm there.
stat_unique(all_data, None)
stat_unique(all_data, ['QuestionId','UserId'])
stat_unique(all_data, 'UserId')
stat_unique(all_data, 'QuestionId')
stat_unique(all_data, 'AnswerId')

### Clean NaN values

In [None]:


# Drop every row that lacks a question id, user id, answer id or correctness value.
# NOTE(review): all_data was already produced by the same dropna(subset=...) call,
# so this second pass looks redundant — confirm before removing it.
cleaned_data = all_data.dropna(subset=['QuestionId','UserId','AnswerId','IsCorrect'], axis='index')

# Debug prints left disabled by the author (row counts before/after cleaning).
#print(len(raw_data))
#print(len(cleaned_data))

### Remove duplicated (question, user, answer) rows

In [None]:
# Deduplicate on the (QuestionId, UserId, AnswerId) key; no extra attribute is
# aggregated (agg_attrs is empty).
# NOTE(review): how colliding rows are resolved is decided inside remove_duplicates — confirm there.
unduplicated_data = remove_duplicates(cleaned_data, key_attrs=['QuestionId','UserId','AnswerId'], agg_attrs=[])

### Filter Data

In [None]:
# Repeatedly prune the interaction table until one full pass (items, then
# users, then dimensions) reports nothing removed. Each densify call returns
# the filtered frame plus a count that is compared with zero to detect that.
min_nb_users_logs = 20  # threshold handed to the user-level densify call; also stored in the metadata later
filtered_data_0 = unduplicated_data
while True:
    # filter items
    filtered_data_1, filtered_q_nb = densify(filtered_data_0, 'QuestionId', 'UserId', 5)
    # filter users
    filtered_data_2, filtered_u_nb = densify(filtered_data_1, 'UserId', 'QuestionId', min_nb_users_logs)
    # filter knowledge dimensions
    filtered_data_3, filtered_d_nb = densify(filtered_data_2, 'AnswerId', 'QuestionId', 1)
    filtered_data_0 = filtered_data_3
    # Stop as soon as a whole pass removed nothing at any of the three levels.
    if filtered_q_nb == 0 and filtered_u_nb == 0 and filtered_d_nb == 0:
        break

stat_unique(filtered_data_0, None)
stat_unique(filtered_data_0, ['UserId', 'QuestionId'])
stat_unique(filtered_data_0, 'UserId')
stat_unique(filtered_data_0, 'QuestionId')
stat_unique(filtered_data_0, 'AnswerId')

### Encoding

In [None]:
# The filtered frame still carries the raw Eedi column names (UserId,
# QuestionId, AnswerId, IsCorrect), whereas the encoding, rescaling and
# metadata steps address the columns as user_id / item_id / dimension_id /
# correct. Rename them first; DataFrame.rename ignores labels that are absent,
# so this is a no-op if the columns already use the new names.
# NOTE(review): AnswerId is mapped to dimension_id because the filtering step
# treats it as the knowledge dimension — confirm this mapping.
renamed_data = filtered_data_0.rename(columns={
    'UserId': 'user_id',
    'QuestionId': 'item_id',
    'AnswerId': 'dimension_id',
    'IsCorrect': 'correct',
})

# renumber the users
u_enc_data, u2n = encode_attr(renamed_data, "user_id")

In [None]:
# renumber the items: encode the "item_id" column of the user-encoded frame.
# q2n is presumably the original-id -> new-id mapping returned by encode_attr — confirm.
q_enc_data, q2n = encode_attr(u_enc_data, "item_id")

In [None]:
# renumber the dimensions: encode the "dimension_id" column of the item-encoded frame.
# d2n is presumably the original-id -> new-id mapping returned by encode_attr — confirm.
d_enc_data, d2n = encode_attr(q_enc_data, "dimension_id")

### Maps creation

In [None]:
# Build the concept maps from the fully encoded data. q2k is later dumped to
# JSON as the concept map (item -> dimensions); k2q is presumably the inverse
# mapping — confirm in create_q2k.
q2k, k2q = create_q2k(d_enc_data)

### Rescaling responses

In [None]:
# Work on a copy so the rescaling does not write back into d_enc_data
# (a plain assignment would only alias the same frame).
resc_data = d_enc_data.copy()
resc_data['correct'] = resc_data['correct'].astype(float)
min_val = resc_data['correct'].min()
max_val = resc_data['correct'].max()
if max_val == min_val:
    # A constant column would divide by zero below and silently fill the
    # responses with NaN; fail loudly instead.
    raise ValueError(
        f"Cannot rescale 'correct': every response has the same value ({min_val})."
    )
# Min-max scale to [0, 1], then shift by +1 so responses end up in [1, 2].
# NOTE(review): the shift presumably keeps 0 free to mean "no response" — confirm.
resc_data['correct'] = (resc_data['correct'] - min_val) / (max_val - min_val) + 1
print(f'min value : {resc_data["correct"].min()}, max value : {resc_data["correct"].max()}')

### Get final stats and create metadata

In [None]:
# Uniqueness statistics on the final (encoded and rescaled) data.
for stat_columns in (None, ['user_id', 'item_id'], 'user_id', 'item_id', 'dimension_id', 'correct'):
    stat_unique(resc_data, stat_columns)

# For each (grouping column, counted column) pair, print min & rounded mean &
# max of the number of distinct counted values per group. The templates are
# kept verbatim because the output is formatted as LaTeX-style table cells.
distinct_count_summaries = (
    ('#questions/category: {} & {} &  {}', 'dimension_id', 'item_id'),
    ('#users/category: {} & {} &  {}', 'dimension_id', 'user_id'),
    ('#categorys/question: {} & {} &  {}', 'item_id', 'dimension_id'),
    ('#users/question: {} & {} &  {}', 'item_id', 'user_id'),
    ('#questions/users: {} & {} &  {}', 'user_id', 'item_id'),
    ('#categorys/users: {} & {} &  {}', 'user_id', 'dimension_id'),
)
for template, group_column, counted_column in distinct_count_summaries:
    distinct_per_group = resc_data.groupby(group_column)[counted_column].nunique()
    print(template.format(
        np.min(distinct_per_group),
        int(np.round(np.mean(distinct_per_group))),
        np.max(distinct_per_group),
    ))

In [None]:
import matplotlib.pyplot as plt

# Histogram of the responses; the 1 is subtracted to undo the +1 offset that
# the rescaling step added, so the x axis shows the scaled values themselves.
responses_ax = (resc_data['correct'] - 1).hist(bins=20)

# Label the axes, title the plot and write the figure to disk.
responses_ax.set_xlabel('Responses')
responses_ax.set_ylabel('Number')
responses_ax.set_title('Distribution of Responses')
responses_ax.figure.savefig('../4-figs/distribution_math2.png')


In [None]:
# Per item: number of distinct dimensions and number of distinct users.
per_item = resc_data.groupby("item_id")
resc_data_dim_grouped_items = per_item["dimension_id"].nunique()
resc_data_dim_grouped_users = per_item["user_id"].nunique()

# Base metadata comes from the helper; it is then extended with the user
# filtering threshold and the largest number of dimensions attached to one item.
metadata = get_metadata(resc_data, ["user_id", "item_id", "dimension_id"])
metadata.update({
    'min_nb_users_logs': min_nb_users_logs,
    'max_nb_categories_per_question': int(np.max(resc_data_dim_grouped_items)),
})

### Compute nb of modalities per item

In [None]:
# Number of response modalities per item, computed by the project helper from
# the rescaled data and the metadata. The result is persisted with torch.save
# further down — NOTE(review): so it is presumably a tensor; confirm in get_modalities_nb.
nb_modalities = get_modalities_nb(resc_data, metadata)

### Save data, concept map, metadata and nb_modalities

In [None]:
# Keep only these four columns, in this order, for the exported file.
new_order = ['user_id','item_id', 'correct', 'dimension_id']
resc_data = resc_data[new_order]

In [None]:
# save selected data (without the index column). The file is written under
# ../1-raw_data, and the "parse data" section reads it back from this same path.
resc_data.to_csv('../1-raw_data/math2_selected_data.csv', index=False)

In [None]:
# Make the concept map JSON-friendly: int64 keys become strings...
q2k_str_keys = {str(item): dims for item, dims in q2k.items()}

# ...and set values become lists (anything else is passed through untouched).
q2k_json_serializable = {
    item: list(dims) if isinstance(dims, set) else dims
    for item, dims in q2k_str_keys.items()
}

# Save concept map
with open('../2-preprocessed_data/math2_concept_map.json', 'w') as f:
    json.dump(q2k_json_serializable, f)

In [None]:
# save metadata as JSON next to the other preprocessed artefacts
with open('../2-preprocessed_data/math2_metadata.json', 'w') as f:
    json.dump(metadata, f)

In [None]:
# save nb_modalities; it is serialized with torch.save, so despite the .pkl
# extension it should be read back with torch.load
torch.save(nb_modalities,'../2-preprocessed_data/math2_nb_modalities.pkl')

## Parse data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from micat import utils
# Configure verbose logging (log directory ../../logs/, log name "micat") and
# reset the seed to 0 before splitting the data.
utils.setuplogger(verbose = True,log_path = "../../logs/", log_name="micat")
utils.set_seed(0)

In [None]:
import pandas as pd
from micat.dataset.preprocessing_utilities import *
# Reload the selected data written by the preprocessing section, so this part
# of the notebook can be run without redoing the preprocessing.
resc_data = pd.read_csv('../1-raw_data/math2_selected_data.csv')

In [None]:
# Split the data into train/valid/test collections that are indexed by fold below.
# NOTE(review): the second argument (5) is presumably the number of folds — confirm in split_users.
train, valid, test = split_users(resc_data, 5)

## Save data

In [None]:
# Write every fold of each split to its own CSV file
# (math2_<split>_<fold>.csv under ../2-preprocessed_data).
fold_containers = (('train', train), ('valid', valid), ('test', test))
for i_fold, _ in enumerate(train):
    for split_name, folds in fold_containers:
        save_df_to_csv(folds[i_fold], f'../2-preprocessed_data/math2_{split_name}_{i_fold}.csv')

## Train IMPACT model

In [None]:
from micat.utils import generate_eval_config
import json
from IMPACT.dataset import LoaderDataset as IMPACT_dataset
from micat.CDM import *
from IMPACT import model
from micat.dataset.preprocessing_utilities import *


folds_nb = 1
dataset_name = "math2"

# Set all the required parameters ---------------
IMPACT_config = generate_eval_config(
    num_epochs=200,
    save_params=True,
    dataset_name=dataset_name,
    embs_path="../../embs/" + dataset_name,
    params_path="../../ckpt/" + dataset_name,
    learning_rate=0.01885,
    lambda_=2e-7,
    batch_size=2048,
    valid_metric='mi_acc',
    pred_metrics=["mi_acc"],
)

# Concept map: JSON stores keys as strings, so convert keys and values back to int.
# The files are opened in a `with` block so the handles are closed deterministically.
with open(f'../2-preprocessed_data/{IMPACT_config["dataset_name"]}_concept_map.json', 'r') as concept_map_file:
    concept_map = json.load(concept_map_file)
concept_map = {int(k): [int(x) for x in v] for k, v in concept_map.items()}

with open(f'../2-preprocessed_data/{IMPACT_config["dataset_name"]}_metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)


# Conversion helper: labels the incoming rows with four column names and
# returns them as a NumPy record array with fixed field dtypes.
def convert_to_records(data):
    """Turn rows of interaction data into a typed NumPy record array.

    Args:
        data: Anything ``pd.DataFrame`` accepts as its data argument, e.g. a
            sequence of 4-item rows.

    Returns:
        A record array without an index field, whose fields are user_id,
        item_id, category_id (cast to int) and correct (cast to float).

    NOTE(review): the fields are labelled in the order (user_id, item_id,
    category_id, correct), while the selected-data CSV is written in the
    order (user_id, item_id, correct, dimension_id) — confirm this matches
    what quadruplet_format / split_data_vertically actually emit.
    """
    field_order = ['user_id', 'item_id', 'category_id', 'correct']
    field_dtypes = {'user_id': int, 'item_id': int, 'correct': float, 'category_id': int}
    frame = pd.DataFrame(data, columns=field_order)
    return frame.to_records(index=False, column_dtypes=field_dtypes)

In [None]:
for i in range(folds_nb):
    # Pool the train and validation parts of fold i into a single frame.
    train_valid_df = pd.concat([train[i], valid[i]])

    # Re-split the pooled data with split_data_vertically into
    # train / valid / test parts.
    quadruplet = quadruplet_format(train_valid_df)
    train_data, valid_data, test_data = split_data_vertically(quadruplet, test_prop=0.2,valid_prop=0.1,folds_nb=5)

    # Only the entry at index 0 of each returned split is used; convert each
    # one to a record array, then wrap it in an IMPACT dataset.
    horizontal_train, horizontal_valid, horizontal_test = (
        convert_to_records(split[0]) for split in (train_data, valid_data, test_data)
    )
    impact_train_data, impact_valid_data, impact_test_data = (
        IMPACT_dataset(records, concept_map, metadata)
        for records in (horizontal_train, horizontal_valid, horizontal_test)
    )

    # Train on the train/valid datasets and print the test-set metrics.
    IMPACT_config['i_fold'] = i
    algo = model.IMPACT(**IMPACT_config)
    algo.init_model(impact_train_data, impact_valid_data)
    algo.train(impact_train_data, impact_valid_data)
    print(algo.evaluate_predictions(impact_test_data))

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from micat import utils
utils.set_seed(0)
from micat import dataset
from micat import selectionStrategy
from micat import CDM

import logging
import gc
import json
import torch
import pandas as pd
from importlib import reload
import IMPACT

import warnings
import numpy as np

# Free Python garbage and cached CUDA memory left over from the previous cells.
gc.collect()
torch.cuda.empty_cache()

# Re-import the project modules so that source edits are picked up without
# restarting the kernel.
reload(utils)
reload(selectionStrategy)
reload(CDM)
reload(dataset)

config = utils.generate_eval_config(i_fold = 0,
                                    num_epochs=1,
                                    load_params=True,
                                    inner_user_lr=0.0001,
                                    esc = 'error',
                                    valid_metric= 'mi_acc',
                                    pred_metrics = ["mi_acc"],
                                    profile_metrics = ['doa'],
                                    save_params=False,
                                    n_query=5,
                                    batch_size=512)
utils.set_seed(config["seed"])

# Dataset-specific overrides applied on top of the generated configuration.
config["dataset_name"] = "math2"
logging.info(config["dataset_name"])
config['learning_rate'] = 0.02026
config['lambda'] = 1.2e-5
config['d_in'] = 4
config['num_responses'] = 12
#pred_metrics,df_interp = test(config)

logging.info(f'#### {config["dataset_name"]} ####')
logging.info(f'#### config : {config} ####')
config['embs_path']='../../embs/'+str(config["dataset_name"])
config['params_path']='../../ckpt/'+str(config["dataset_name"])

# One result list per requested metric.
pred_metrics = {m:[] for m in config['pred_metrics']}
profile_metrics = {m:[] for m in config['profile_metrics']}

gc.collect()
torch.cuda.empty_cache()

# Silence the "invalid value encountered in divide" message and, more broadly,
# every RuntimeWarning.
warnings.filterwarnings("ignore", message="invalid value encountered in divide")
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Concept map format : {item_id : [dimension_id1, dimension_id2, ...]}
# JSON stores keys as strings, so keys and values are converted back to int.
# Files are opened in `with` blocks so the handles are closed deterministically.
with open(f'../2-preprocessed_data/{config["dataset_name"]}_concept_map.json', 'r') as concept_map_file:
    concept_map = json.load(concept_map_file)
concept_map = {int(k): [int(x) for x in v] for k, v in concept_map.items()}

## Metadata map format : {"num_user_id": ..., "num_item_id": ..., "num_dimension_id": ...}
with open(f'../2-preprocessed_data/{config["dataset_name"]}_metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

i_fold = 0
## Dataframe columns : (user_id, item_id, correct, dimension_id)
# The user column is named 'user_id' in the CSV files this notebook writes;
# the previous 'student_id' dtype key matched no column and was therefore
# never applied.
# NOTE(review): assumes split_users / save_df_to_csv keep the column names — confirm.
fold_dtypes = {'user_id': int, 'item_id': int, "correct": float, "dimension_id": int}
train_df, valid_df, test_df = (
    pd.read_csv(
        f'../2-preprocessed_data/{config["dataset_name"]}_{split_name}_{i_fold}.csv',
        encoding='utf-8', dtype=fold_dtypes)
    for split_name in ('train', 'valid', 'test')
)

train_data = dataset.CATDataset(train_df, concept_map, metadata, config)
valid_data = dataset.EvalDataset(valid_df, concept_map, metadata, config)
test_data = dataset.EvalDataset(test_df, concept_map, metadata, config)

# Random selection strategy.
S = selectionStrategy.Random(metadata,**config)

In [None]:
# Initialise the strategy's models from the training and validation datasets.
S.init_models(train_data, valid_data)

In [None]:
# Evaluate the strategy on the held-out test dataset (the result is displayed by the notebook).
S.evaluate_test(test_data)

In [None]:
# Count the non-zero entries of the model's R tensor.
# NOTE(review): R is presumably the response matrix, with 0 meaning "no response" — confirm.
torch.sum(S.CDM.model.R!=0)