# GCMC Cornac experiments
### 1. Init
#### 1.1. Import libraries

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import sys
sys.path.append("../../")


import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
from pathlib import Path
sys.path.insert(0, str((Path.cwd() / "../datasets/external_packages").resolve()))

from cornac_util import test
import numpy as np
import random
from datetime import datetime

import json
import torch
import sys

import logging
from cornac.models import GCMC

#### 1.2. Set up the loggers

In [None]:
def setuplogger(verbose: bool = True, log_path: str = "../../experiments/logs/", log_name: str = None):
    """Configure the root logger with a console handler and an optional log file.

    Handlers already attached to the root logger are removed *and closed*
    first, so calling this again (e.g. re-running the notebook cell) neither
    duplicates output nor leaves previously opened log files dangling.

    Args:
        verbose: log at INFO level when True, at ERROR level otherwise. The
            level is applied to the root logger and to every handler.
        log_path: directory receiving the log file. It is created when
            missing. Only used when ``log_name`` is given.
        log_name: base name of the log file; a ``_dd:mm:yy_HH:MM:SS``
            timestamp and ``.log`` are appended. When None, nothing is
            written to disk.
    """
    level = logging.INFO if verbose else logging.ERROR

    root = logging.getLogger()
    root.setLevel(level)

    formatter = logging.Formatter("[%(levelname)s %(asctime)s] %(message)s")
    # Per-instance overrides: render asctime as minutes:seconds, without milliseconds.
    formatter.default_time_format = "%M:%S"
    formatter.default_msec_format = ""

    # Remove existing handlers. Closing them releases any log file a previous
    # call opened; iterate over a copy because the list is mutated.
    for handler in root.handlers[:]:
        root.removeHandler(handler)
        handler.close()

    if log_name is not None:
        if log_path:
            # FileHandler does not create parent directories itself.
            os.makedirs(log_path, exist_ok=True)
        # NOTE(review): ':' in file names is not portable to Windows; the
        # format is kept unchanged so existing log names stay comparable.
        time_str = datetime.now().strftime("_%d:%m:%y_%H:%M:%S")
        # os.path.join works whether or not log_path ends with a separator.
        file_handler = logging.FileHandler(os.path.join(log_path, log_name + time_str + ".log"))
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        root.addHandler(file_handler)

    # Stream handler for console output
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(level)
    stream_handler.setFormatter(formatter)
    root.addHandler(stream_handler)

def set_seed(seed):
    """Seed the NumPy, ``random`` and PyTorch random number generators.

    When CUDA is available the CUDA generators are seeded too, and the cuDNN
    backend flags ``deterministic`` / ``benchmark`` are set to True / False.
    Otherwise a notice is printed and only the CPU generators are seeded.

    Args:
        seed: value passed unchanged to every seeding function.
    """
    # CPU-side generators, seeded in the same order for every call.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        print("CUDA is not available. Skipping CUDA seed setting.")
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [20]:
# Log at INFO level to the console and to a timestamped "GCMC_cornac" file.
setuplogger(log_name="GCMC_cornac", verbose=True)

#### 1.3. Parametrize the datasets

In [21]:
# choose dataset here
dataset_name = 'postcovid'
version = ""  # "_small"
# modify config here

# Run on the GPU whenever one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

config = {

    # General params
    'seed' : 0,

    # Saving params
    'load_params': False,
    'save_params': False,
    'embs_path' : '../embs/'+str(dataset_name),
    'params_path' :'../ckpt/'+str(dataset_name),

    # training mode
    'early_stopping' : True,
    'fast_training' : True, # (Only taken into account if early_stopping is True) If true, doesn't compute valid rmse PC-ER

    # Learning params
    'learning_rate': 0.001,
    'batch_size': 2048,
    'num_epochs': 200,
    'num_dim': 10, # for IRT or MIRT. TODO: is it necessary, given that the number of knowledge concepts is used as the embedding dimension?
    'eval_freq' : 1,
    'patience' : 30,
    'device': device,
    'lambda' : 7.7e-6,
    'tensorboard': False,
    'flush_freq' : True,

    # for NeuralCD
    'prednet_len1': 128,
    'prednet_len2': 64,
    'best_params_path':'',

    #For GCCD
    'num_layers': 0,
    'version': 'pair',
    'p_dropout': 0,
    'low_mem_mode' : True,
    'user_nbrs_n' : 10,
    'item_nbrs_n' : 5
}

# Read the dataset descriptors with context managers so the file handles are
# closed deterministically instead of being left to the garbage collector.
with open(f'../datasets/{dataset_name}/concept_map.json', 'r') as concept_map_file:
    concept_map = json.load(concept_map_file)
# JSON object keys are always strings: convert keys and listed values to int.
concept_map = {int(k):[int(x) for x in v] for k,v in concept_map.items()}
with open(f'../datasets/{dataset_name}/metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

set_seed(config['seed'])
# The version suffix is appended only after the files above were read from
# the un-suffixed dataset directory.
dataset_name += version
logging.info(f'#### {dataset_name} ####')
logging.info(f'#### config : {config} ####')

CUDA is available. Using GPU.
[INFO 54:40] #### postcovid ####
[INFO 54:40] #### config : {'seed': 0, 'load_params': False, 'save_params': False, 'embs_path': '../embs/postcovid', 'params_path': '../ckpt/postcovid', 'early_stopping': True, 'fast_training': True, 'learning_rate': 0.001, 'batch_size': 2048, 'num_epochs': 200, 'num_dim': 10, 'eval_freq': 1, 'patience': 30, 'device': device(type='cuda'), 'lambda': 7.7e-06, 'tensorboard': False, 'flush_freq': True, 'prednet_len1': 128, 'prednet_len2': 64, 'best_params_path': '', 'num_layers': 0, 'version': 'pair', 'p_dropout': 0, 'low_mem_mode': True, 'user_nbrs_n': 10, 'item_nbrs_n': 5} ####


In [25]:
def generate_GCMC(config,metadata = None) :
    """Build a cornac ``GCMC`` model from the experiment configuration.

    Args:
        config: mapping that must provide the keys 'num_epochs',
            'learning_rate', 'activation_func', 'eval_freq', 'patience'
            and 'seed'.
        metadata: mapping providing 'num_dimension_id', which is passed to
            GCMC as ``gcn_out_units``. The ``None`` default exists only for
            signature compatibility; a real mapping is required.

    Returns:
        A trainable, non-verbose ``GCMC`` instance using the Adam optimizer,
        "sum" aggregation and 500 aggregation units.

    Raises:
        TypeError: if ``metadata`` is None.
        KeyError: if a required key is missing from ``config`` or ``metadata``.
    """
    if metadata is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object is not subscriptable" raised further down.
        raise TypeError(
            "generate_GCMC requires `metadata` (a mapping providing "
            "'num_dimension_id'); got None"
        )
    return GCMC(
        max_iter=config['num_epochs'],
        learning_rate=config['learning_rate'],
        gcn_out_units=metadata['num_dimension_id'],
        optimizer='adam',
        gcn_agg_accum="sum",
        activation_func=config['activation_func'],
        gcn_agg_units=500,
        train_valid_interval=config['eval_freq'],
        train_early_stopping_patience=config['patience'],
        trainable=True,
        seed=config['seed'],
        verbose=False
    )

### 2. CDM Training


In [23]:
# Re-seed every RNG before the hyper-parameter search.
seed = 0
set_seed(seed)

# Overrides applied on top of the base config: early-stopping schedule first,
# then the logging / persistence switches.
config.update({
    'seed': seed,
    'early_stopping': True,
    'esc': 'objectives',  # other values noted by the author: 'loss', 'delta_objectives'
    'num_epochs': 200,
    'eval_freq': 1,
    'patience': 30,
    'verbose_early_stopping': False,
    "tensorboard": False,
    'flush_freq': False,
    'save_params': False,
    'disable_tqdm': True,
})

In [None]:
# Names used by this cell that the import cell does not bring into scope;
# importing them here lets the cell run on a fresh kernel.
import gc
from importlib import reload

import cornac_util
import optuna

# Pick up edits made to cornac_util since the kernel started.
reload(cornac_util)

# (dataset name, Optuna wall-clock budget in seconds) for each search.
tuning_runs = (
    ("postcovid", 1800),
    ("promis", 3600),
    ("movielens", 3600),
    ("portrait", 3600),
)

# Plain top-level loop (no helper function) so that dataset_name, eval_method,
# concept_map, metadata and study remain notebook globals, holding the values
# of the last dataset once the cell finishes.
for dataset_name, timeout_s in tuning_runs:
    eval_method,concept_map,metadata = cornac_util.load_dataset(dataset_name,config, 0)

    study = optuna.create_study(
        directions=["minimize"],  # Specify directions for each objective
    )
    # Release memory left over from the previous search before starting.
    gc.collect()
    torch.cuda.empty_cache()
    study.optimize(lambda trial: cornac_util.objective_GCMC(trial, config, eval_method,generate_GCMC), n_trials=100, timeout=timeout_s, n_jobs=4, gc_after_trial=True)

    # Analyze the results
    ## requirements : plotly, nbformat
    if dataset_name == "postcovid":
        # Only the postcovid study's best trials are kept under this name.
        pareto_trials = study.best_trials
    logging.info(f"Best trial for {dataset_name} : {study.best_trials}")

    logging.info("Number of trials :"+str(len(study.trials)))
    for trial in study.trials:
        logging.info(f"Trial #{trial.number}")
        logging.info(f"  RMSE: {trial.values}")
        #logging.info(f"  DOA: {trial.values[1]}")
        logging.info(f"  Params: {trial.params}")


### 3. CDM Prediction
#### 3.1. Training and testing

In [13]:
def find_emb(algo):
    """Return the encoder's first output for the training graph as a NumPy array.

    The encoder at ``algo.model.net.encoder`` is called on
    ``algo.model.train_enc_graph`` and must return exactly two values; the
    second one is discarded.

    Args:
        algo: trained model wrapper exposing ``model.train_enc_graph`` and
            ``model.net.encoder``.

    Returns:
        The first encoder output, detached from the autograd graph, moved to
        the CPU and converted with ``.numpy()``.
    """
    # presumably (user embeddings, item embeddings) — TODO confirm against cornac's GCMC
    train_graph = algo.model.train_enc_graph
    first_out, _second_out = algo.model.net.encoder(train_graph)
    on_cpu = first_out.detach().cpu()
    return on_cpu.numpy()

In [None]:

# Train and evaluate GCMC on postcovid with the learning rate and activation
# chosen for it (presumably from the search in section 2).
dataset_name = "postcovid"
logging.info(dataset_name)
config.update({
    'learning_rate': 0.01415,
    'activation_func': 'tanh',
})
metrics = test(dataset_name,config,generate_GCMC,find_emb)

# Settings for the other datasets, kept disabled:
# dataset_name = "promis"
# logging.info(dataset_name)
# config['learning_rate'] = 0.02032
# config['activation_func'] = 'relu'
# metrics = cornac_util.test(dataset_name,config,generate_GCMC,find_emb)
#
# dataset_name = "movielens"
# logging.info(dataset_name)
# config['learning_rate'] = 0.01892
# config['activation_func'] = 'tanh'
# metrics = cornac_util.test(dataset_name,config,generate_GCMC,find_emb)
#
# dataset_name = "portrait"
# logging.info(dataset_name)
# config['learning_rate'] = 0.01541
# config['activation_func'] = 'tanh'
# metrics = cornac_util.test(dataset_name,config,generate_GCMC,find_emb)