# Start Here

In [1]:
# Static path configuration for the notebook.

# Base directory per dataset key (not read by any cell visible in this notebook).
paths = dict(
    ASSIST09="data/ASSISTments2009/",
    NEUR20="data/NeurIPS2020/",
)

# Wrangler script per dataset key, written without the ".py" extension
# (the script-existence check appends ".py" itself).
wranglers = dict(
    ASSIST09="data/ASSISTments2009/assistments09_wrangler",
    NEUR20="data/NeurIPS2020/neurIPS2020_wrangler",
)

# Model script per model key, also without extension; the script-existence
# check iterates over this mapping.
models = dict(
    IRT="models/IRT/IRT",
    NCDM="models/NCDM/NCDM",
)

# Data-preparation script, without extension.
prepper_path = "data/dataPrepper"

In [2]:
import os

def checkExists(path):
    """Report whether a required script exists on disk.

    Prints "found <path>" when the file is present. Otherwise prints
    "couldn't find <path>" followed by the nbconvert command that would
    generate the script from the notebook of the same name.

    Args:
        path: Path of the script to look for, e.g. "data/dataPrepper.py".

    Returns:
        True if `path` exists, False otherwise.
    """
    if os.path.exists(path):
        print("found " + path)
        return True

    # Swap whatever extension the path carries for ".ipynb". Slicing off a
    # fixed number of characters only produced a valid notebook name for
    # paths ending in exactly ".py".
    notebook_path = os.path.splitext(path)[0] + ".ipynb"
    print("couldn't find " + path)
    print(f"Try running: !jupyter nbconvert --to script {notebook_path}")
    return False

In [3]:
# Temporary measure while development happens in notebooks:
# make sure every necessary script has been converted from its notebook.
# Order of the checks: the prepper first, then the models, then the wranglers.
required_scripts = [prepper_path]
required_scripts += [models[name] for name in models]
required_scripts += [wranglers[name] for name in wranglers]

# Build a full list (not a lazy generator) so that every script is checked
# and reported even after the first missing one.
script_found = [checkExists(stem + ".py") for stem in required_scripts]
allExists = all(script_found)

if not allExists:
    print("\nMissing critical .py file(s), see above")

found data/dataPrepper.py
found models/IRT/IRT.py
found models/NCDM/NCDM.py
found data/ASSISTments2009/assistments09_wrangler.py
found data/NeurIPS2020/neurIPS2020_wrangler.py


In [4]:
# Uncomment to regenerate data/dataPrepper.py from its notebook:
#!jupyter nbconvert --to script data/dataPrepper.ipynb

In [5]:
# Numbered menus: each key is the string the user types at the prompt,
# each value is the identifier used by the rest of the notebook.
modelList = {
    "1": "IRT",
    "2": "NCDM",
}

datasetList = {
    "1": "ASSIST09",
    "2": "NEUR20",
}

runTypeList = {
    "1": "basic",
    "2": "sampled",
    "3": "correctSaturated",
    "4": "incorrectSaturated",
}


def _promptChoice(title, options):
    """Ask the user to pick one entry of a numbered menu.

    Shows `title` followed by one "<key>.<value>" line per option and keeps
    asking until the (whitespace-stripped) answer is one of the keys, so a
    typo no longer surfaces as a bare KeyError.

    Args:
        title: Heading line shown above the menu.
        options: Mapping from the string the user types to the value returned.

    Returns:
        The value in `options` that corresponds to the user's answer.
    """
    menu = title + "\n" + "\n".join(f"{key}.{label}" for key, label in options.items()) + "\n"
    while True:
        answer = input(menu).strip()
        if answer in options:
            return options[answer]
        print(f"'{answer}' is not a valid choice; enter one of: {', '.join(options)}")


model = _promptChoice("Select a model to test:", modelList)
dataset = _promptChoice("Select a dataset to test on:", datasetList)
runType = _promptChoice("Select a run condition:", runTypeList)

print(f"Performing test run of {model} using {dataset} in {runType} format")

Select a model to test:
1.IRT
2.NCDM
 2
Select a dataset to test on:
1.ASSIST09
2.NEUR20
 1
Select a run condition:
1.basic
2.sampled
3.correctSaturated
4.incorrectSaturated
 1


Performing test run of NCDM using ASSIST09 in basic format


In [6]:
from data import dataPrepper as prep

# Prepare the inputs for the selected dataset / run condition / model.
# `data` is iterated by the evaluation cell further down, where each element
# is indexed with "train", "test" and "valid"; `Q` is handed unchanged to the
# selected model runner.
# NOTE(review): presumably Q is the item-to-knowledge-concept matrix — confirm in dataPrepper.
Q, data = prep.prepareData(dataset = dataset, runType = runType, model = model)

Using existing split from ASSIST09 to train NCDM for basic format


Loading existing data:: 100%|████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.75it/s]


In [7]:
from models.IRT import IRT
from models.NCDM import NCDM
# Dispatch table: model name chosen at the prompt -> runner function.
# The evaluation cell calls the selected runner once per data split and
# unpacks four values from it (acc, auc, mae, rmse).
modelFuncs = dict(
    IRT=IRT.run_IRT,
    NCDM=NCDM.run_NCDM,
)

In [8]:
import numpy as np
import time
from datetime import datetime
from tqdm import tqdm

# Mean over students of each student's mean score, computed per split for the
# train and test partitions (the original note expects a value of ~0.50).
avg_train_score = [split["train"].groupby('user_id')["score"].mean().mean() for split in data]
avg_test_score = [split["test"].groupby('user_id')["score"].mean().mean() for split in data]

# Summary row for this configuration; key insertion order determines the
# column order of the results CSV, so it is kept stable.
result_object = {
    "model" : model,
    "runType" : runType,
    "dataset" : dataset,
    "test_correct_ratio":np.mean(avg_test_score),
    "train_correct_ratio":np.mean(avg_train_score),
}

accs, aucs, maes, rmses, times = [], [], [], [], []

# Run the selected model once per prepared split and collect its metrics.
# NOTE(review): the output saved with this notebook ends in `KeyError: 2784`
# right after the NCDM log line. The raising frame is not visible here, so the
# cause (presumably an id missing from a lookup inside the model runner or the
# prepared data) still has to be traced — TODO confirm.
for run in data:
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start_timer = time.perf_counter()
    acc, auc, mae, rmse = modelFuncs[model](Q, run["train"], run["test"], run["valid"])
    times.append(time.perf_counter() - start_timer)

    accs.append(acc)
    aucs.append(auc)
    maes.append(mae)
    rmses.append(rmse)

# Aggregate across splits: all means first, then all standard deviations,
# which keeps the original column order (ACC, AUC, MAE, RMSE, ACC_std, ...).
metric_runs = {"ACC": accs, "AUC": aucs, "MAE": maes, "RMSE": rmses}
for metric_name, values in metric_runs.items():
    result_object[metric_name] = np.mean(values)
for metric_name, values in metric_runs.items():
    result_object[f"{metric_name}_std"] = np.std(values)

result_object['avg_train_duration'] = np.mean(times)
result_object['performed_at'] = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

NCDM detects 4286 users, 6650 items, and 381 knowledge concepts


KeyError: 2784

In [None]:
import pandas as pd

# Persist this run's summary row to a CSV, replacing any earlier row recorded
# for the same (model, runType, dataset) combination.
results_path = "results.csv"

# Build the one-row frame once so both branches below store it identically.
new_row = pd.DataFrame([result_object])

if os.path.exists(results_path):
    previous = pd.read_csv(results_path)
    # Match on the values stored in result_object itself rather than on the
    # notebook globals: if the selection cell was re-run after the evaluation
    # cell, the globals no longer describe the row being written and the
    # stale row would survive as a duplicate.
    same_config = (
        (previous['model'] == result_object['model'])
        & (previous['runType'] == result_object['runType'])
        & (previous['dataset'] == result_object['dataset'])
    )
    df = pd.concat([previous[~same_config], new_row], ignore_index=True)
else:
    # No results file yet: the new row becomes the whole table.
    df = new_row
df.to_csv(results_path, index=False)

In [None]:
# Show the results table that was just written to the CSV.
df