In [1]:
import pickle
from pprint import pprint

import arviz as az
import nest_asyncio
import numpy as np
import pandas as pd
import seaborn as sns
import stan
from matplotlib import pyplot as plt
from numpy import percentile, concatenate, array, linspace, append
from numpy.random import normal, randint, binomial, choice
from scipy.special import expit
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.preprocessing import LabelEncoder

# Allow nested event loops so pystan can run inside the notebook kernel.
nest_asyncio.apply()

In [2]:
# Load the interaction log (one row per answered question).
neededData = pd.read_csv('./final_data/neededData.csv')
print(neededData.shape)
# The file carries an extra 'Unnamed: 0' column (see the header in the output).
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
neededData = neededData.drop(columns='Unnamed: 0')
neededData.head()

(809694, 5)


Unnamed: 0,user_id,item_id,correct,nb_hint
0,72,563,0,0
1,249,563,1,0
2,251,563,1,0
3,214,563,1,0
4,155,563,0,0


In [6]:
# Keep only the three columns the IRT models read: who answered, which item,
# and whether the answer was correct.
needed = neededData.loc[:, ["user_id", "item_id", "correct"]]

In [7]:
# The Stan programs index `ability` / `difficulty` by position and declare
# item in 1..I and subject in 1..S, so the raw ids cannot be passed through
# unless they already form exactly that range.
# factorize(sort=True) maps the sorted unique ids onto 0..K-1; adding 1 gives
# contiguous 1-based indices. Ids that are already 1..K map onto themselves.
item_codes, item_ids = pd.factorize(needed['item_id'], sort=True)
subject_codes, subject_ids = pd.factorize(needed['user_id'], sort=True)

train_data = {'I': len(item_ids),          # number of distinct items
              'S': len(subject_ids),       # number of distinct subjects
              'N': len(needed),            # number of observations
              'item': item_codes + 1,
              'subject': subject_codes + 1,
              'grade': needed['correct'].to_numpy(),}
pprint(train_data)

{'I': 1084,
 'N': 809694,
 'S': 574,
 'grade': array([0, 1, 1, ..., 1, 1, 1]),
 'item': array([563, 563, 563, ..., 482, 482, 482]),
 'subject': array([ 72, 249, 251, ..., 395, 395, 395])}


# 1PL Model

In [8]:
# One-parameter logistic IRT model as a Stan program:
#   logit P(grade[n] = 1) = ability[subject[n]] - difficulty[item[n]] + delta
# Parameters are declared as vectors and the likelihood is written in
# vectorised form, matching the 2PL / 3PL programs further down.
_1pl_model = """
data {
  // numbers of things

  int<lower=1> N;  // number of observations
  int<lower=1> I;  // items,  number of questions
  int<lower=1> S;  // subjects,  number of users

  // data

  int<lower=1,upper=I> item[N];
  int<lower=1,upper=S> subject[N];
  int<lower=0,upper=1> grade[N];
}
parameters {
  // parameters

  vector[S] ability;             //  alpha: ability of student
  vector[I] difficulty;          //  beta: difficulty of question
  real delta;                    //  intercept added to every observation's logit

}
model {

  ability ~ normal(0,1);
  difficulty ~ normal(0,1);
  delta ~ normal(0.75,1);

  // one vectorised statement over all N observations
  grade ~ bernoulli_logit(ability[subject] - difficulty[item] + delta);

}
"""

## Fit or load saved model

### Load saved model

In [None]:
# with open("./data/_1pl_model_chains-2_numsamples-10000_numwarmup-1000_num_thin-1.pkl", "rb") as f: 
#     data_dict = pickle.load(f)
# posteriori = data_dict['model']
# binary_fit = data_dict['fit']

### Fit Model without cross validation

In [10]:
# Compile the 1PL program and bind it to the training data (fixed seed).
posteriori = stan.build(
    program_code=_1pl_model,
    data=train_data,
    random_seed=2021,
)

Building...



Building: 33.0s, done.

In [None]:
# Sample the 1PL posterior: 4 chains, 2000 warmup iterations and 1000 kept
# draws per chain, no thinning.
binary_fit = posteriori.sample(
    num_chains=4,
    num_warmup=2000,
    num_samples=1000,
    num_thin=1,
)

### Fit Model with cross validation

#### Cross-validation implemented inside the Stan model

In [None]:
# 1PL model with a hold-out split done inside Stan: the observations are
# shuffled in `transformed data`, the first N - N_test are used for fitting and
# the remaining N_test are predicted in `generated quantities`.
_1pl_cross_val_model = """
functions {
    // Returns a shuffled copy of 1..N, built from N random swaps.
    int[] permutation_rng(int N) {
        int y[N];
        vector[N] theta = rep_vector(1.0 / N, N);
        for (n in 1:N)
            y[n] = n;
        for (n in 1:N){
            int i = categorical_rng(theta);
            int temp = y[n];
            y[n] = y[i];
            y[i] = temp;
        }
        return y;
    }
}

data {
  // numbers of things

  int<lower=1> N;  // number of observations
  int<lower = 0, upper = N> N_test;  // number of held-out observations
  int<lower=1> I;  // items,  number of questions
  int<lower=1> S;  // subjects,  number of users

  // data

  int<lower=1,upper=I> item[N];
  int<lower=1,upper=S> subject[N];
  int<lower=0,upper=1> grade[N];
}
transformed data {
  int N_train = N - N_test;
  int permutation[N] = permutation_rng(N);
  // train
  int item_train[N_train] = item[permutation[1 : N_train]];
  int subject_train[N_train]  = subject[permutation[1 : N_train]];
  int grade_train[N_train]  = grade[permutation[1 : N_train]];

  // test
  int item_test[N_test] = item[permutation[N_train + 1 : N]];
  int subject_test[N_test] = subject[permutation[N_train + 1 : N]];
  int grade_test[N_test] = grade[permutation[N_train + 1 : N]];
}
parameters {
  // parameters
  // Sized by the number of subjects / items. The held-out rows index these
  // arrays with ids in 1..S and 1..I, so sizing them by the number of
  // training observations (as size(subject_train) did) was wrong.
  real ability[S];              //  alpha: ability of student
  real difficulty[I];           //  beta: difficulty of question
  real delta;                   //  intercept added to every observation's logit

}
model {

  ability ~ normal(0,1);
  difficulty ~ normal(0,1);
  delta ~ normal(0.75,1);

  for (n in 1:N_train) {
      grade_train[n] ~ bernoulli_logit(ability[subject_train[n]] - difficulty[item_train[n]] + delta);
  }

}
generated quantities {
  int<lower=0,upper=1> y_pred[N_test];   // simulated answers for the held-out rows
  int<lower=0,upper=1> y[N_test];        // observed answers for the held-out rows
  y = grade_test;

  for (n in 1:N_test) {
      y_pred[n] = bernoulli_logit_rng(ability[subject_test[n]] - difficulty[item_test[n]] + delta);
  }
}
"""
# Usage sketch (the data dict must additionally provide N_test):
# cv_posteriori = stan.build(_1pl_cross_val_model,data={**train_data, 'N_test': N_TEST},random_seed = 2021)
# cv_binary_fit = cv_posteriori.sample(num_chains=1, num_samples=2000,num_warmup=1000,num_thin=1)

#### Otherwise: cross-validation with manual folds in Python

In [None]:
# Number of chunks to partition the data into:
n_folds = 5
whole_dataset = pd.DataFrame({i: binary_sim_data[i] 
                              for i in ['item', 'subject', 'grade']})
test_dataset = whole_dataset.copy()
predictions = pd.DataFrame([])

In [None]:
# K-fold cross-validation of the 1PL model.
# The row index is shuffled once with a fixed seed and cut into n_folds
# disjoint pieces, so every observation is held out exactly once and a re-run
# of this cell gives the same folds.
shuffled_index = whole_dataset.sample(frac=1, random_state=2021).index.to_numpy()
fold_predictions = []
for fold, test_index in enumerate(np.array_split(shuffled_index, n_folds)):
    print(fold)
    test_data = whole_dataset.loc[test_index]
    training_data = whole_dataset.drop(test_index)
    # Only the variables declared in _1pl_model's data block are passed.
    this_fold_data = {'I': len(whole_dataset['item'].unique()),
                      'S': len(whole_dataset['subject'].unique()),
                      'N': len(training_data),
                      'item': training_data['item'].to_numpy(),
                      'subject': training_data['subject'].to_numpy(),
                      'grade': training_data['grade'].to_numpy()}
    # Fold-local names: the full-data `posteriori` / `binary_fit` used by the
    # rest of the notebook must not be overwritten by a fold fit.
    fold_posteriori = stan.build(_1pl_model,data=this_fold_data,random_seed = 2021)
    fold_fit = fold_posteriori.sample(num_chains=1, num_samples=100,num_warmup=5,num_thin=1)

    # _1pl_model has no generated quantities, so held-out probabilities are
    # computed here from the draws (rows: parameters, columns: draws).
    ability_draws = np.asarray(fold_fit['ability'])
    difficulty_draws = np.asarray(fold_fit['difficulty'])
    delta_draws = np.asarray(fold_fit['delta']).reshape(-1)
    subject_idx = test_data['subject'].to_numpy() - 1   # Stan ids are 1-based
    item_idx = test_data['item'].to_numpy() - 1

    # Average P(correct) over draws one draw at a time to keep memory flat.
    prob_correct = np.zeros(len(test_data))
    for draw in range(delta_draws.size):
        prob_correct += expit(ability_draws[subject_idx, draw]
                              - difficulty_draws[item_idx, draw]
                              + delta_draws[draw])
    prob_correct /= delta_draws.size

    fold_predictions.append(pd.DataFrame(np.round(prob_correct).astype(int),
                                         index=test_data.index))

# Concatenate once after the loop instead of growing a frame fold by fold.
predictions = pd.concat(fold_predictions, axis=0).sort_index()
predictions.columns = ['prediction_' + str(i) for i in predictions.columns]
output = whole_dataset.join(predictions)
output.head()

In [None]:
# crossVal validation
# az.r2_score(output['grade'],output['prediction_0'])
# print(f" mse = {mean_squared_error(output['grade'],output['prediction_0'])} \n kappa = {cohen_kappa_score(output['grade'],output['prediction_0'])} \n auc = {roc_auc_score(output['grade'],output['prediction_0'])} \n acc = {accuracy_score(output['grade'],output['prediction_0'])}   ")

###
# mae = (pd.DataFrame([abs(output[i] - output['grade']) 
#                      for i in output[predictions.columns]])
#        .mean(axis=1)
#        .mean())
# mse = (pd.DataFrame([(output[i] - output['grade']) ** 2
#                      for i in output[predictions.columns]])
#        .mean(axis=1)
#        .mean())
# print('Mean absolute error: ' + str(mae) + '\nMean square error: ' + str(mse))

### Print stanfit object

In [None]:
# Plain-text representation of the 1PL fit object.
print(binary_fit)

In [None]:
# Tabular view of the 1PL draws via the fit's to_frame() method.
binary_fit.to_frame()

## Visualize parameters distribution

In [15]:
#az.plot_trace(binary_fit)

In [None]:
# Export plot trace of model parameters
# axes = az.plot_trace(binary_fit)
# fig = axes.ravel()[0].figure
# fig.savefig("model_plot-trace.png")

## Diagnostic of model

In [None]:
# convert fit to inference_data
# Convert the 1PL fit to the ArviZ container consumed by the diagnostics
# below (`az` is the arviz package).
inf_data = az.convert_to_inference_data(binary_fit)

### Diagnostic az print

In [None]:
# ArviZ BFMI (Bayesian fraction of missing information) energy diagnostic.
az.bfmi(inf_data)

In [None]:
# ArviZ R-hat convergence diagnostic for the variables in inf_data.
rhat = az.rhat(inf_data)
print(rhat)

In [None]:
# ArviZ Monte Carlo standard error for the variables in inf_data.
az.mcse(inf_data)

In [None]:
# ArviZ effective sample size for the variables in inf_data.
az.ess(inf_data)

In [None]:
# R² of the 1PL predictions against the observed grades.
# NOTE(review): `y_pred` is only assigned in the "Evaluation and validation"
# section further down, so this cell fails on a fresh top-to-bottom run —
# confirm whether it should move below that section.
az.r2_score(train_data['grade'],y_pred)

In [None]:
# ArviZ summary table for the variables in inf_data.
az.summary(inf_data)

In [None]:
# Sampler statistics group of the converted fit.
inf_data.sample_stats

In [None]:
# Posterior draws group of the converted fit.
inf_data.posterior

In [None]:
# Only the posterior fit was passed to the conversion in this notebook, so a
# `prior` group may not exist; show it when present instead of raising.
inf_data.prior if hasattr(inf_data, "prior") else "inf_data has no 'prior' group"

In [17]:
### Diagnostic as plots

In [None]:
# ArviZ energy plot for the 1PL fit.
az.plot_energy(inf_data)

In [None]:
# Evolution of the effective sample size as draws accumulate.
# `inf_data` is the converted fit defined above; `idata` was never assigned.
az.plot_ess(
    inf_data, kind="evolution"
)

In [None]:
# Same ESS-evolution plot restricted to the ability parameters, with custom
# colours. `inf_data` is the converted fit; `idata` was never assigned.
extra_kwargs = {"color": "lightsteelblue"}
az.plot_ess(
    inf_data, kind="evolution", var_names=["ability"],
    color="royalblue", extra_kwargs=extra_kwargs
)

In [None]:
#Plot trace
param = binary_fit['ability'][0]
param2 = binary_fit['difficulty'][0]
# param = np.mean(fit['ability'],axis=1)  ==> for mean
# param2 = np.mean(fit['difficulty'],axis=1) ==> for mean

# Summary statistics ability
mean = np.mean(param)
median = np.median(param)
cred_min, cred_max = np.percentile(param, 2.5), np.percentile(param, 97.5)
# Summary statistics dificulty

mean2 = np.mean(param2)
median2 = np.median(param2)
cred_min2, cred_max2 = np.percentile(param2, 2.5), np.percentile(param2, 97.5)

plt.figure(figsize=(15,8))

# Plotting ability
plt.subplot(2,2,1)
plt.plot(param)
plt.xlabel('samples')
plt.ylabel("ability")
plt.axhline(mean, color='r', lw=2, linestyle='--')
plt.axhline(median, color='c', lw=2, linestyle='--')
plt.axhline(cred_min, linestyle=':', color='k', alpha=0.2)
plt.axhline(cred_max, linestyle=':', color='k', alpha=0.2)
plt.title("Trace et distribution postérieure pour la capacité de l'élève[0]")
plt.subplot(2,2,3)
plt.hist(param, 30, density=True); sns.kdeplot(param, shade=True)
plt.xlabel("ability")
plt.ylabel('density')
plt.axvline(mean, color='r', lw=2, linestyle='--',label='mean')
plt.axvline(median, color='c', lw=2, linestyle='--',label='median')
plt.axvline(cred_min, linestyle=':', color='k', alpha=0.2, label='95% CI')
plt.axvline(cred_max, linestyle=':', color='k', alpha=0.2)
plt.gcf().tight_layout()
plt.legend()

# Plotting dificulty
plt.subplot(2,2,2)
plt.plot(param2)
plt.xlabel('samples')
plt.ylabel("ability")
plt.axhline(mean2, color='r', lw=2, linestyle='--')
plt.axhline(median2, color='c', lw=2, linestyle='--')
plt.axhline(cred_min2, linestyle=':', color='k', alpha=0.2)
plt.axhline(cred_max2, linestyle=':', color='k', alpha=0.2)
plt.title("Trace et distribution postérieure pour la difficulté de l'item[0]")
plt.subplot(2,2,4)
plt.hist(param2, 30, density=True); sns.kdeplot(param2, shade=True)
plt.xlabel("difficulty")
plt.ylabel('density')
plt.axvline(mean2, color='r', lw=2, linestyle='--',label='mean')
plt.axvline(median2, color='c', lw=2, linestyle='--',label='median')
plt.axvline(cred_min2, linestyle=':', color='k', alpha=0.2, label='95% CI')
plt.axvline(cred_max2, linestyle=':', color='k', alpha=0.2)
plt.gcf().tight_layout()
plt.legend()
# save trace plot
plt.savefig("./final_data/params_posterior_distribution.png",bbox_inches='tight', pad_inches=0.5)

## Evaluation and validation 

In [None]:
# Point estimates for the 1PL model: average each parameter over axis 1
# (the draws), leaving one value per student / per item.
ability = np.asarray(binary_fit['ability']).mean(axis=1)
difficulty = np.asarray(binary_fit['difficulty']).mean(axis=1)

In [None]:
# Predict every training observation from the posterior-mean parameters.
# The linear predictor mirrors the Stan model, ability - difficulty + delta;
# leaving out `delta` shifts every predicted probability.
delta_hat = float(np.mean(binary_fit['delta']))
subject_idx = np.asarray(train_data['subject']) - 1   # Stan ids are 1-based
item_idx = np.asarray(train_data['item']) - 1

# Vectorised over all rows (not just the first 10000), so the result lines up
# with train_data['grade'] in the metric cells.
y_prob = expit(ability[subject_idx] - difficulty[item_idx] + delta_hat)
y_pred = np.round(y_prob).astype(int)   # hard 0/1 labels
y_pred

In [None]:
# Score the 1PL predictions against the observed grades from `train_data`
# (`binary_sim_data` is never defined in this notebook). The slice keeps both
# arrays the same length if y_pred covers only the leading rows.
y_true = np.asarray(train_data['grade'])[:len(y_pred)]
print(f" mse = {mean_squared_error(y_true,y_pred)} \n kappa = {cohen_kappa_score(y_true,y_pred)} \n auc = {roc_auc_score(y_true,y_pred)} \n acc = {accuracy_score(y_true,y_pred)}   ")

## Save Model

In [None]:
# Pickle the built 1PL model and its fit together; protocol=-1 selects the
# highest pickle protocol available.
# NOTE(review): the file name says "chains-2_numsamples-10000_numwarmup-1000",
# which does not match the sampling call in this notebook (4 chains, 1000
# samples, 2000 warmup) — confirm and rename if needed.
with open("./final_data/_1pl_model_chains-2_numsamples-10000_numwarmup-1000_num_thin-1.pkl", "wb") as f:
    pickle.dump({'model' : posteriori, 'fit' : binary_fit}, f, protocol=-1)

# 2PL Model

In [None]:
# Two-parameter logistic IRT model as a Stan program string.
# Log-odds of a correct answer, as written in the model block:
#   discrimination[item] * (ability[subject] - (difficulty[item] + mu_difficulty))
# Priors: ability and difficulty ~ std_normal, discrimination ~ lognormal(0,1),
# mu_difficulty ~ cauchy(0,5).
_2pl_model = """
data {
  // numbers of things
  
  int<lower=1> N;  // number of observations
  int<lower=1> I;  // items,  number of questions  
  int<lower=1> S;  // subjects,  number of users
  
  // data
  
  int<lower=1,upper=I> item[N];
  int<lower=1,upper=S> subject[N];
  int<lower=0,upper=1> grade[N];
}
parameters {
  // parameters
  
  vector[S] ability;             //  alpha ability od student
  vector[I] difficulty;          //  beta difficulty of question
  vector<lower=0>[I] discrimination;      // discrimination of question
  real mu_difficulty;
}
model {
  ability ~ std_normal();         
  difficulty ~ std_normal();   
  discrimination ~ lognormal(0,1);
  mu_difficulty ~ cauchy(0,5);
  
  grade ~ bernoulli_logit(discrimination[item] .* (ability[subject] - (difficulty[item] + mu_difficulty)));
  
}
"""

In [None]:
# Compile the 2PL program against `train_data`, the dict this notebook
# defines; `binary_sim_data` is never created here.
posteriori2 = stan.build(_2pl_model,data=train_data,random_seed = 2021)

In [None]:
# Sample the 2PL posterior: a single chain, 100 warmup iterations and 200
# kept draws, no thinning.
binary_fit2 = posteriori2.sample(num_chains=1, num_samples=200,num_warmup=100,num_thin=1)

In [None]:
# ArviZ trace plot of the 2PL fit.
az.plot_trace(binary_fit2)

In [None]:
# Point estimates for the 2PL model: average each parameter over axis 1
# (the draws).
ability2 = np.asarray(binary_fit2['ability']).mean(axis=1)
difficulty2 = np.asarray(binary_fit2['difficulty']).mean(axis=1)
discrimination2 = np.asarray(binary_fit2['discrimination']).mean(axis=1)

In [None]:
# Predict every training observation from the 2PL posterior means.
# The linear predictor mirrors the Stan model, including `mu_difficulty`:
#   discrimination * (ability - (difficulty + mu_difficulty))
mu_difficulty2 = float(np.mean(binary_fit2['mu_difficulty']))
subject_idx2 = np.asarray(train_data['subject']) - 1   # Stan ids are 1-based
item_idx2 = np.asarray(train_data['item']) - 1

# Vectorised over all rows so the result lines up with train_data['grade'].
y_prob2 = expit(discrimination2[item_idx2]
                * (ability2[subject_idx2] - (difficulty2[item_idx2] + mu_difficulty2)))
y_pred2 = np.round(y_prob2).astype(int)   # hard 0/1 labels
y_pred2

In [None]:
# Score the 2PL predictions against the observed grades from `train_data`
# (`binary_sim_data` is never defined in this notebook). The slice keeps both
# arrays the same length if y_pred2 covers only the leading rows.
y_true2 = np.asarray(train_data['grade'])[:len(y_pred2)]
print(f" mse = {mean_squared_error(y_true2,y_pred2)} \n kappa = {cohen_kappa_score(y_true2,y_pred2)} \n auc = {roc_auc_score(y_true2,y_pred2)} \n acc = {accuracy_score(y_true2,y_pred2)}   ")

## Save Model

In [None]:
# Pickle the built 2PL model and its fit together; protocol=-1 selects the
# highest pickle protocol available.
# NOTE(review): the file name says "chains-2_numsamples-10000_numwarmup-1000",
# which does not match the 2PL sampling call (1 chain, 200 samples, 100
# warmup) — confirm and rename if needed.
with open("./final_data/_2pl_model_chains-2_numsamples-10000_numwarmup-1000_num_thin-1.pkl", "wb") as f:
    # `binary_fit2` is the fit produced above; `binary_fi2t` was a typo.
    pickle.dump({'model' : posteriori2, 'fit' : binary_fit2}, f, protocol=-1)

# 3PL Model

In [None]:
# Three-parameter logistic IRT model as a Stan program string:
#   P(grade = 1) = guessing[item]
#                  + (1 - guessing[item])
#                    * inv_logit(discrimination[item]
#                                * (ability[subject] - (difficulty[item] + mu_difficulty)))
_3pl_model = """
data {
  // numbers of things

  int<lower=1> N;  // number of observations
  int<lower=1> I;  // items,  number of questions
  int<lower=1> S;  // subjects,  number of users

  // data

  int<lower=1,upper=I> item[N];
  int<lower=1,upper=S> subject[N];
  int<lower=0,upper=1> grade[N];
}
parameters {
  // parameters

  vector[S] ability;             //  alpha: ability of student
  vector[I] difficulty;          //  beta: difficulty of question
  vector<lower=0>[I] discrimination;      // discrimination of question
  vector<lower=0,upper=1>[I] guessing;    // lower bound on P(correct) for each question
  real mu_difficulty;            // shift added to every question's difficulty
}
model {
  ability ~ std_normal();
  difficulty ~ std_normal();
  discrimination ~ lognormal(0,1);
  guessing ~ beta(5,17);
  mu_difficulty ~ cauchy(0,5);

  // The expression below is already a probability in [0, 1], so it is passed
  // to bernoulli(). bernoulli_logit() expects log-odds and would apply
  // inv_logit a second time.
  grade ~ bernoulli(guessing[item] + ((1-guessing[item]).*(inv_logit(discrimination[item] .* (ability[subject] - (difficulty[item] + mu_difficulty))))));

}
"""

In [None]:
# Compile the 3PL program against `train_data`, the dict this notebook
# defines; `binary_sim_data` is never created here.
posteriori3 = stan.build(_3pl_model,data=train_data,random_seed = 2021)

In [None]:
# Sample the 3PL posterior: a single chain, 1000 warmup iterations and 2000
# kept draws, no thinning.
binary_fit3 = posteriori3.sample(num_chains=1, num_samples=2000,num_warmup=1000,num_thin=1)

In [None]:
# ArviZ trace plot of the 3PL fit.
az.plot_trace(binary_fit3)

In [None]:
# Point estimates for the 3PL model: average each parameter over axis 1
# (the draws).
ability3 = np.asarray(binary_fit3['ability']).mean(axis=1)
difficulty3 = np.asarray(binary_fit3['difficulty']).mean(axis=1)
discrimination3 = np.asarray(binary_fit3['discrimination']).mean(axis=1)
guessing3 = np.asarray(binary_fit3['guessing']).mean(axis=1)

In [None]:
# Predict every training observation from the 3PL posterior means.
# The formula mirrors the Stan model, including `mu_difficulty`:
#   guessing + (1 - guessing) * inv_logit(discrimination * (ability - (difficulty + mu_difficulty)))
mu_difficulty3 = float(np.mean(binary_fit3['mu_difficulty']))
subject_idx3 = np.asarray(train_data['subject']) - 1   # Stan ids are 1-based
item_idx3 = np.asarray(train_data['item']) - 1

# Vectorised over all rows so the result lines up with train_data['grade'].
logit3 = discrimination3[item_idx3] * (ability3[subject_idx3]
                                       - (difficulty3[item_idx3] + mu_difficulty3))
y_prob3 = guessing3[item_idx3] + (1 - guessing3[item_idx3]) * expit(logit3)
y_pred3 = np.round(y_prob3).astype(int)   # hard 0/1 labels
y_pred3

In [None]:
# Score the 3PL predictions against the observed grades from `train_data`
# (`binary_sim_data` is never defined in this notebook). The slice keeps both
# arrays the same length if y_pred3 covers only the leading rows.
y_true3 = np.asarray(train_data['grade'])[:len(y_pred3)]
print(f" mse = {mean_squared_error(y_true3,y_pred3)} \n kappa = {cohen_kappa_score(y_true3,y_pred3)} \n auc = {roc_auc_score(y_true3,y_pred3)} \n acc = {accuracy_score(y_true3,y_pred3)}   ")

## Save Model

In [13]:
# Pickle the built 3PL model and its fit together; protocol=-1 selects the
# highest pickle protocol available.
# NOTE(review): the file name says "chains-2_numsamples-10000_numwarmup-1000",
# which does not match the 3PL sampling call (1 chain, 2000 samples, 1000
# warmup) — confirm and rename if needed.
with open("./final_data/_3pl_model_chains-2_numsamples-10000_numwarmup-1000_num_thin-1.pkl", "wb") as f:
    pickle.dump({'model' : posteriori3, 'fit' : binary_fit3}, f, protocol=-1)

NameError: name 'posteriori' is not defined

# Summary of all models

In [None]:
# ROC curve of the 1PL predictions against the observed grades.
# roc_curve is imported locally because only roc_auc_score is imported at the
# top of the notebook and the bare `metrics` name was never bound.
from sklearn.metrics import roc_curve

plt.figure(0).clf()

# The slice keeps both arrays the same length if y_pred covers only the
# leading rows of the training data.
y_true_roc = np.asarray(train_data['grade'])[:len(y_pred)]
fpr, tpr, thresh = roc_curve(y_true_roc, y_pred)
auc = roc_auc_score(y_true_roc, y_pred)
plt.plot(fpr,tpr,label="1pl model 10000 itr, auc="+str(auc))
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")

plt.legend(loc=0)