In [2]:
from collections import defaultdict, namedtuple
import numpy as np
import pandas as pd
import math
import random

In [3]:
import duolingo_replica as dr
import duolingo_original as do

In [5]:
# Run the two helper scripts inside this kernel so that the names they define
# are available to later cells.
# NOTE(review): process_data (called in the data-loading cell) is not defined in
# this notebook; presumably it comes from one of these scripts — confirm.
%run clean_data_en_lstm
%run clean_features

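We load the data and build the working subset with `process_data`. Note that the resulting subset is not a 5000-observation sample: the preview below has row labels running from 0 to 569118.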

In [6]:
# Provenance: a remote copy of learning_traces.13m.csv is available at
# https://www.dropbox.com/scl/fi/pnxa2jv4xf23bfwry1q9x/learning_traces.13m.csv?rlkey=2dt9848lutbgyys5sujq8dgw2&dl=1
# Earlier experiments read that file into `data` and used either
# data.sample(n=5000, random_state=5) or only the rows with
# learning_language == "en" as the subset.
import os

# Path to the local copy of the CSV. Set the HLR_DATA_PATH environment variable
# to run the notebook on another machine; the default is the original location.
file_path = os.environ.get(
    "HLR_DATA_PATH",
    "C:/Users/Alexander/Dropbox/halflife_regression_rl/0_data/learning_traces.13m.csv",
)

# NOTE(review): process_data is not defined in this notebook — presumably it
# comes from one of the scripts executed with %run; the meaning of the second
# argument (20) should be confirmed there.
subset = process_data(file_path, 20)


In [7]:
# Display the working subset to inspect its columns and a few of its rows.
subset

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,appearance_count
0,1.0,1362082504,357,u:dwbJ,en,pt,1052c3ace653dbc8923eaa183bc02b88,definition/definition<n><sg>,17,17,2,2,629
1,1.0,1362082504,357,u:dwbJ,en,pt,9cba1b30f88bf3c047b22cffcaf88c12,surface/surface<n><sg>,19,19,3,3,629
2,1.0,1362082504,357,u:dwbJ,en,pt,961cd149f20f2571419b1412d849f19a,scale/scale<n><sg>,21,20,3,3,629
3,0.8,1362082504,357,u:dwbJ,en,pt,5cbb1249562e95794a4c4ae0e2d8ae26,temperature/temperature<n><sg>,44,36,5,4,629
4,1.0,1362082504,357,u:dwbJ,en,pt,2df65bdf80d10d2b78d62cb2e0a731d8,distance/distance<n><sg>,21,20,3,3,629
...,...,...,...,...,...,...,...,...,...,...,...,...,...
569115,1.0,1363104881,4294673,u:f_W4,en,pt,c032182c0ffb744c963ec83c937a26f4,us/prpers<prn><obj><p1><mf><pl>,10,9,1,1,35
569116,1.0,1363104881,4666611,u:f_W4,en,pt,80765ae2a08ba6812a4ab9df607b97d2,am/be<vbser><pri><p1><sg>,21,19,1,1,35
569117,0.5,1363104881,3543733,u:f_W4,en,pt,6d4c572af8022cb4784ce0f8898d1905,to/to<pr>,4,4,2,1,35
569118,1.0,1363104881,3610474,u:f_W4,en,pt,c9fb923e49d5cba24b5afb9ee1cff2a9,wine/wine<n><sg>,21,18,1,1,35


# 1. LOGIT

## 1.1. Replication

We perform the train-test split and, in addition, obtain the list of predictor variables. The predictor variables include 'right', 'wrong', 'bias', 'time', and dummy variables for lexemes.

In [8]:
# Train/test split for the replication logit model, together with the names of
# the predictor columns (lexeme dummies are kept).
trainset, testset, feature_vars = dr.read_data(
    subset,
    method='lr',
    omit_lexemes=False,
)

We define and fit a Logistic Regression Model for the replication code

In [9]:
%%time

model_1 = dr.logit_model( feature_columns = feature_vars )
model_1.train( trainset )

Wall time: 3h 49min 49s


We evaluate the model with the test set and obtain results. The `h_seed` argument makes the values of h_hat reproducible; these values are random for logistic regression.

In [None]:
# Evaluate the replication logit model on the held-out split with a fixed h_seed.
model_1.test_model(testset, h_seed=5)

We export theta values for the Logit replication model

In [None]:
# Export the replication logit model's theta values to a text file.
model_1.dump_theta('logit_replication_thetas.txt')

## 1.2. Original code

We perform the train-test split and, in addition, obtain the list of predictor variables. The predictor variables include: 'right,' 'wrong,' 'bias,' and dummy variables for lexemes.

In [None]:
# Train/test split produced by the original code path for the logit model.
trainset2, testset2 = do.read_data(subset, method='lr')

We define and fit a Logistic Regression Model for the original code

In [None]:
%%time

model_2 = do.SpacedRepetitionModel( method = 'lr' )
model_2.train( trainset2 )

We evaluate the model with the test set and obtain results. The first value on the right corresponds to the metric 'Total Loss.' The other metrics are named accordingly. The metrics are almost exactly the same as in the replication code.

In [None]:
# Evaluate the original logit model on its test split with a fixed h_seed.
model_2.eval(testset2, h_seed=5)

We export theta values for the Logit original model. Thetas are almost exactly the same as in the replication code.

In [None]:
# Export the original logit model's weights (thetas) to a text file.
model_2.dump_weights('logit_original_thetas.txt')

# 2. HLR Model

## 2.1. Replication

We perform the train-test split and, in addition, obtain the list of predictor variables. The predictor variables include: 'right,' 'wrong,' 'bias,' and dummy variables for lexemes. Unlike Model 1, in this case, the variable 'time' is not included as a predictor.

In [None]:
# Train/test split for the replication HLR model, together with the names of
# the predictor columns (lexeme dummies are kept).
trainset3, testset3, feature_vars3 = dr.read_data(
    subset,
    method='hlr',
    omit_lexemes=False,
)

We train the HLR replication model

In [None]:
%%time

model_3 = dr.HLR_model( feature_columns = feature_vars3, omit_h_term = True )
model_3.train( trainset3 )

We evaluate the model with the test set and obtain results

In [None]:
# Evaluate the replication HLR model on the held-out split.
model_3.test_model(testset3)

We export theta values for the HLR replication model

In [None]:
# Export the replication HLR model's theta values to a text file.
model_3.dump_theta('hlr_replication_thetas.txt')

## 2.2. Original

We perform the train-test split and, in addition, obtain the list of predictor variables. The predictor variables include: 'right,' 'wrong,' 'bias,' and dummy variables for lexemes. As in Model 3, the variable 'time' is not included as a predictor.

In [None]:
trainset2, testset2 = do.read_data( subset, method = 'hlr' )

We fit the HLR original model. We omit h_term.

In [None]:
%%time

model_4 = do.SpacedRepetitionModel( method = 'hlr', omit_h_term = True )
model_4.train( trainset2 )

We evaluate the model with the test set and obtain results. The first value on the right corresponds to the metric 'Total Loss.' The other metrics are named accordingly. The metrics are almost exactly the same as in the replication code.

In [None]:
model_4.eval( testset2, h_seed = 5 )

We export theta values for the HLR original model. Thetas are almost exactly the same as in the replication code.

In [None]:
# Export the original HLR model's weights (thetas) to a text file.
model_4.dump_weights('hlr_original_thetas.txt')