In [1]:
from collections import defaultdict, namedtuple
import numpy as np
import pandas as pd
import math
import random

In [2]:
import duolingo_replica as dr
import duolingo as d

We load the data and select a subset of 5000 observations to test the models.

In [3]:
# Read the full learning-traces CSV, then draw a fixed-seed sample of
# 5000 rows so every model below is trained and tested on the same subset.
data = pd.read_csv('learning_traces.13m_en_10.csv')
subset = data.sample(n=5000, random_state=5)

# 1. LOGIT

## 1.1. Replication

We perform the train-test split and, in addition, obtain the list of predictor variables. The predictor variables include: 'right', 'wrong', 'bias', 'time', and dummy variables for lexemes.

In [4]:
# Replication loader for the Logit ('lr') setup: returns the train split,
# the test split and the list of feature columns; lexeme features are kept.
trainset, testset, feature_vars = dr.read_data(
    subset,
    method='lr',
    omit_lexemes=False,
)

We define and fit a Logistic Regression Model for the replication code

In [5]:
%%time

model_1 = dr.logit_model( feature_columns = feature_vars )
model_1.train( trainset )

CPU times: total: 38.2 s
Wall time: 39.1 s


We evaluate the model with the test set and obtain results

In [6]:
# Evaluate the replication Logit model on the held-out split with h_seed fixed at 5.
model_1.test_model(testset, h_seed=5)

-----------------------------
            Results          
-----------------------------
Total Loss : 188763.186
p          : 65.696
h          : 188697.488
l2         : 0.002
mae (p)    : 0.332
cor (p)    : -0.022
mae (h)    : 149.518
cor (h)    : -0.024
-----------------------------


We export theta values for the Logit replication model

In [9]:
# Write the fitted thetas of the replication Logit model to a text file.
model_1.dump_theta('logit_replication_thetas.txt')

## 1.2. Original code

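We perform the train-test split with the original code. Unlike the replication code, the original `read_data` returns only the train and test sets, not a separate list of predictor variables. The predictor variables include: 'right', 'wrong', 'bias', and dummy variables for lexemes.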

In [10]:
# Original-code loader for the Logit ('lr') setup; it returns only the
# train and test splits (no feature-column list).
trainset2, testset2 = d.read_data(subset, method='lr')

done!


We define and fit a Logistic Regression Model for the original code

In [11]:
%%time

model_2 = d.SpacedRepetitionModel( method = 'lr' )
model_2.train( trainset2 )

CPU times: total: 46.9 ms
Wall time: 58.2 ms


We evaluate the model with the test set and obtain results. The first, unlabeled value in the output corresponds to the metric 'Total Loss'; the other metrics are labeled accordingly. The metrics are almost exactly the same as in the replication code.

In [12]:
# Evaluate the original Logit model on its test split with h_seed fixed at 5,
# matching the seed used for the replication model.
model_2.eval(testset2, h_seed=5)

188763.117 (p=65.626, h=188697.488, l2=0.002)	mae(p)=0.332	cor(p)=-0.022	mae(h)=149.518	cor(h)=-0.024


We export theta values for the Logit original model. Thetas are almost exactly the same as in the replication code.

In [13]:
# Write the fitted weights of the original Logit model to a text file.
model_2.dump_weights('logit_original_thetas.txt')

# 2. HLR Model

## 2.1. Replication

We perform the train-test split and, in addition, obtain the list of predictor variables. The predictor variables include: 'right,' 'wrong,' 'bias,' and dummy variables for lexemes. Unlike Model 1, in this case, the variable 'time' is not included as a predictor.

In [15]:
# Replication loader for the HLR setup: returns the train split, the test
# split and the list of feature columns; lexeme features are kept.
trainset3, testset3, feature_vars3 = dr.read_data(
    subset,
    method='hlr',
    omit_lexemes=False,
)

We train the HLR replication model

In [16]:
%%time

model_3 = dr.HLR_model( feature_columns = feature_vars3, omit_h_term = True )
model_3.train( trainset3 )

CPU times: total: 41.6 s
Wall time: 41.9 s


We evaluate the model with the test set and obtain results

In [17]:
# Evaluate the replication HLR model on the held-out split.
# NOTE(review): the other three evaluation cells pass h_seed = 5, but this
# call does not -- confirm whether HLR_model.test_model accepts or needs it.
model_3.test_model(testset3)

-----------------------------
            Results          
-----------------------------
Total Loss : 188049.037
p          : 180.259
h          : 187868.778
l2         : 0.000
mae (p)    : 0.431
cor (p)    : 0.011
mae (h)    : 149.080
cor (h)    : -0.092
-----------------------------


We export theta values for the HLR replication model

In [18]:
# Write the fitted thetas of the replication HLR model to a text file.
model_3.dump_theta('hlr_replication_thetas.txt')

## 2.2. Original

We perform the train-test split with the original code, which returns only the train and test sets. The predictor variables include: 'right', 'wrong', 'bias', and dummy variables for lexemes. As in Model 3 (and unlike Model 1), the variable 'time' is not included as a predictor.

In [19]:
# Original-code loader for the HLR setup; returns only the train and test splits.
# NOTE(review): this rebinds trainset2/testset2, replacing the Logit split
# created in section 1.2. Re-running the section 1.2 cells after this one
# would silently use the HLR split; distinct names would avoid that.
trainset2, testset2 = d.read_data(subset, method='hlr')

done!


We fit the HLR original model. We omit h_term.

In [20]:
%%time

model_4 = d.SpacedRepetitionModel( method = 'hlr', omit_h_term = True )
model_4.train( trainset2 )

CPU times: total: 46.9 ms
Wall time: 46.2 ms


We evaluate the model with the test set and obtain results. The first, unlabeled value in the output corresponds to the metric 'Total Loss'; the other metrics are labeled accordingly. The metrics are almost exactly the same as in the replication code.

In [21]:
# Evaluate the original HLR model on its test split with h_seed fixed at 5.
model_4.eval(testset2, h_seed=5)

188048.806 (p=180.252, h=187868.554, l2=0.000)	mae(p)=0.431	cor(p)=0.011	mae(h)=149.080	cor(h)=-0.092


We export theta values for the HLR original model. Thetas are almost exactly the same as in the replication code.

In [22]:
# Write the fitted weights of the original HLR model to a text file.
model_4.dump_weights('hlr_original_thetas.txt')