In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools, os, torch

from sim import Scheduler, Random, Leitner
from data_process import process_original, reduce_df, eval_thresh, reduce_lexemes
from get_trajectory import get_traj

%load_ext autoreload
%autoreload 2

In [10]:
!cd data && ls

cleaned.csv    lexeme_map.csv


## Load data (clean if necessary)

In [11]:
# Build the cleaned CSV only when it is not on disk yet, then load it.
cleaned_csv = "data/cleaned.csv"
if not os.path.exists(cleaned_csv):
    # process_original comes from data_process; presumably it writes
    # data/cleaned.csv -- TODO confirm against that module.
    process_original()

df = pd.read_csv(cleaned_csv)

## Data Exploration

In [12]:
# Language code -> integer id (presumably the encoding used in the
# `learning_language` column of cleaned.csv -- TODO confirm).
lang_map = {code: idx for idx, code in enumerate(('de', 'en', 'es', 'fr', 'it', 'pt'))}
l_map = pd.read_csv("data/lexeme_map.csv")

# Number of non-null `user_id` entries for each learning language.
df.groupby('learning_language')['user_id'].count()

learning_language
0    1452597
1    5014791
2    3407689
3    1873734
4     793935
5     311480
Name: user_id, dtype: int64

We have about 5 million English items, 3 million Spanish, 1.9 million French, and 1.4 million German. Italian and Portuguese each have hundreds of thousands. It would be useful to restrict our studies to just the English learners so that we reduce the dimensionality of our action and state spaces.

There are 43.8 thousand learners (trajectories) that we can provide to our RL agents.



In [13]:
# Keep only the rows whose learning language is English and drop the column,
# which is constant after the filter. `lang_map` is defined in the data
# exploration cell above ('en' -> 1).
df = (
    df.loc[df['learning_language'] == lang_map['en']]
    .drop(columns=['learning_language'])
    .copy()
)

# The return value is unused, so reduce_df presumably modifies `df` in place
# (the `difficulty` column is read right after it) -- TODO confirm in data_process.
reduce_df(df)

# Standardise difficulty to zero mean / unit standard deviation.
# The statistics are computed in float64 so that accumulating over millions of
# rows does not lose precision; only the final result is stored as float32.
difficulty = df['difficulty'].astype(np.float64)
difficulty = (difficulty - difficulty.mean()) / difficulty.std()

# Assign the column by name. `df.loc[:, 'difficulty'] = ...` writes into the
# existing column in place on pandas >= 2.0 and keeps the old dtype, so the
# float32 cast would silently not take effect.
df['difficulty'] = difficulty.astype(np.float32)

In [14]:
# Rows (non-null timestamps) recorded for every lexeme in the filtered frame.
english_counts = df.groupby('lexeme_id')['timestamp'].count()

# One entry per distinct lexeme.
n_lex = english_counts.shape[0]
print(f"There are {n_lex} lexemes")

There are 2983 lexemes


In [15]:
# Count threshold handed to eval_thresh, which reports the share of lexemes
# and of data above and below it.
n_items = 500

eval_thresh(df, english_counts, n_items)

For threshold 500 there are 24.81% lexemes above and 75.19% below

There would be 94.46% of data included and 5.54% of data excluded


In [16]:
# Empty lookup tables; nothing in this cell fills them.
idx_to_lex = {}
lex_to_idx = {}

# Replace `df` with the frame returned by reduce_lexemes and keep `included`
# for get_traj (presumably the lexemes that pass the `n_items` threshold --
# TODO confirm in data_process).
df, included = reduce_lexemes(df, n_items)

In [55]:
# Build the trajectories. The cells below index `states` and `actions` by
# user id and `lti` by lexeme id; `itl` is presumably the inverse mapping of
# `lti` -- TODO confirm in get_trajectory.
(states, actions, itl, lti) = get_traj(df, included)


In [54]:
# Copy of the frame ordered by the `lex_user` column.
d = df.sort_values('lex_user')

# Every record of user 3 on lexeme 147, in chronological order.
df.query("user_id == 3 and lexeme_id == 147").sort_values('timestamp')

Unnamed: 0,timestamp,user_id,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct,difficulty,lex_user,session,ts_user
367,1362687858,3,5,147,184,164,2,2,-0.216559,207,21,41
781,1363100819,3,5,147,186,166,2,2,-0.216559,207,59,79
802,1363101153,3,5,147,188,168,1,1,-0.216559,207,60,80
806,1363101527,3,5,147,189,169,1,1,-0.216559,207,61,81


In [51]:
# Column index that `lti` assigns to lexeme 147.
c = lti[147]

# First state column for that lexeme; the next cell reads 4 consecutive
# columns starting here.
c_s = 4 * c

In [52]:
# All rows of user 3's state array, restricted to the 4 columns that start at c_s.
s = states[3][:, slice(c_s, c_s + 4)]
s

array([[184.        , 164.        ,  -0.21655871,   0.        ],
       [184.        , 164.        ,  -0.21655871,   1.        ],
       [184.        , 164.        ,  -0.21655871,   2.        ],
       [184.        , 164.        ,  -0.21655871,   3.        ],
       [184.        , 164.        ,  -0.21655871,   4.        ],
       [184.        , 164.        ,  -0.21655871,   5.        ],
       [184.        , 164.        ,  -0.21655871,   6.        ],
       [184.        , 164.        ,  -0.21655871,   7.        ],
       [184.        , 164.        ,  -0.21655871,   8.        ],
       [184.        , 164.        ,  -0.21655871,   9.        ],
       [184.        , 164.        ,  -0.21655871,  10.        ],
       [184.        , 164.        ,  -0.21655871,  11.        ],
       [184.        , 164.        ,  -0.21655871,  12.        ],
       [184.        , 164.        ,  -0.21655871,  13.        ],
       [184.        , 164.        ,  -0.21655871,  14.        ],
       [184.        , 164

In [53]:
# Column `c` (lexeme 147) of user 3's action array, one value per row.
actions[3][..., c]

array([2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 2., 1., 1., 0.])