In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools, os, torch

from sim import Scheduler, Random, Leitner
from data_process import process_original, reduce_df, eval_thresh, reduce_lexemes
from get_trajectory import get_traj

%load_ext autoreload
%autoreload 2

## Load data (clean if necessary)

In [2]:
# The cleaned CSV is read from a fixed location; when it is missing,
# `process_original()` is run first (presumably it writes that file — confirm
# in data_process.py).
cleaned_csv = "data/cleaned.csv"
already_cleaned = os.path.exists(cleaned_csv)
if not already_cleaned:
    process_original()

df = pd.read_csv(cleaned_csv)

## Data Exploration

In [3]:
# Integer code used for each two-letter course language.
lang_map = dict(de=0, en=1, es=2, fr=3, it=4, pt=5)
l_map = pd.read_csv("data/lexeme_map.csv")

# Non-null `user_id` entries per `learning_language` code (shown as the cell output).
df.groupby('learning_language')['user_id'].count()

learning_language
0    1452597
1    5014791
2    3407689
3    1873734
4     793935
5     311480
Name: user_id, dtype: int64

We have about 5 million English items, 3 million Spanish, 1.9 million French, and 1.4 million German. Italian and Portuguese each have hundreds of thousands. It would be useful to restrict our studies to just the English learners so that we reduce the dimensionality of our action and state spaces.

There are 43.8 thousand learners (trajectories) available to provide to our RL agents.



In [4]:
# Restrict the data to the English course. The language code is looked up in
# `lang_map` instead of repeating the literal 1, so the filter stays correct if
# the mapping ever changes.
df = df.loc[df['learning_language'] == lang_map['en']].copy()
# Every remaining row has the same language, so the column carries no information.
df = df.drop(columns=['learning_language'])
# The return value is discarded, so `reduce_df` presumably modifies `df` in
# place — confirm in data_process.py.
reduce_df(df)
# Replace the whole column rather than assigning through `df.loc[:, ...]`:
# on current pandas a `.loc[:, col] = values` assignment writes into the
# existing column and keeps its old dtype, which would silently undo the
# float32 downcast.
df['difficulty'] = df['difficulty'].astype(np.float32)

In [5]:
# Number of rows (non-null timestamps) recorded for each lexeme.
english_counts = df.groupby('lexeme_id')['timestamp'].count()
n_lex = english_counts.shape[0]
print(f"There are {n_lex} lexemes")

There are 2983 lexemes


In [6]:
# Threshold passed to `eval_thresh` together with the per-lexeme counts; the
# same value is reused later by `reduce_lexemes`.
n_items = 500

eval_thresh(df, english_counts, n_items)

For threshold 500 there are 24.81% lexemes above and 75.19% below

There would be 94.46% of data included and 5.54% of data excluded


In [None]:
# Shrink the data to the retained lexemes; `included` is the collection of
# lexeme ids that were kept (it is iterated and measured with len() below).
df, included = reduce_lexemes(df, n_items)

get_traj(df, included)

# The cells further down index into these lookups and per-user session bounds.
# They were previously only present as commented-out code, which left
# `lex_to_idx` empty and `df_first_lex` / `max_sess` / `min_sess` undefined on a
# fresh kernel, so they are built here.

# Position of each retained lexeme in the state/action matrices, and the reverse.
idx_to_lex = dict(enumerate(included))
lex_to_idx = {lex: idx for idx, lex in idx_to_lex.items()}

# Chronological order within each user.
df = df.sort_values(by=['user_id', 'timestamp'])

# First record of every (user, lexeme) pair.
# NOTE(review): the commented-out draft grouped on a 'lex_user' column; the
# explicit key pair is used instead because both columns are read from these
# rows later on — confirm the two groupings are equivalent.
df_first_lex = df.groupby(['user_id', 'lexeme_id']).head(1)

# First and last session number seen for each user.
max_sess = df.groupby('user_id')['session'].max()
min_sess = df.groupby('user_id')['session'].min()


In [None]:
# Allocate one zero-filled state matrix and one action matrix per user.
# Rows: one per session number between the user's first and last session
# (inclusive). Columns: three state slots per retained lexeme, one action slot
# per retained lexeme.
states = {}
actions = {}

n_lexemes = len(included)

# A plain for-loop replaces the manual next()/bare-except loop so that real
# errors (e.g. a failed allocation) surface instead of silently ending the loop.
# `min_sess` is looked up by user id rather than paired by position.
for usr, mx in max_sess.items():
    n_rows = int(mx - min_sess[usr]) + 1
    states[usr] = np.zeros((n_rows, n_lexemes * 3))
    actions[usr] = np.zeros((n_rows, n_lexemes))



In [None]:
# Seed row 0 of every user's matrices from the first record of each lexeme.
# State slots for lexeme column c are [3c, 3c+1, 3c+2]; the first two receive
# the history counters, the third is pre-set to -1 in row 1.
for r in df_first_lex.itertuples(index=False):
    usr = r.user_id
    c = lex_to_idx[r.lexeme_id]
    c_s = c * 3

    user_states = states[usr]
    user_states[0, c_s] = r.history_seen
    user_states[0, c_s + 1] = r.history_correct
    # Users with a single session have no row 1. Check for it explicitly rather
    # than hiding every possible exception behind a bare `except: pass`.
    if user_states.shape[0] > 1:
        user_states[1, c_s + 2] = -1

    actions[usr][0, c] = r.session_seen
    

        

In [None]:
# Per-session increment vector: 1 in the third slot of every lexeme triple,
# 0 elsewhere. The width is derived from `included` (the same expression used
# to size the state matrices) instead of peeking at `states[3]`, which raised
# KeyError whenever user id 3 was not present.
add_arr = np.tile([0, 0, 1], len(included))

In [None]:
# Fill `actions` and `states` one interaction at a time.
#
# Row k of a user's matrices corresponds to session number min_sess[usr] + k.
# For lexeme column c, the state slots are 3c (seen), 3c + 1 (correct) and
# 3c + 2 (the counter that `add_arr` increments once per session).
#
# Fixes relative to the previous version of this cell:
#   * actions are written to the session's own row, not always row 0;
#   * a new session is detected on (user, session), so two consecutive users
#     that share a session number are no longer merged (`last_usr` was unused);
#   * the next row is seeded from the current one *before* the session's
#     increments are added, so the increments are no longer overwritten by a
#     later carry-forward, and row 0 is never rebuilt from row -1 (which
#     wrapped around to the last row and wiped the seeded history).
#
# NOTE(review): this assumes a user's session numbers are consecutive; rows for
# skipped session numbers are never visited and stay zero — confirm.
last_usr = None
l_sess = None

# Stable sort so that all rows of one (user, session) are contiguous while the
# existing order inside each session is preserved.
ordered = df.sort_values(['user_id', 'session'], kind='stable')

for r in ordered.itertuples(index=False):
    usr, sess = r.user_id, r.session
    c = lex_to_idx[r.lexeme_id]
    c_s = c * 3
    row = int(sess - min_sess[usr])
    # The user's final session has no following row to update.
    has_next_row = sess != max_sess[usr]

    actions[usr][row, c] = r.session_seen

    if usr != last_usr or sess != l_sess:
        last_usr, l_sess = usr, sess
        if has_next_row:
            # Carry the current state forward and advance every per-lexeme counter.
            states[usr][row + 1, :] = states[usr][row, :] + add_arr

    if has_next_row:
        states[usr][row + 1, c_s] += r.session_seen
        states[usr][row + 1, c_s + 1] += r.session_correct
        # A lexeme practised in this session starts the next one with counter 0.
        states[usr][row + 1, c_s + 2] = 0
        
        

In [None]:
# `in` is a reserved word and made this cell a SyntaxError; pass the retained
# lexeme collection returned by `reduce_lexemes`, as the earlier call does.
get_traj(df, included)