In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools, os, torch

from sim import Scheduler, Random, Leitner
from data_process import process_original, reduce_df

%load_ext autoreload
%autoreload 2

## Load data (clean if necessary)

In [2]:
# Location of the cleaned dataset. Kept in one place so the existence check
# and the read below cannot drift apart.
CLEANED_CSV = "data/cleaned.csv"

# Run the preprocessing step only when the cleaned file is missing.
# process_original comes from data_process and is presumably what writes
# this file — TODO confirm.
if not os.path.exists(CLEANED_CSV):
    process_original()

df = pd.read_csv(CLEANED_CSV)
# The return value is discarded, so reduce_df presumably modifies `df` in
# place — TODO confirm against data_process.
reduce_df(df)

## Data Exploration

In [48]:
# Integer code per course language. Presumably this mirrors the encoding of
# the `learning_language` column — TODO confirm against data_process.
lang_map = {'de': 0, 'en': 1, 'es': 2, 'fr': 3, 'it': 4, 'pt': 5}
l_map = pd.read_csv("data/lexeme_map.csv")

# Number of rows (non-null `user_id` values) for each learning language.
df.groupby('learning_language')['user_id'].count()

learning_language
0    1452597
1    5014791
2    3407689
3    1873734
4     793935
5     311480
Name: user_id, dtype: int64

We have about 5 million English items, 3.4 million Spanish, 1.9 million French, and 1.5 million German. Italian and Portuguese each have hundreds of thousands. It would be useful to restrict our studies to just the English users so that we reduce the dimensionality of our action and state spaces.

We have 43.8 thousand learners (trajectories) to provide to our RL agents.



In [49]:
# Keep only the English rows. `lang_map['en']` replaces the bare literal 1 so
# the filter stays tied to the language encoding defined above. `.copy()`
# gives an independent frame for the modifications that follow.
df_en = df.loc[df['learning_language'] == lang_map['en']].copy()
# The language column is constant after filtering, so drop it.
df_en = df_en.drop(columns=['learning_language'])
# The return value is discarded, so reduce_df presumably modifies `df_en` in
# place — TODO confirm against data_process.
reduce_df(df_en)
# Replace the whole column so the float32 dtype actually sticks. On pandas
# 2.x, `df_en.loc[:, 'difficulty'] = ...` writes into the existing column in
# place and keeps its original dtype, silently undoing the downcast.
df_en['difficulty'] = df_en['difficulty'].astype(np.float32)

In [92]:
# Occurrences of each lexeme in the English subset, counted as non-null
# `timestamp` values per `lexeme_id`.
english_counts = df_en.groupby('lexeme_id')['timestamp'].count()
n_lex = len(english_counts)  # one entry per distinct lexeme
print(f"There are {n_lex} lexemes")

There are 2983 lexemes


In [141]:
def eval_thresh(df, counts, thresh):
    """
    Print how a per-lexeme count threshold splits the lexemes and the data.

    Two lines are printed:
      * the percentage of lexemes whose count is >= `thresh` versus < `thresh`;
      * the percentage of the summed counts that belongs to each of those
        two groups, i.e. how much data a filter at `thresh` keeps or drops.

    Parameters
    ----------
    df : any
        Unused. Kept in the signature so existing call sites keep working.
    counts : pandas.Series
        One entry per lexeme holding that lexeme's count.
    thresh : int or float
        Minimum count for a lexeme to be considered "above".

    Returns
    -------
    None
        Results are only printed. When a denominator is zero (for example an
        empty `counts`) the percentages are reported as 0.00 instead of
        raising ZeroDivisionError.
    """
    def _pct(part, whole):
        # Guard against an empty or all-zero denominator.
        return 100 * part / whole if whole else 0.0

    # Build each mask once and reuse it for both summaries.
    keep_mask = counts >= thresh
    drop_mask = counts < thresh

    # Vectorised reductions; the builtin sum() would walk the Series one
    # element at a time in Python.
    above = int(keep_mask.sum())
    below = int(drop_mask.sum())
    total = above + below

    incl = counts[keep_mask].sum()
    excl = counts[drop_mask].sum()
    total2 = incl + excl

    print(f"For threshold {thresh} there are {_pct(above, total):.2f}% lexemes above and {_pct(below, total):.2f}% below\n")
    print(f"There would be {_pct(incl, total2):.2f}% of data included and {_pct(excl, total2):.2f}% of data excluded")
    
def reduce_lexemes(df, amt):
    """
    Keep only the rows whose lexeme occurs at least `amt` times.

    Occurrences are counted as the number of non-null `timestamp` values per
    `lexeme_id`. The input frame is not modified; a filtered frame is
    returned.
    """
    per_lexeme = df.groupby('lexeme_id')['timestamp'].count()
    # Ids of the lexemes that meet the threshold, as a set for the isin lookup.
    frequent_ids = set(per_lexeme[per_lexeme >= amt].index)
    return df[df.lexeme_id.isin(frequent_ids)]
    

In [142]:
# Report what a cut-off of 5,000 occurrences per lexeme would keep and drop.
eval_thresh(df=df_en, counts=english_counts, thresh=5e3)

For threshold 5000.0 there are 6.50% lexemes above and 93.50% below

There would be 78.40% of data included and 21.60% of data excluded


In [146]:
# Drop every lexeme with fewer than 5,000 occurrences.
df_en_reduced = reduce_lexemes(df_en, 5e3)
# Row counts before and after filtering. This is the tuple recorded as the
# cell's output; the expression that produced it was missing from the cell.
len(df_en), len(df_en_reduced)

(5014791, 3931462)