In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools, os, torch

from sim import Scheduler, Random, Leitner
from data_process import process_original

%load_ext autoreload
%autoreload 2

## Load and clean the data

In [None]:
# Run the preprocessing entry point imported from data_process.
# NOTE(review): presumably this performs the same cleaning steps that are
# written out step by step in the following cell -- confirm against
# data_process.process_original.
process_original()

In [None]:
# Integer code for each language tag, assigned in alphabetical order
# (de=0, en=1, es=2, fr=3, it=4, pt=5).
lang_map = {
    tag: code
    for code, tag in enumerate(('de', 'en', 'es', 'fr', 'it', 'pt'))
}

df = pd.read_csv("data/settles.acl16.learning_traces.13m.csv")

# Replace the string identifiers with numbers.
# NOTE(review): assign_colstring_to_num is not imported in the visible cells;
# presumably it rewrites the named column of df in place (its return value is
# not used) -- confirm.
for id_column in ('user_id', 'lexeme_id'):
    assign_colstring_to_num(df, id_column)

# Encode both language columns with lang_map. Series.map with a dict yields
# NaN for any value that is not one of its keys.
for language_column in ('learning_language', 'ui_language'):
    df[language_column] = df[language_column].map(lang_map)


# Keep only the text that precedes the first '<' in each lexeme string.
# str.partition returns the whole string when no '<' is present; the previous
# slice `x[0: x.find('<')]` cut off the last character in that case because
# str.find returns -1.
df['lexeme_string'] = df['lexeme_string'].map(lambda s: s.partition('<')[0])

# Save the lexeme_id -> lexeme_string lookup in its own table so the string
# column can be removed from the main frame.
lex_map = df[['lexeme_id', 'lexeme_string']].drop_duplicates()
lex_map.to_csv("data/lexeme_map.csv", index=False)
lex_map = None  # drop the reference; the table now lives on disk

# Trim our dataframe: remove the columns that are no longer needed.
df = df.drop(["p_recall", "lexeme_string"], axis=1)
# NOTE(review): reduce_df is not imported in the visible cells; presumably it
# downcasts df's columns in place (its return value is not used) -- confirm.
reduce_df(df)

# Per-lexeme "difficulty": total history_correct divided by total
# history_seen over every row of that lexeme. Note that this is a success
# ratio, so larger values mean the item was answered correctly more often.
lexeme_totals = df.groupby('lexeme_id')[['history_correct', 'history_seen']].sum()
i_d = lexeme_totals['history_correct'] / lexeme_totals['history_seen']

# Attach the ratio to every row of the matching lexeme.
df = df.join(i_d.rename("difficulty"), on='lexeme_id')

# Build one integer key per (user, lexeme) pair and per (user, timestamp)
# pair, then sort by user and timestamp.
#
# The two parts are joined into a single delimited string *before* hashing so
# the key depends on which value is the user and which is the lexeme. Adding
# two separate hashes is commutative: (user 3, lexeme 5) and
# (user 5, lexeme 3) would receive the same key.
#
# NOTE: str hashes are salted per interpreter process unless PYTHONHASHSEED
# is fixed, so the raw key values are only meaningful within a single run.
user_str = df['user_id'].astype(str)
df['lex_user'] = (user_str + '_' + df['lexeme_id'].astype(str)).apply(hash)
# This df-side ts_user column is dropped again (as ts_user_x) after the
# session merge later in this cell; it is kept so that merge still produces
# the ts_user_x / ts_user_y suffixes.
df['ts_user'] = (user_str + '_' + df['timestamp'].astype(str)).apply(hash)
del user_str  # free the temporary string column

df = df.sort_values(by=['user_id', 'timestamp'])

# Build the sessions table: apply timestamp_to_session to each user's
# (user_id, timestamp) rows.
# NOTE(review): timestamp_to_session is not imported in the visible cells;
# what it returns per user cannot be seen from here -- confirm.
user_timestamps = df[['user_id', 'timestamp']]
ts_cntr = user_timestamps.groupby(['user_id']).apply(timestamp_to_session)
del user_timestamps

# Discard the inner index level produced by apply, then turn the remaining
# index level into a regular column.
ts_cntr = ts_cntr.reset_index(level=1, drop=True).reset_index()

# Same user/timestamp key as on df, computed on the sessions table.
user_hash = ts_cntr['user_id'].astype(str).apply(hash)
stamp_hash = ts_cntr['timestamp'].astype(str).apply(hash)
ts_cntr['ts_user'] = user_hash + stamp_hash
del user_hash, stamp_hash

# Join the sessions table back onto the traces. Both frames carry a ts_user
# column, so the merge suffixes them _x (from df) and _y (from ts_cntr);
# only the sessions-table copy is kept.
df = df.merge(ts_cntr, on=['timestamp', 'user_id'])
df = df.drop(columns=['ts_user_x'])

# Replace the composite hash keys with numbers.
# NOTE(review): assign_colstring_to_num is not imported in the visible cells;
# presumably it rewrites the named column of df in place -- confirm.
assign_colstring_to_num(df, 'lex_user')
assign_colstring_to_num(df, 'ts_user_y')

# Downcast difficulty to float32 by replacing the column outright.
# `df.loc[:, 'difficulty'] = ...` writes the new values into the existing
# float64 column on pandas >= 2.0, which silently discards the downcast.
df['difficulty'] = df['difficulty'].astype(np.float32)
reduce_df(df)

# NOTE(review): unlike the lexeme_map.csv export, this also writes the row
# index as an extra leading column -- confirm that is intended.
df.to_csv("data/cleaned.csv")

## Data Exploration

We have about 5 million English learners, 3 million Spanish, 1.9 million French, and 1.4 million German learners. Italian and Portuguese each have hundreds of thousands. It would be useful to restrict our studies to just the English learners so that we reduce the dimensionality of our dataset.

Interestingly, this dataset doesn't contain any Germans learning English, so our studies will use the Spanish, French, and Italian speakers.