In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import deque 
import os
import torch
from torch import nn
from typing import Union
from torch import optim
from torch import distributions
import itertools

from pandas.util import hash_pandas_object

from sim import Scheduler, Random, Leitner


## Load and clean the data

In [10]:
def timestamp_to_session(x):
    result = pd.DataFrame()
    
    timestamps_sorted = np.unique(np.array(x['timestamp']))
    timestamps_sorted.sort()
    
    
    result['timestamp'] = timestamps_sorted
    result['session'] = list(range(len(timestamps_sorted)))
    return result

In [None]:
lang_map = {'de' : 0, 'en': 1, 'es': 2, 'fr': 3, 'it': 4, 'pt': 5}

if not os.path.exists("data/cleaned.csv"):
    df = pd.read_csv("data/settles.acl16.learning_traces.13m.csv")

    
    print("hashed lex id's")
    #Hash lexemes for smaller storage
    df['lexeme_id'] = (pd.DataFrame(df['lexeme_id']).apply(hash_pandas_object, axis=0) % 1000000).loc[:, 'lexeme_id']
    print('hashed user id')
    #Hash user id's for smaller storage
    df['user_id'] = (pd.DataFrame(df['user_id']).apply(hash_pandas_object, axis=0) % 5000000).loc[:, 'user_id']
    
    #Map languages to numbers for smaller storage
    df['learning_language'] = df['learning_language'].map(lang_map)
    df['ui_language'] = df['ui_language'].map(lang_map)
    
    
    df['lexeme_string'] = df.lexeme_string.map(lambda x: x[0: x.find('<')])
    
    
    df_small = df.loc[:, ['lexeme_id', 'lexeme_string']]
    df_small = df_small.drop_duplicates()
    df_small.to_csv("lexeme_map.csv", index=False)
    
    #Drop this column as it's inferred from last two
    df = df.drop(["p_recall", "lexeme_string"], axis=1)
    
    
    for c in df.columns:
            if c != 'lexeme_string':
                df.loc[:, c] = pd.to_numeric(df[c], downcast='unsigned')
    
    #This adds the item difficulties of each lexeme
    item_difficulties = df.groupby('lexeme_id').apply(lambda x: x['history_correct'].sum() / x['history_seen'].sum())
    print('timestamp dones')
    #This creates a map of user id and timestamp to which session they are learning in.
    print("1")
    timestamp_map = df.loc[:, ['user_id', 'timestamp']].groupby(['user_id']).apply(timestamp_to_session)
    print("2")
    timestamp_map = timestamp_map.reset_index().drop(['level_1'], axis = 1).loc[:, ['timestamp', 'user_id', 'session']]


    print("3")
    #Get session for each one
    df = pd.merge(df, timestamp_map,  how='left', \
                                left_on=['user_id','timestamp'], right_on = ['user_id','timestamp'])
    print('4')
    #Get difficulty for each one
    df = pd.merge(df, pd.DataFrame(item_difficulties),  how='left', \
                                left_on=['lexeme_id'], right_on = ['lexeme_id'])
    print("5")
    df = df.rename(columns={0: "difficulty"})
    print('hashing users and lexemes')
    #Hash user and lexeme
    df['user_lex_hash'] = \
            df.loc[:, ['user_id', 'lexeme_id']].apply(hash_pandas_object, axis=1)
    
    print('groupby userlex hash')
    #Get the minimum timestamp for user lex hashes and merge the tables
    min_times_per_user = \
            df.loc[:, ['user_lex_hash', 'timestamp']].groupby('user_lex_hash').min()

    df = pd.merge(df, pd.DataFrame(min_times_per_user),  how='left', \
                                left_on=['user_lex_hash'], right_on = ['user_lex_hash'])
    
    
    print('sort by user _lex')
    #Sort and take the diff
    df.sort_values(by=['user_lex_hash', 'timestamp_x'], inplace=True)
    
    df_og = df.copy()
    
    df['sess_diff'] = df['session'].diff()


    df = df.loc[df['timestamp_x'] != df['timestamp_y']]


    for c in df.columns:
            if c != 'lexeme_string':
                df.loc[:, c] = pd.to_numeric(df[c], downcast='unsigned')

    df.to_csv("data/cleaned.csv", index=False)
    
else:
    df = pd.read_csv("data/cleaned.csv")
    for c in df.columns:
        if c != 'lexeme_string':
            df[c] = pd.to_numeric(df[c], downcast='unsigned')
            
    lexeme_map = pd.read_csv("data/lexeme_map.csv")
    

hashed lex id's
hashed user id
timestamp dones
1


## Data Exploration

We have about 5 million english learners, 3 million Spanish 1.9 million French and 1.4 million German learners. Italian and Portugese each have hundreds of thousands. It would be useful to restrict out studies to just the English users so we reduce the dimensionality of our dataset.

Interestingly this dataset doesn't contain any Germans learning English so our studies will consist of using the Spanish, French and Italians.

In [None]:
df.head()

In [None]:
tbl = df.groupby('lexeme_id').count()

In [None]:
tbl

In [None]:
nums = tbl['timestamp_x']

In [None]:
max(nums)

In [None]:
tbl = df.groupby('lexeme_id').count()

nums = tbl['timestamp_x']

idxs = nums < 1000

print(sum(idxs))

print(sum(nums[idxs]))

print(sum(nums[~idxs]))

In [None]:
print(sum(nums[idxs]))

In [None]:
print(sum(nums[~idxs]))

In [None]:
get_traj(student):
    get all rows of table
    

In [17]:
df.loc[:, 'lexeme_id']

0           76390c1350a8dac31186187e2fe1e178
1           7dfd7086f3671685e2cf1c1da72796d7
2           35a54c25a2cda8127343f6a82e6f6b7d
3           0cf63ffe3dda158bc3dbd55682b355ae
4           84920990d78044db53c1b012f5bf9ab5
                          ...               
12854221    d5efc552aaea3109eb5388aa1ec8673d
12854222    a826c47947d68549fa81e19cafa57ba0
12854223    5e29d77697d23070a1fb92eb6c90e9b6
12854224    cdfecc9247566d40bb964a218c54c783
12854225    c52ab45d4e22ee7580041911159e3c0c
Name: lexeme_id, Length: 12854226, dtype: object

In [18]:
df.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1


In [23]:
pd.DataFrame(df.loc[:, 'lexeme_id']).apply(hash_pandas_object) % 10

Unnamed: 0,lexeme_id
0,4
1,8
2,1
3,2
4,3
...,...
12854221,4
12854222,7
12854223,2
12854224,0
