In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import deque 
import os
import torch
from torch import nn
from typing import Union
from torch import optim
from torch import distributions
import itertools

from hashlib import sha256


## Load and clean the data

In [2]:
def timestamp_to_session(x):
    """
    Number the distinct timestamps of ``x`` in ascending order.

    :param x: object indexable by ``'timestamp'`` (e.g. one user's rows of the
        trace table); duplicates are collapsed before numbering.
    :return: DataFrame with one row per distinct timestamp and two columns:
        ``timestamp`` (sorted ascending) and ``session`` (0, 1, 2, ... in that
        same order).
    """
    # set() removes repeats, sorted() fixes the order the session ids follow.
    unique_times = np.array(sorted(set(x['timestamp'])))

    return pd.DataFrame({
        'timestamp': unique_times,
        'session': list(range(len(unique_times))),
    })

In [3]:
# Integer codes for the language columns (smaller on disk than two-letter strings).
lang_map = {'de' : 0, 'en': 1, 'es': 2, 'fr': 3, 'it': 4, 'pt': 5}

RAW_PATH = "data/settles.acl16.learning_traces.13m.csv"
CLEANED_PATH = "data/cleaned.csv"
LEXEME_MAP_PATH = "data/lexeme_map.csv"


def _hash_to_bucket(series, num_buckets):
    """
    Replace every value of ``series`` with an int in ``[0, num_buckets)``.

    The bucket is SHA-256 of the value's UTF-8 string form, read as an integer,
    modulo ``num_buckets``. Unlike the builtin ``hash`` this is stable across
    interpreter runs. Only the distinct values are hashed, then mapped back.
    Distinct inputs can share a bucket.
    """
    buckets = {
        value: int(sha256(str(value).encode('utf-8')).hexdigest(), 16) % num_buckets
        for value in series.unique()
    }
    return series.map(buckets)


def _downcast_numeric(frame):
    """
    Convert every column except ``lexeme_string`` with
    ``pd.to_numeric(..., downcast='unsigned')`` and return the frame.
    """
    for col in frame.columns:
        if col != 'lexeme_string':
            # Plain column assignment so the downcast dtype actually replaces the old one.
            frame[col] = pd.to_numeric(frame[col], downcast='unsigned')
    return frame


if not os.path.exists(CLEANED_PATH):
    df = pd.read_csv(RAW_PATH)

    # Hash lexemes for smaller storage
    df['lexeme_id'] = _hash_to_bucket(df['lexeme_id'], 1000000)

    # Hash user id's for smaller storage
    df['user_id'] = _hash_to_bucket(df['user_id'], 5000000)

    # Map languages to numbers for smaller storage
    df['learning_language'] = df['learning_language'].map(lang_map)
    df['ui_language'] = df['ui_language'].map(lang_map)

    # Keep only the text before the first '<'. split() leaves the string
    # untouched when there is no '<' (slicing with find() == -1 would drop
    # the last character instead).
    df['lexeme_string'] = df['lexeme_string'].map(lambda s: s.split('<', 1)[0])

    # Lookup table lexeme_id -> lexeme_string, saved where the cached branch reads it.
    df_small = df.loc[:, ['lexeme_id', 'lexeme_string']]
    df_small = df_small.drop_duplicates()
    df_small.to_csv(LEXEME_MAP_PATH, index=False)
    lexeme_map = df_small

    # Drop p_recall (original note: "inferred from last two" columns) and the
    # lexeme text, which now lives in the lexeme map.
    df = df.drop(["p_recall", "lexeme_string"], axis=1)

    # Per-lexeme difficulty: total history_correct / total history_seen.
    history_totals = df.groupby('lexeme_id')[['history_correct', 'history_seen']].sum()
    item_difficulties = (
        history_totals['history_correct'] / history_totals['history_seen']
    ).rename('difficulty')

    # This table contains user_id, timestamp, and which session the timestamp corresponds to
    timestamp_map = df.loc[:, ['user_id', 'timestamp']].groupby(['user_id']).apply(timestamp_to_session)
    timestamp_map = timestamp_map.reset_index().drop(['level_1'], axis = 1).loc[:, ['timestamp', 'user_id', 'session']]

    # Get session for each row
    df = pd.merge(df, timestamp_map, how='left', on=['user_id', 'timestamp'])

    # Get difficulty for each row
    df = pd.merge(df, item_difficulties.reset_index(), how='left', on='lexeme_id')

    # One key per (user, lexeme) pair. The '_' separator keeps e.g. user 12 /
    # lexeme 345 distinct from user 123 / lexeme 45.
    # NOTE: builtin hash() of a str is salted per interpreter process, so the
    # stored values differ between runs; they are only used as a grouping key.
    df['user_lex_hash'] = (
        df['user_id'].astype(str) + '_' + df['lexeme_id'].astype(str)
    ).apply(hash)

    # Earliest timestamp per (user, lexeme); merging it back yields
    # timestamp_x (the row's own time) and timestamp_y (the earliest time).
    min_times_per_user = \
            df.loc[:, ['user_lex_hash', 'timestamp']].groupby('user_lex_hash').min()
    df = pd.merge(df, min_times_per_user.reset_index(), how='left', on='user_lex_hash')

    # Sort so each pair's rows are contiguous and time-ordered, then take the
    # row-to-row session difference. The diff on the first row of every pair
    # crosses into the previous pair, but those rows are removed just below.
    df = df.sort_values(by=['user_lex_hash', 'timestamp_x'])
    df['sess_diff'] = df['session'].diff()

    # Drop each pair's earliest rows (own timestamp == earliest timestamp).
    df = df.loc[df['timestamp_x'] != df['timestamp_y']].copy()

    df = _downcast_numeric(df)

    df.to_csv(CLEANED_PATH, index=False)

else:
    df = pd.read_csv(CLEANED_PATH)
    df = _downcast_numeric(df)

    # Earlier versions of this cell wrote the map to the working directory;
    # fall back to that file if the data/ copy is missing.
    if not os.path.exists(LEXEME_MAP_PATH) and os.path.exists("lexeme_map.csv"):
        lexeme_map = pd.read_csv("lexeme_map.csv")
    else:
        lexeme_map = pd.read_csv(LEXEME_MAP_PATH)
    

## Scheduling Simulator

In [4]:
class Scheduler:
    """
    Parent class of any learning scheduler method.

    Defines the three-call protocol used by the subclasses: construct,
    ask for the next item with ``next_item()``, then report how the learner
    did with ``update(item, outcome)``. The base implementation does nothing.
    """

    def __init__(self, num_items):
        """No-op; ``num_items`` is accepted but not stored by the base class."""
        pass

    def next_item(self):
        """Return the next item to present (the base class returns None)."""
        pass

    def update(self, item, outcome):
        """Record ``outcome`` for ``item`` (the base class ignores both)."""
        pass


class Random(Scheduler):
    """
    Scheduler that selects random items to present.
    """
    def __init__(self, num_items):
        """
        :param num_items: items are the integers ``0 .. num_items - 1``.
        """
        self.n = num_items

    def next_item(self):
        """Return an integer drawn uniformly from ``[0, self.n)``."""
        return np.random.randint(0, self.n)

    def update(self, item, outcome):
        """Outcomes do not influence random selection; nothing to record."""
        pass


class Leitner(Scheduler):
    """
    This class implements a Leitner scheduler that samples from
    boxes with exponentially decreasing probability. Cards enter
    in box 0 (via ``add_item``) and leave when they are correctly
    answered after entering the final box.
    """
    def __init__(self, nb):
        '''
        :param nb: Number of boxes (must be at least 1)
        :raises ValueError: if ``nb`` is smaller than 1.

        boxes is a list of queues representing the boxes.
        dist_boxes is the sampling distribution for which box to select from
        (box i has weight 1/2**i, normalised to sum to 1).
        cards is the set of items currently in play.
        recent_box is the box the most recently served item was taken from.
        '''
        if nb < 1:
            raise ValueError("Leitner needs at least one box")
        self.boxes = [deque() for _ in range(nb)]
        weights = np.array([1 / 2**i for i in range(nb)])
        self.dist_boxes = weights / weights.sum()
        self.cards = set()
        # Defined up front so update() works even before the first next_item().
        self.recent_box = 0

    def add_item(self, item):
        """
        Put a new card into box 0. Cards already in play are left where they are.
        """
        if item not in self.cards:
            self.cards.add(item)
            self.boxes[0].appendleft(item)

    def next_item(self):
        """
        Gets the next item in the learning sequence.

        A box is drawn according to ``dist_boxes`` restricted to the boxes that
        currently hold cards, and the oldest card of that box is removed and
        returned. Returns None when every box is empty.
        """
        candidates = [i for i, box in enumerate(self.boxes) if box]
        if not candidates:
            return None

        # Renormalising over non-empty boxes gives the same distribution as
        # redrawing until a non-empty box comes up, without unbounded recursion.
        probs = self.dist_boxes[candidates]
        self.recent_box = int(np.random.choice(candidates, p=probs / probs.sum()))

        # appendleft + pop makes each box first-in, first-out.
        return self.boxes[self.recent_box].pop()

    def update(self, item, outcome, thresh=.9):
        """
        Updates the most recent item from the sequence
        by putting it back depending on the outcome.

        ``outcome > thresh`` promotes the item one box past ``recent_box``
        (or retires it if that was the last box); otherwise the item is
        demoted one box, never below box 0.
        """
        if outcome > thresh:
            new_box = self.recent_box + 1
            if new_box >= len(self.boxes):
                # discard(): retiring an item that was never registered is not an error.
                self.cards.discard(item)
                return
        else:
            new_box = max(self.recent_box - 1, 0)

        self.cards.add(item)
        self.boxes[new_box].appendleft(item)
        

## Data Exploration

We have about 5 million English learners, 3 million Spanish, 1.9 million French, and 1.4 million German learners. Italian and Portuguese each have hundreds of thousands. It would be useful to restrict our studies to just the English users so we reduce the dimensionality of our dataset.

Interestingly, this dataset doesn't contain any Germans learning English, so our studies will use the Spanish, French, and Italian users.

In [53]:
# Peek at the first rows of the cleaned frame.
df.head()

Unnamed: 0,timestamp_x,delta,user_id,learning_language,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct,session,difficulty,user_lex_hash,timestamp_y,sess_diff
0,1362703508,161682,4683705,1,2,768495,5,3,1,1,36,0.928567,-9223367538795580404,1362541826,7
1,1362704161,653,4683705,1,2,768495,6,4,2,2,37,0.928567,-9223367538795580404,1362541826,1
2,1362705368,1207,4683705,1,2,768495,8,6,3,3,38,0.928567,-9223367538795580404,1362541826,1
3,1362705749,381,4683705,1,2,768495,11,9,1,1,39,0.928567,-9223367538795580404,1362541826,1
4,1362706077,328,4683705,1,2,768495,12,10,3,2,40,0.928567,-9223367538795580404,1362541826,1


In [55]:
# For each lexeme_id, count the non-null entries in every other column.
tbl = df.groupby('lexeme_id').count()

In [56]:
# Display the grouped table.
tbl

Unnamed: 0_level_0,timestamp_x,delta,user_id,learning_language,ui_language,history_seen,history_correct,session_seen,session_correct,session,difficulty,user_lex_hash,timestamp_y,sess_diff
lexeme_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
19,1,1,1,1,1,1,1,1,1,1,1,1,1,1
320,21,21,21,21,21,21,21,21,21,21,21,21,21,21
359,1,1,1,1,1,1,1,1,1,1,1,1,1,1
363,202,202,202,202,202,202,202,202,202,202,202,202,202,202
375,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999624,23,23,23,23,23,23,23,23,23,23,23,23,23,23
999636,6,6,6,6,6,6,6,6,6,6,6,6,6,6
999748,64,64,64,64,64,64,64,64,64,64,64,64,64,64
999912,216,216,216,216,216,216,216,216,216,216,216,216,216,216


In [57]:
# Per-lexeme count of timestamp_x values (equals the row count per lexeme
# provided timestamp_x has no missing values).
nums = tbl['timestamp_x']

In [59]:
# Largest per-lexeme count.
nums.max()

94842

In [78]:
# Per-lexeme counts, recomputed here so the cell does not rely on earlier ones.
tbl = df.groupby('lexeme_id').count()
nums = tbl['timestamp_x']

# Mask of lexemes whose count is below 1000.
idxs = nums < 1000

# Printed in order: number of lexemes under the cutoff, total count belonging
# to those lexemes, total count belonging to the remaining lexemes.
for total in (idxs.sum(), nums[idxs].sum(), nums[~idxs].sum()):
    print(total)

14033


In [79]:
# Total count over the lexemes selected by idxs.
print(nums[idxs].sum())

1185744


In [80]:
# Total count over the lexemes not selected by idxs.
print(nums[~idxs].sum())

5810288


15179