In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import deque 
import os

## Load and clean the data

In [30]:
# Integer codes for the language columns, used instead of the two-letter strings
# to keep the cleaned file small.
lang_map = {'de' : 0, 'en': 1, 'es': 2, 'fr': 3, 'it': 4, 'pt': 5}


def _downcast_numeric(frame, skip=('lexeme_string',)):
    """Downcast every column not named in `skip` via pd.to_numeric(downcast='unsigned').

    The frame is modified in place and also returned for convenience.
    """
    for col in frame.columns:
        if col not in skip:
            frame[col] = pd.to_numeric(frame[col], downcast='unsigned')
    return frame


if not os.path.exists("data/cleaned.csv"):
    # No cached result yet: build it from the raw learning traces.
    df = pd.read_csv("data/settles.acl16.learning_traces.13m.csv")
    df = df.sort_values(by=['user_id', 'lexeme_id', 'timestamp'])

    # Hash lexeme and user ids into small integers for smaller storage.
    # NOTE(review): if these ids are strings, built-in hash() is salted per Python
    # process, so the resulting codes differ between kernel sessions. Both output
    # files are written in this same run, so they stay consistent with each other,
    # but codes must not be compared across separately generated files.
    df['lexeme_id'] = df['lexeme_id'].apply(hash) % 1000000
    df['user_id'] = df['user_id'].apply(hash) % 5000000

    # Map languages to numbers for smaller storage.
    df['learning_language'] = df['learning_language'].map(lang_map)
    df['ui_language'] = df['ui_language'].map(lang_map)

    _downcast_numeric(df)

    # Keep only the text before the first '<'. split() leaves strings without
    # a '<' intact, whereas slicing up to find() == -1 would cut their last character.
    df['lexeme_string'] = df.lexeme_string.map(lambda x: x.split('<', 1)[0])

    # Persist the lexeme_id -> lexeme_string lookup next to the cleaned data,
    # at the same path the cached branch below reads it from.
    df_small = df.loc[:, ['lexeme_id', 'lexeme_string']]
    df_small = df_small.drop_duplicates()
    df_small.to_csv("data/lexeme_map.csv", index=False)
    lexeme_map = df_small

    # p_recall is dropped because (per the original note) it can be inferred from
    # the last two columns; lexeme_string now lives in the lexeme map instead.
    df = df.drop(["p_recall", "lexeme_string"], axis=1)
    df.to_csv("data/cleaned.csv", index=False)

else:
    # Cached result exists: reload it and restore the compact dtypes
    # (CSV does not preserve them).
    df = pd.read_csv("data/cleaned.csv")
    _downcast_numeric(df)

    # Earlier versions of this cell wrote the map to the working directory
    # rather than data/; fall back to that location if only it exists.
    lexeme_map_path = "data/lexeme_map.csv"
    if not os.path.exists(lexeme_map_path) and os.path.exists("lexeme_map.csv"):
        lexeme_map_path = "lexeme_map.csv"
    lexeme_map = pd.read_csv(lexeme_map_path)
    

### If the data is already cleaned, run this instead

## Scheduling Simulator

In [None]:
class Scheduler:
    """
    Base class shared by every learning scheduler.

    It only fixes the interface (constructor, next_item, update); each
    method here is a no-op that returns None, and subclasses override them.
    """

    def __init__(self, num_items):
        """Accept the item count; the base class stores nothing."""
        return None

    def next_item(self):
        """Pick the next item to present; the base class returns None."""
        return None

    def update(self, item, outcome):
        """Report the outcome for an item; the base class ignores it."""
        return None
    

class Random(Scheduler):
    """
    Scheduler that selects random items to present.

    Items are identified by the integers 0 .. num_items - 1.
    """
    def __init__(self, num_items):
        """
        :param num_items: Number of distinct items to choose between.
        """
        # The constructor was previously spelled `__init`, so it never ran
        # and self.n was never set.
        super().__init__(num_items)
        self.n = num_items

    def next_item(self):
        """
        Returns an item index drawn uniformly from [0, num_items).
        """
        # Read the stored count; the bare name `num_items` is not in scope here.
        return np.random.randint(0, self.n)

    def update(self, item, outcome):
        """
        Ignores the outcome: random selection keeps no per-item state.
        """
        pass
        
        

class Leitner(Scheduler):
    """
    This class implements a Leitner scheduler that samples from
    boxes with exponentially decreasing probability. Cards enter
    in box 0 (see add_item) and leave when they are correctly answered
    after entering the final box.
    """
    def __init__(self, nb):
        '''
        :param nb: Number of boxes (must be at least 1).
        :raises ValueError: if nb is smaller than 1.

        boxes is a list of queues representing the boxes.
        dist_boxes is the sampling distribution for which box to select from
        (box i has weight 1/2**i, normalised to sum to 1).
        cards is a set of items in the boxes currently.
        recent_box is the index of the box the last presented item came from.
        '''
        if nb < 1:
            raise ValueError("nb must be at least 1")
        # range(nb): iterating over the integer itself raises TypeError.
        self.boxes = [deque() for _ in range(nb)]
        weights = np.array([1 / 2**i for i in range(nb)])
        self.dist_boxes = weights / weights.sum()
        self.cards = set()
        # Defined up front so update() works even before the first next_item().
        self.recent_box = 0

    def add_item(self, item):
        """
        Introduces a new card into box 0. Cards already tracked are ignored.
        """
        if item not in self.cards:
            self.cards.add(item)
            self.boxes[0].appendleft(item)

    def next_item(self):
        """
        Gets the next item in the learning sequence.

        A box is drawn according to dist_boxes restricted to the boxes that
        currently hold cards; this is the same distribution as redrawing until
        a non-empty box comes up, without the unbounded recursion.

        :raises IndexError: if every box is empty.
        """
        nonempty = [i for i, box in enumerate(self.boxes) if box]
        if not nonempty:
            raise IndexError("no cards available in any box")

        probs = self.dist_boxes[nonempty]
        self.recent_box = int(np.random.choice(nonempty, p=probs / probs.sum()))
        # Items are appended on the left and taken from the right (FIFO per box).
        return self.boxes[self.recent_box].pop()

    def update(self, item, outcome, thresh=.9):
        """
        Updates the most recent item from the sequence
        by putting it back depending on the outcome.

        :param item: The item that was just presented.
        :param outcome: Score for the item; values above thresh count as correct.
        :param thresh: Threshold separating correct from incorrect outcomes.
        """
        if outcome > thresh:
            new_box = self.recent_box + 1
            if new_box >= len(self.boxes):
                # Correct answer from the final box: the card leaves the deck.
                # discard() tolerates items that were never registered in cards.
                self.cards.discard(item)
                return
        else:
            # Incorrect answer: move one box back, but never below box 0.
            new_box = max(self.recent_box - 1, 0)

        self.cards.add(item)
        self.boxes[new_box].appendleft(item)
        
            


In [28]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct
0,1362206313,8337322,4665602,0,1,802707,3,3,1,0
1,1362206313,16777591,4665602,0,1,911794,2,1,1,1
2,1362206313,19628054,4665602,0,1,952672,15,14,1,1
3,1362206313,8346489,4665602,0,1,885209,11,11,1,1
4,1362206313,6842117,4665602,0,1,922432,2,2,1,1


In [40]:
# Make sure lexeme_string is gone before re-saving. errors='ignore' keeps this
# cell re-runnable when the column has already been removed.
df = df.drop(columns='lexeme_string', errors='ignore')

In [None]:
# Write the cleaned frame back to the cache file, without the index column.
df.to_csv(path_or_buf="data/cleaned.csv", index=False)