In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import deque 
import os
import torch
from torch import nn
from typing import Union
from torch import optim
from torch import distributions
import itertools


## Load and clean the data

In [22]:
lang_map = {'de' : 0, 'en': 1, 'es': 2, 'fr': 3, 'it': 4, 'pt': 5}

# Locations of the raw download and of the cached artefacts derived from it.
RAW_PATH = "data/settles.acl16.learning_traces.13m.csv"
CLEANED_PATH = "data/cleaned.csv"
LEXEME_MAP_PATH = "data/lexeme_map.csv"
# Earlier versions of this cell wrote the lexeme map next to the notebook
# while reading it back from data/, so an existing cache may live here.
LEGACY_LEXEME_MAP_PATH = "lexeme_map.csv"


def _downcast_numeric(frame):
    """Shrink every column except 'lexeme_string' to the smallest unsigned dtype (in place)."""
    for c in frame.columns:
        if c != 'lexeme_string':
            frame[c] = pd.to_numeric(frame[c], downcast='unsigned')


if not os.path.exists(CLEANED_PATH):
    df = pd.read_csv(RAW_PATH)

    # Hash lexeme and user ids for smaller storage.
    # NOTE(review): if these ids are strings, the built-in hash() is salted per
    # interpreter process (PYTHONHASHSEED), so the resulting ids are only
    # consistent with files written by this same run; the modulo can also
    # merge distinct ids. Consider a stable digest if ids must be reproducible.
    df['lexeme_id'] = df['lexeme_id'].apply(hash) % 1000000
    df['user_id'] = df['user_id'].apply(hash) % 5000000

    # Map languages to numbers for smaller storage
    df['learning_language'] = df['learning_language'].map(lang_map)
    df['ui_language'] = df['ui_language'].map(lang_map)

    _downcast_numeric(df)

    # Keep only the text before the first '<'. Splitting (rather than slicing
    # up to str.find) leaves strings without a '<' intact instead of chopping
    # their last character.
    df['lexeme_string'] = df['lexeme_string'].str.split('<', n=1).str[0]

    # Persist the id -> surface-form lookup in the same place it is read from.
    df_small = df.loc[:, ['lexeme_id', 'lexeme_string']]
    df_small = df_small.drop_duplicates()
    df_small.to_csv(LEXEME_MAP_PATH, index=False)
    lexeme_map = df_small

    # p_recall is dropped because (per the original note) it can be inferred
    # from the last two columns; lexeme_string now lives in the lexeme map.
    df = df.drop(["p_recall", "lexeme_string"], axis=1)

    df = df.sort_values(by=['user_id', 'lexeme_id', 'timestamp'])

    df.to_csv(CLEANED_PATH, index=False)

else:
    df = pd.read_csv(CLEANED_PATH)
    _downcast_numeric(df)

    # Prefer the canonical location; fall back to the legacy one if that is
    # the only copy on disk.
    if not os.path.exists(LEXEME_MAP_PATH) and os.path.exists(LEGACY_LEXEME_MAP_PATH):
        lexeme_map = pd.read_csv(LEGACY_LEXEME_MAP_PATH)
    else:
        lexeme_map = pd.read_csv(LEXEME_MAP_PATH)
    

## Scheduling Simulator

In [23]:
class Scheduler:
    """
    Common interface for every learning scheduler.

    The base class does nothing: each method is a placeholder that
    subclasses override with a concrete scheduling strategy.
    """

    def __init__(self, num_items):
        """Accept the number of items; the base class stores nothing."""

    def next_item(self):
        """Pick the next item to present. The base class returns None."""
        return None

    def update(self, item, outcome):
        """Report the outcome of presenting ``item``. No-op in the base class."""
        return None
    

class Random(Scheduler):
    """
    Scheduler that selects random items to present.
    """
    def __init__(self, num_items):
        """
        :param num_items: Number of items; item ids are 0 .. num_items - 1.
        """
        # The constructor was previously spelled `__init`, so it never ran
        # and `self.n` was never set.
        self.n = num_items

    def next_item(self):
        """
        Draw an item id uniformly at random from [0, num_items).
        """
        return np.random.randint(0, self.n)

    def update(self, item, outcome):
        """
        Random scheduling keeps no state, so outcomes are ignored.
        """
        pass
        
        

class Leitner(Scheduler): 
    """
    This class implements a Leitner scheduler that samples from 
    boxes with exponentially decreasing probability. Cards enter
    in box 0 (see add_item) and leave when they are correctly answered
    after entering the final box.
    """
    def __init__(self, nb):
        '''
        :param nb: Number of boxes (at least 1).
        boxes is a list of queues representing the boxes.
        dist_boxes is the sampling distribution for which box to select from.
        cards is a set of items in the boxes currently.
        recent_box is the box the last drawn item came from (None before the first draw).
        :raises ValueError: if nb is smaller than 1.
        '''
        if nb < 1:
            raise ValueError("Leitner needs at least one box, got {}".format(nb))
        self.boxes = [deque() for _ in range(nb)]
        # Box i is weighted 1/2**i, normalised to a probability vector.
        weights = 0.5 ** np.arange(nb)
        self.dist_boxes = weights / weights.sum()
        self.cards = set()
        self.recent_box = None

    def add_item(self, item):
        """
        Introduces a new card into box 0. Items already tracked are ignored.
        """
        if item in self.cards:
            return
        self.cards.add(item)
        self.boxes[0].appendleft(item)

    def next_item(self):
        """
        Gets the next item in the learning sequence.

        A box is sampled from dist_boxes restricted to the non-empty boxes
        (equivalent to redrawing until a non-empty box is hit, but without
        unbounded recursion) and its oldest card is removed and returned.
        Returns None when every box is empty.
        """
        occupied = np.array([len(box) > 0 for box in self.boxes])
        if not occupied.any():
            return None

        probs = np.where(occupied, self.dist_boxes, 0.0)
        probs = probs / probs.sum()
        self.recent_box = int(np.random.choice(len(self.boxes), p=probs))
        # Cards are pushed with appendleft, so pop() yields the oldest one.
        return self.boxes[self.recent_box].pop()

    def update(self, item, outcome, thresh=.9):
        """
        Updates the most recent item from the sequence
        by putting it back depending on the outcome.

        An outcome above thresh promotes the card one box (retiring it when
        it was already in the final box); otherwise it is demoted one box,
        never below box 0.

        :raises RuntimeError: if no item has been drawn with next_item yet.
        """
        if self.recent_box is None:
            raise RuntimeError("update() called before next_item() returned an item")

        if outcome > thresh:
            new_box = self.recent_box + 1
            if new_box >= len(self.boxes):
                # Graduated; discard tolerates items that were never registered.
                self.cards.discard(item)
                return
        else:
            new_box = max(self.recent_box - 1, 0)

        self.cards.add(item)
        self.boxes[new_box].appendleft(item)
        

## Data Exploration

We have about 5 million English learners, 3 million Spanish, 1.9 million French and 1.4 million German learners. Italian and Portuguese each have hundreds of thousands. It would be useful to restrict our studies to just the English learners so we reduce the dimensionality of our dataset.

Interestingly, this dataset doesn't contain any German speakers learning English, so our studies will use the Spanish, French and Italian speakers.

In [98]:
# Per-user mean and standard deviation of `delta`.
delta_means = df.groupby('user_id')[['delta']].mean()
delta_std = df.groupby('user_id')[['delta']].std()

# Per-lexeme ratio of total correct answers to total exposures
# (a higher value means the item was answered correctly more often).
lexeme_totals = df.groupby('lexeme_id')[['history_correct', 'history_seen']].sum()
item_difficulties = lexeme_totals['history_correct'] / lexeme_totals['history_seen']

## Collecting Trajectories

For each user we will define a trajectory whose states are $N \times 3$ matrices, where $N$ is the number of lexemes in our target dataset. Each lexeme's row is a tuple (times_seen, times_correct, $\Delta$), where $\Delta$ is the time since the item was last seen.

In [58]:
# Non-null value counts of every remaining column, grouped by lexeme.
# NOTE(review): `df_english` is not defined in any cell visible in this
# notebook; presumably it is `df` restricted to English learners — confirm
# and define it before this cell so Restart & Run All works.
item_counts = df_english.groupby('lexeme_id').count()

In [6]:
# Order the English subset by user and then by timestamp, modifying it in place.
# NOTE(review): relies on `df_english`, which no visible cell defines — confirm.
df_english.sort_values(by=['user_id', 'timestamp'], inplace=True)

In [41]:
# An activation may be given by name or as a ready-made module.
Activation = Union[str, nn.Module]

# Lookup table from activation name to a module instance.
_str_to_activation = {
    'relu': nn.ReLU(),
    'tanh': nn.Tanh(),
    'leaky_relu': nn.LeakyReLU(),
    'sigmoid': nn.Sigmoid(),
    'selu': nn.SELU(),
    'softplus': nn.Softplus(),
    'identity': nn.Identity(),
}


def _resolve_activation(activation: Activation) -> nn.Module:
    """Return `activation` itself if it is a module, else look its name up."""
    if isinstance(activation, nn.Module):
        return activation
    return _str_to_activation[activation]


def build_mlp(
        input_size: int,
        output_size: int,
        n_layers: int,
        size: int,
        activation: Activation = 'relu',
        output_activation: Activation = 'identity') -> nn.Module:
    """
    Build a feed-forward network as an ``nn.Sequential``.

    The stack is: Linear(input_size, size) + activation, then ``n_layers``
    further Linear(size, size) + activation pairs, then
    Linear(size, output_size) followed by ``output_activation``.
    Note that this yields ``n_layers + 1`` hidden Linear layers; the count is
    kept as-is so existing layer indices stay valid.

    :param input_size: Width of the input features.
    :param output_size: Width of the network output.
    :param n_layers: Number of Linear(size, size) blocks after the input layer.
    :param size: Width of every hidden layer.
    :param activation: Hidden activation, as a name from
        ``_str_to_activation`` or an ``nn.Module``.
    :param output_activation: Activation applied to the final Linear output,
        given the same way. Previously this argument was ignored.
    :raises KeyError: if an activation name is not in ``_str_to_activation``.
    :return: The assembled ``nn.Sequential``.
    """
    hidden_activation = _resolve_activation(activation)
    final_activation = _resolve_activation(output_activation)

    layers = [nn.Linear(input_size, size), hidden_activation]

    for _ in range(n_layers):
        layers.append(nn.Linear(size, size))
        layers.append(hidden_activation)

    layers.append(nn.Linear(size, output_size))
    # Appended last so the indices of all Linear layers are unchanged.
    layers.append(final_activation)

    return nn.Sequential(*layers)


def from_numpy(*args, **kwargs):
    """
    Convert a NumPy array to a float32 tensor on the active device.

    Arguments are forwarded to ``torch.from_numpy``. The target is the
    module-level ``device`` set by ``init_gpu``; if ``init_gpu`` has not been
    called yet the tensor stays on the CPU instead of raising NameError.
    """
    try:
        target = device
    except NameError:
        target = torch.device("cpu")
    return torch.from_numpy(*args, **kwargs).float().to(target)


def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array: moved to the CPU and detached from autograd."""
    host_tensor = tensor.to('cpu')
    return host_tensor.detach().numpy()


def init_gpu(use_gpu=True, gpu_id=0):
    """
    Select the module-level ``device`` used for tensors.

    :param use_gpu: Use CUDA when it is available.
    :param gpu_id: Index of the CUDA device to select.

    Sets the global ``device`` to ``cuda:<gpu_id>`` when ``use_gpu`` is true
    and CUDA is available, otherwise to the CPU, and prints which one was
    chosen.
    """
    global device
    if use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(gpu_id))
        print("Using GPU id {}".format(gpu_id))
        return

    device = torch.device("cpu")
    if use_gpu:
        print("GPU not detected. Defaulting to CPU.")
    else:
        # Do not claim "not detected" when the caller opted out of the GPU.
        print("GPU disabled by use_gpu=False. Using CPU.")


In [96]:
class MLP_Policy:
    """
    Gaussian MLP policy trained by regressing sampled actions onto expert actions.

    ``mean_net`` maps observations to action means and ``logstd`` is a learned,
    state-independent log standard deviation. Training minimises the mean
    squared error between a reparameterised sample and the expert action.
    """
    def __init__(self, ac_dim, ob_dim, n_layers, size, lr, **kwargs):
        """
        :param ac_dim: Action dimensionality (network output width).
        :param ob_dim: Observation dimensionality (network input width).
        :param n_layers: Forwarded to build_mlp as n_layers.
        :param size: Hidden layer width.
        :param lr: Adam learning rate.
        :param kwargs: Accepted for call compatibility; unused.
        """
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim
        self.n_layers = n_layers
        self.size = size
        self.learning_rate = lr
        
        self.mean_net = build_mlp(
            input_size=self.ob_dim,
            output_size=self.ac_dim,
            n_layers=self.n_layers, size=self.size,
        )
        
        self.logstd = nn.Parameter(
            torch.zeros(self.ac_dim, dtype=torch.float32)
        )

        # update() calls self.loss; it was never defined before.
        self.loss = nn.MSELoss()
        
        self.optimizer = optim.Adam(
            itertools.chain([self.logstd], self.mean_net.parameters()),
            self.learning_rate
        )

    def forward(self, observation):
        """
        Return the action distribution for a batch of observations.

        :param observation: Float tensor whose last dimension is ob_dim.
        :return: ``distributions.Normal`` with mean ``mean_net(observation)``
            and standard deviation ``exp(logstd)``.
        """
        mean = self.mean_net(observation)
        return distributions.Normal(mean, torch.exp(self.logstd))

    def update(self, observations, actions):
        """
        Take one gradient step on a batch.

        :param observations: Array-like of observations, one row per sample.
        :param actions: Array-like of expert actions for those rows.
        :return: The scalar loss tensor for this batch (detached).
        """
        # Go through float32 NumPy first: unsigned integer columns are not
        # all convertible to tensors directly.
        param_device = self.logstd.device
        obs_t = torch.as_tensor(
            np.asarray(observations, dtype=np.float32), device=param_device
        )
        expert = torch.as_tensor(
            np.asarray(actions, dtype=np.float32), device=param_device
        )

        dist = self.forward(obs_t)
        acs_net = dist.rsample()

        # A flat action vector would otherwise broadcast against (batch, 1).
        if expert.shape != acs_net.shape and expert.numel() == acs_net.numel():
            expert = expert.reshape(acs_net.shape)

        loss = self.loss(expert, acs_net)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.detach()
        
    def run_training(self, n_iters, batch, all_obs, all_acs):
        """
        Train for ``n_iters`` steps on randomly positioned contiguous batches.

        :param n_iters: Number of gradient steps.
        :param batch: Number of consecutive rows per step.
        :param all_obs: Array-like of all observations.
        :param all_acs: Array-like of all expert actions, aligned with all_obs.
        :return: List with the float loss of every step.
        :raises ValueError: on an empty dataset, mismatched lengths or batch < 1.
        """
        all_obs = np.asarray(all_obs, dtype=np.float32)
        all_acs = np.asarray(all_acs, dtype=np.float32)

        n_samples = len(all_obs)
        if n_samples == 0:
            raise ValueError("run_training needs at least one observation")
        if len(all_acs) != n_samples:
            raise ValueError(
                "all_obs and all_acs differ in length: {} vs {}".format(
                    n_samples, len(all_acs))
            )
        if batch < 1:
            raise ValueError("batch must be at least 1, got {}".format(batch))

        # Keep every window fully inside the data.
        last_start = max(n_samples - batch, 0)

        losses = []
        for _ in range(n_iters):
            start = np.random.randint(0, last_start + 1)
            stop = start + batch
            loss_curr = self.update(all_obs[start:stop], all_acs[start:stop])
            losses.append(loss_curr.item())
        return losses
                

In [95]:
# One-dimensional action predicted from three observation features.
agent = MLP_Policy(ac_dim=1, ob_dim=3, n_layers=2, size=32, lr=5e-3)

# `df` has no 'item_difficulty' column; look each row's value up by lexeme in
# the per-lexeme `item_difficulties` series instead of selecting it from `df`.
observations = df.loc[:, ['history_seen', 'history_correct']].assign(
    item_difficulty=df['lexeme_id'].map(item_difficulties)
)

In [None]:
# FIXME(review): run_training is declared as (n_iters, batch, all_obs, all_acs)
# but only n_iters is passed here, so this call raises TypeError as written.
# The batch size, the observation array and the expert-action array still
# need to be supplied — which column holds the expert action is not shown
# in this notebook; confirm before filling it in.
agent.run_training(2000)

In [99]:
# Same in-place sort of `df_english` by user and timestamp that appears in an
# earlier cell; one of the two is redundant.
# NOTE(review): `df_english` is not defined in any visible cell — confirm.
df_english.sort_values(by=['user_id', 'timestamp'], inplace=True)

Unnamed: 0,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct
0,1362206313,8337322,4665602,0,1,802707,3,3,1,0
1,1362206313,16777591,4665602,0,1,911794,2,1,1,1
2,1362206313,19628054,4665602,0,1,952672,15,14,1,1
3,1362206313,8346489,4665602,0,1,885209,11,11,1,1
4,1362206313,6842117,4665602,0,1,922432,2,2,1,1


In [102]:
# Preview the first 50 rows in per-user chronological order; `df` itself is
# left unsorted because sort_values returns a new frame here.
df.sort_values(by=['user_id', 'timestamp']).head(50)

Unnamed: 0,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct
8782062,1362615149,3476,2,3,1,981238,1,1,7,7
8782069,1362615149,854,2,3,1,161200,2,2,8,7
8782056,1362958740,347670,2,3,1,361711,4,3,2,2
8782057,1362958740,347067,2,3,1,933898,7,5,2,1
8782058,1362958740,347389,2,3,1,495664,3,2,2,2
8782059,1362958740,344445,2,3,1,121049,4,4,2,2
8782060,1362958740,347670,2,3,1,975872,5,5,2,2
8782061,1362958740,347067,2,3,1,930801,6,5,1,1
8782063,1362958740,343591,2,3,1,981238,8,8,2,2
8782064,1362958740,305132,2,3,1,179176,1,1,1,0


In [121]:
def timestamp_to_session(x):
    """
    Number the distinct timestamps of a frame in ascending order.

    :param x: Frame with a 'timestamp' column.
    :return: Frame with one row per distinct timestamp: column 'timestamp'
        holds the sorted values and 'session' their rank starting at 0.
    """
    ordered_stamps = sorted(set(x['timestamp']))
    session_ids = list(range(len(ordered_stamps)))
    return pd.DataFrame({
        'timestamp': np.array(ordered_stamps),
        'session': session_ids,
    })

In [123]:
# Apply timestamp_to_session to each user's rows: the result has one row per
# (user, distinct timestamp) with that timestamp's per-user session number.
df_with_timestamps = df.loc[:, ['user_id', 'timestamp']].groupby(['user_id']).apply(timestamp_to_session)

In [125]:
# Display the per-user session table (pandas truncates the middle rows).
df_with_timestamps

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,session
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0,1362615149,0
2,1,1362958740,1
19,0,1362666731,0
53,0,1362906166,0
53,1,1362906769,1
...,...,...,...
4999979,1,1362177024,1
4999979,2,1362177277,2
4999979,3,1362177519,3
4999979,4,1363051739,4
