In [1]:
import argparse
import csv
import gzip
import math
import os
import random
import sys

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sys import intern
from collections import defaultdict, namedtuple

In [3]:
import duolingo as duo

In [4]:
# various constraints on parameters and outputs
MIN_HALF_LIFE = 15.0 / (24 * 60)    # 15 minutes
MAX_HALF_LIFE = 274.                # 9 months
LN2 = math.log(2.)

weights = defaultdict(float)
fcounts = defaultdict(int)
lrate=.001
hlwt=.01
l2wt=.1
sigma=1.


def pclip(p):
    # bound min/max model predictions (helps with loss optimization)
    return min(max(p, 0.0001), .9999)


def hclip(h):
    # bound min/max half-life
    return min(max(h, MIN_HALF_LIFE), MAX_HALF_LIFE)


def predict( inst, base=2.):
    h = halflife(inst, base)
    p = 2. ** (-inst.t/h)
    return pclip(p), h
    
def halflife(inst, base):
    try:
        dp = sum([weights[k]*x_k for (k, x_k) in inst.fv])
        return hclip(base ** dp)
    except:
        return MAX_HALF_LIFE


def train_update(inst):
        base = 2.
        p, h = predict(inst, base)
        dlp_dw = 2.*(p-inst.p)*(LN2**2)*p*(inst.t/h)
        dlh_dw = 2.*(h-inst.h)*LN2*h
        for (k, x_k) in inst.fv:
            rate = (1./(1+inst.p)) * lrate / math.sqrt(1 + fcounts[k])
            # rate = self.lrate / math.sqrt(1 + self.fcounts[k])
            # sl(p) update
            weights[k] -= rate * dlp_dw * x_k
            # sl(h) update
            # L2 regularization update
            weights[k] -= rate * l2wt * weights[k] / sigma**2
            # increment feature count for learning rate
            fcounts[k] += 1
            
            
def read_data(input_file, method, omit_bias=False, omit_lexemes=False, max_lines=None):
    # data = pd.read_csv(input_file)
    instances = []
    for i, row in data.iterrows():
        if max_lines is not None and i >= max_lines:
            break
        p = pclip(float(row['p_recall']))
        t = float(row['delta']) / (60 * 60 * 24)  # convert time delta to days
        h = hclip(-t / (math.log(p, 2)))
        lang = '%s->%s' % (row['ui_language'], row['learning_language'])
        lexeme_string = row['lexeme_string']
        timestamp = int(row['timestamp'])
        user_id = row['user_id']
        seen = int(row['history_seen'])
        right = int(row['history_correct'])
        wrong = seen - right
        fv = []
        if method == 'leitner':
            fv.append(('diff', right-wrong))
        elif method == 'pimsleur':
            fv.append(('total', right+wrong))
        else:
            fv.append(('right', math.sqrt(1+right)))
            fv.append(('wrong', math.sqrt(1+wrong)))
        if method == 'lr':
            fv.append(('time', t))
        if not omit_bias:
            fv.append(('bias', 1.))
        if not omit_lexemes:
            fv.append(('%s:%s' % (row['learning_language'], lexeme_string), 1.))
        instances.append(Instance(p, t, fv, h, (right+2.) / (seen+4.), lang, right, wrong, timestamp, user_id, lexeme_string))
    splitpoint = int(0.9 * len(instances))
    return instances[:splitpoint], instances[splitpoint:]

In [31]:
data = pd.read_csv( 'learning_traces.13m_en_10.csv' )

In [32]:
df = data.sample(n=1000, random_state=42)
df

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
67736,0.5,1362238751,3633081,u:dv9l,en,es,b1f3ddfd62c6bf5e6ef320e10c6b5c7b,europe/europe<np><loc><sg>,6,4,2,1
209424,1.0,1362510201,462,u:gZip,en,es,7489751fe9a37e9c8f03e2e4562b8d03,in/in<pr>,8,8,1,1
446627,1.0,1362094794,78115,u:iegs,en,es,7872afde4b6afddaad589ab644dd8d48,sees/see<vblex><pri><p3><sg>,10,9,1,1
141119,1.0,1362267676,95400,u:feOc,en,es,b968b069e4e2c04848e9f8924e34c031,we/prpers<prn><subj><p1><mf><pl>,40,40,1,1
50332,1.0,1362183822,7860,u:dTHC,en,es,928787744a962cd4ec55c1b22cedc913,eats/eat<vblex><pri><p3><sg>,53,42,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
454337,0.5,1362322756,2287,u:ih9Y,en,es,6df7cec9e2fb3502775ccacbea9f1b59,though/though<cnjsub>,2,2,2,1
3241,1.0,1362397491,17,u:Mh8,en,es,ca6a70bc7cd433ff43601c77d470d0b4,figure/figure<n><sg>,6,4,2,2
448452,1.0,1362883254,187,u:ifMk,en,es,01669d9689c5cf2ec04c58a893f02ab6,drinks/drink<vblex><pri><p3><sg>,29,27,1,1
70989,0.0,1362517638,1736,u:dzG6,en,es,4d380ab993dc7a36d9fcaf725faad8d8,flute/flute<n><sg>,4,3,2,0


In [7]:
df['p'] = df['p_recall'].apply(pclip).astype(float)
df['t'] = df['delta'].apply(lambda x: float(x)/(60*60*24))  # convertir a días
df['h'] = df.apply(lambda row: hclip(-row['t'] / np.log2(row['p'])), axis=1)
df['right'] = np.sqrt(1 + df['history_correct'])
df['wrong'] = np.sqrt(1 + df['history_seen'] - df['history_correct'])
df['lexeme_feature'] = df.apply(lambda row: f"{row['learning_language']}:{row['lexeme_string']}", axis=1)
splitpoint = int(0.9 * len(df))
trainset = df.iloc[:splitpoint]
testset = df.iloc[splitpoint:]

In [8]:
trainset

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,p,t,h,right,wrong,lexeme_feature
67736,0.500000,1362238751,3633081,u:dv9l,en,es,b1f3ddfd62c6bf5e6ef320e10c6b5c7b,europe/europe<np><loc><sg>,6,4,2,1,0.500000,42.049549,42.049549,2.236068,1.732051,en:europe/europe<np><loc><sg>
209424,1.000000,1362510201,462,u:gZip,en,es,7489751fe9a37e9c8f03e2e4562b8d03,in/in<pr>,8,8,1,1,0.999900,0.005347,37.062267,3.000000,1.000000,en:in/in<pr>
446627,1.000000,1362094794,78115,u:iegs,en,es,7872afde4b6afddaad589ab644dd8d48,sees/see<vblex><pri><p3><sg>,10,9,1,1,0.999900,0.904109,274.000000,3.162278,1.414214,en:sees/see<vblex><pri><p3><sg>
141119,1.000000,1362267676,95400,u:feOc,en,es,b968b069e4e2c04848e9f8924e34c031,we/prpers<prn><subj><p1><mf><pl>,40,40,1,1,0.999900,1.104167,274.000000,6.403124,1.000000,en:we/prpers<prn><subj><p1><mf><pl>
50332,1.000000,1362183822,7860,u:dTHC,en,es,928787744a962cd4ec55c1b22cedc913,eats/eat<vblex><pri><p3><sg>,53,42,2,2,0.999900,0.090972,274.000000,6.557439,3.464102,en:eats/eat<vblex><pri><p3><sg>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,1.000000,1362873122,162693,u:-b,en,es,cae56c108bf713f94982fcb37423db03,eat/eat<vblex><inf>,73,62,1,1,0.999900,1.883021,274.000000,7.937254,3.464102,en:eat/eat<vblex><inf>
13557,1.000000,1362158371,368,u:bkiW,en,es,15355f65d8b00bb5a1b55b31b859b2a4,<*sf>/difference<n><*numb>,296,279,1,1,0.999900,0.004259,29.521459,16.733201,4.242641,en:<*sf>/difference<n><*numb>
491310,1.000000,1362455931,89750,u:iyqu,en,es,f19bad09b7354d4cf02403544519d849,dinner/dinner<n><sg>,3,3,1,1,0.999900,1.038773,274.000000,2.000000,1.000000,en:dinner/dinner<n><sg>
411108,0.666667,1362939368,488062,u:iQOZ,en,es,9443f902cdcf9f4da3b5f58d3f657efe,you/prpers<@ij:thank_you>,2,2,3,2,0.666667,5.648866,9.656801,1.732051,1.000000,en:you/prpers<@ij:thank_you>


In [5]:
Instance = namedtuple('Instance', 'p t fv h a lang right wrong ts uid lexeme'.split())

In [6]:
# Suponiendo que la función pclip y hclip están definidas y que Instance es un namedtuple como en tu código.

instances = []

# Instancia 1 - manzana
p = pclip(0.75)
t = 2 / (60 * 60 * 24)  # 2 días
h = hclip(-t / math.log(p, 2))
fv = [('right', math.sqrt(1 + 9)), ('wrong', math.sqrt(1 + 3))]  # 9 correctas, 3 incorrectas
instances.append(Instance(p, t, fv, h, (9 + 2) / (12 + 4), 'español', 1, 0, 1651015200, 'user123', 'manzana'))

# Instancia 2 - sol
p = pclip(0.60)
t = 3 / (60 * 60 * 24)  # 3 días
h = hclip(-t / math.log(p, 2))
fv = [('right', math.sqrt(1 + 11)), ('wrong', math.sqrt(1 + 4))]  # 11 correctas, 4 incorrectas
instances.append(Instance(p, t, fv, h, (11 + 2) / (15 + 4), 'español', 0, 2, 1651020300, 'user456', 'sol'))

In [9]:
instances[ 0 ]

Instance(p=0.75, t=2.3148148148148147e-05, fv=[('right', 3.1622776601683795), ('wrong', 2.0)], h=0.010416666666666666, a=0.6875, lang='español', right=1, wrong=0, ts=1651015200, uid='user123', lexeme='manzana')

In [37]:
# Iteramos sobre los elementos de inst.fv, que son pares (k, x_k)
for (k, x_k) in inst.fv:
    # Obtenemos el peso correspondiente de self.weights
    peso_k = self.weights[k]
    
    # Calculamos el producto y lo sumamos a dp
    dp += peso_k * x_k

NameError: name 'self' is not defined

In [38]:
for inst in instances:
    for k, x_k in inst.fv:
        print( f'K: { k }' )
        print( f'X_k: { x_k }' )
        print( '-------------' )

K: right
X_k: 3.1622776601683795
-------------
K: wrong
X_k: 2.0
-------------
K: right
X_k: 3.4641016151377544
-------------
K: wrong
X_k: 2.23606797749979
-------------


In [42]:
for inst in instances:
    print( inst.fv )
    print( '------' )

[('right', 3.1622776601683795), ('wrong', 2.0)]
------
[('right', 3.4641016151377544), ('wrong', 2.23606797749979)]
------


In [43]:
for inst in instances:
    print( inst.p )

0.75
0.6


In [12]:
for inst in instances:
    # print( -inst.t )
    train_update( inst )

In [13]:
for inst in instances:
    print( inst )

Instance(p=0.75, t=2.3148148148148147e-05, fv=[('right', 3.1622776601683795), ('wrong', 2.0)], h=0.010416666666666666, a=0.6875, lang='español', right=1, wrong=0, ts=1651015200, uid='user123', lexeme='manzana')
Instance(p=0.6, t=3.472222222222222e-05, fv=[('right', 3.4641016151377544), ('wrong', 2.23606797749979)], h=0.010416666666666666, a=0.6842105263157895, lang='español', right=0, wrong=2, ts=1651020300, uid='user456', lexeme='sol')


In [14]:
weights

defaultdict(float,
            {'right': -3.046603650754234e-08, 'wrong': -1.953477198544493e-08})

In [15]:
df

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,p,t,h,right,wrong,lexeme_feature
67736,0.5,1362238751,3633081,u:dv9l,en,es,b1f3ddfd62c6bf5e6ef320e10c6b5c7b,europe/europe<np><loc><sg>,6,4,2,1,0.5000,42.049549,42.049549,2.236068,1.732051,en:europe/europe<np><loc><sg>
209424,1.0,1362510201,462,u:gZip,en,es,7489751fe9a37e9c8f03e2e4562b8d03,in/in<pr>,8,8,1,1,0.9999,0.005347,37.062267,3.000000,1.000000,en:in/in<pr>
446627,1.0,1362094794,78115,u:iegs,en,es,7872afde4b6afddaad589ab644dd8d48,sees/see<vblex><pri><p3><sg>,10,9,1,1,0.9999,0.904109,274.000000,3.162278,1.414214,en:sees/see<vblex><pri><p3><sg>
141119,1.0,1362267676,95400,u:feOc,en,es,b968b069e4e2c04848e9f8924e34c031,we/prpers<prn><subj><p1><mf><pl>,40,40,1,1,0.9999,1.104167,274.000000,6.403124,1.000000,en:we/prpers<prn><subj><p1><mf><pl>
50332,1.0,1362183822,7860,u:dTHC,en,es,928787744a962cd4ec55c1b22cedc913,eats/eat<vblex><pri><p3><sg>,53,42,2,2,0.9999,0.090972,274.000000,6.557439,3.464102,en:eats/eat<vblex><pri><p3><sg>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454337,0.5,1362322756,2287,u:ih9Y,en,es,6df7cec9e2fb3502775ccacbea9f1b59,though/though<cnjsub>,2,2,2,1,0.5000,0.026470,0.026470,1.732051,1.000000,en:though/though<cnjsub>
3241,1.0,1362397491,17,u:Mh8,en,es,ca6a70bc7cd433ff43601c77d470d0b4,figure/figure<n><sg>,6,4,2,2,0.9999,0.000197,1.363763,2.236068,1.732051,en:figure/figure<n><sg>
448452,1.0,1362883254,187,u:ifMk,en,es,01669d9689c5cf2ec04c58a893f02ab6,drinks/drink<vblex><pri><p3><sg>,29,27,1,1,0.9999,0.002164,15.001394,5.291503,1.732051,en:drinks/drink<vblex><pri><p3><sg>
70989,0.0,1362517638,1736,u:dzG6,en,es,4d380ab993dc7a36d9fcaf725faad8d8,flute/flute<n><sg>,4,3,2,0,0.0001,0.020093,0.010417,2.000000,1.414214,en:flute/flute<n><sg>


In [7]:
train, test = duo.read_data( df, method = 'hlr' )

In [8]:
model = duo.SpacedRepetitionModel(method = 'hlr')

In [9]:
model.train( train )

In [10]:
model.eval(test, 'test')

test	33404.6 (p=45.4, h=33359.1, l2=0.1)	mae(p)=0.517	cor(p)=0.064	mae(h)=132.828	cor(h)=0.146


In [12]:
model.dump_weights('model_weights.txt')
model.dump_predictions('model_predictions.txt', test)

In [65]:
MIN_HALF_LIFE = 15.0 / (24 * 60)    # 15 minutes
MAX_HALF_LIFE = 274. 
LN2 = math.log(2.)

class HalfLifeRegressionModel(object):
    """
    Half-life regression model using Pandas and NumPy.
    """    
    
    def pclip(p):
        # bound min/max model predictions (helps with loss optimization)
        return min(max(p, 0.0001), .9999)


    def hclip(h):
        # bound min/max half-life
        return min(max(h, MIN_HALF_LIFE), MAX_HALF_LIFE)
    
    def __init__(self, num_features, lrate=.001, hlwt=.01, l2wt=.1, sigma=1.):
        self.weights = np.zeros(num_features)
        self.fcounts = np.zeros(num_features)
        self.lrate = lrate
        self.hlwt = hlwt
        self.l2wt = l2wt
        self.sigma = sigma

    def halflife(self, features):
        
        print( f'Weights { len( self.weights ) }' )
        print( f'features { len( features ) }' )
        
        dp = np.dot(self.weights, features)
        return hclip(2 ** dp)

    def predict(self, features):
        h = self.halflife(features)
        p = 2. ** (-features['t'] / h)
        return pclip(p), h

    def train_update(self, inst):
        features = inst.drop(['p', 'h'])  # Assuming 'p' and 'h' are in the DataFrame
        p, h = self.predict(features)
        dlp_dw = 2. * (p - inst['p']) * (LN2 ** 2) * p * (inst['t'] / h)
        dlh_dw = 2. * (h - inst['h']) * LN2 * h
        self.weights -= self.lrate * (dlp_dw * features + self.hlwt * dlh_dw * features + self.l2wt * self.weights / self.sigma ** 2)
        self.fcounts += 1

    def train(self, trainset):
        # trainset = trainset.sample(frac=1)  # Shuffle the DataFrame
        for _, inst in trainset.iterrows():
            self.train_update(inst)

In [90]:
import numpy as np
import math

MIN_HALF_LIFE = 15.0 / (24 * 60)    # 15 minutes
MAX_HALF_LIFE = 274. 
LN2 = math.log(2.)

class HalfLifeRegressionModel(object):
    """
    Half-life regression model using Pandas and NumPy.
    """    

    @staticmethod
    def pclip(p):
        # bound min/max model predictions (helps with loss optimization)
        return min(max(p, 0.0001), .9999)

    @staticmethod
    def hclip(h):
        # bound min/max half-life
        return min(max(h, MIN_HALF_LIFE), MAX_HALF_LIFE)
    
    def __init__(self, num_features, lrate=.001, hlwt=.01, l2wt=.1, sigma=1.):
        self.weights = np.zeros(num_features)
        self.fcounts = np.zeros(num_features)
        self.lrate = lrate
        self.hlwt = hlwt
        self.l2wt = l2wt
        self.sigma = sigma

    def halflife(self, features):
        dp = np.dot(self.weights, features)
        return HalfLifeRegressionModel.hclip(2 ** dp)

    def predict(self, features):
        h = self.halflife(features)
        p = 2. ** (-features['t'] / h)
        return HalfLifeRegressionModel.pclip(p), h

    def train_update(self, inst):
        features = inst.drop(['p', 'h'])  # Assuming 'p' and 'h' are in the DataFrame
        p, h = self.predict(features)
        dlp_dw = 2. * (p - inst['p']) * (LN2 ** 2) * p * (inst['t'] / h)
        dlh_dw = 2. * (h - inst['h']) * LN2 * h
        self.weights -= self.lrate * (dlp_dw * features + self.hlwt * dlh_dw * features + self.l2wt * self.weights / self.sigma ** 2)
        for k in features.keys():
            self.fcounts[k] += 1

    def train(self, trainset):
        for _, inst in trainset.iterrows():
            self.train_update(inst)


In [91]:
import pandas as pd
import numpy as np

# Creando un pequeño conjunto de datos ficticios para el modelo HLR

np.random.seed(0)  # Para reproducibilidad

# Número de interacciones de usuario ficticias a generar
num_entries = 10

# Generando datos ficticios
data = {
    "p_recall": np.random.rand(num_entries),  # Probabilidad simulada de recuerdo
    "delta": np.random.randint(1, 1000, num_entries),  # Tiempo en segundos desde la última revisión
    "user_id": [f'user_{i}' for i in np.random.randint(1, 5, num_entries)],  # IDs de usuario ficticios
    "timestamp": pd.date_range(start='2023-01-01', periods=num_entries, freq='D').astype('int64') / 10**9,  # Timestamps ficticios
    "history_seen": np.random.randint(1, 20, num_entries),  # Total de veces visto
    "history_correct": np.random.randint(0, 20, num_entries),  # Total de veces respondido correctamente
    "session_seen": np.random.randint(1, 5, num_entries),  # Veces visto en la sesión actual
    "session_correct": np.random.randint(0, 5, num_entries),  # Veces respondido correctamente en la sesión actual
    "learning_language": np.random.choice(['Spanish', 'French', 'German'], num_entries),  # Idioma de aprendizaje
    "ui_language": np.random.choice(['English', 'Spanish'], num_entries),  # Idioma de la interfaz
    "lexeme_id": [f'lex_{i}' for i in np.random.randint(1, 100, num_entries)],  # IDs de lexemas ficticios
    "lexeme_string": [f'word_{i}' for i in np.random.randint(1, 100, num_entries)]  # Cadenas de lexemas ficticios
}

# Creando un DataFrame de pandas
df = pd.DataFrame(data)

df.head()  # Mostrando las primeras filas del conjunto de datos generado

Unnamed: 0,p_recall,delta,user_id,timestamp,history_seen,history_correct,session_seen,session_correct,learning_language,ui_language,lexeme_id,lexeme_string
0,0.548814,487,user_2,1672531000.0,19,3,1,1,French,English,lex_86,word_1
1,0.715189,552,user_1,1672618000.0,4,11,2,1,German,English,lex_49,word_51
2,0.602763,88,user_4,1672704000.0,18,18,4,1,Spanish,Spanish,lex_50,word_37
3,0.544883,175,user_1,1672790000.0,15,2,1,3,Spanish,English,lex_70,word_35
4,0.423655,601,user_4,1672877000.0,8,0,2,3,German,Spanish,lex_42,word_49


In [92]:
instances = []
for index, row in df.iterrows():
    p = pclip(float(row['p_recall']))
    t = float(row['delta'])/(60*60*24)  # convert time delta to days
    h = hclip(-t/(math.log(p, 2)))
    lang = '%s->%s' % (row['ui_language'], row['learning_language'])
    lexeme_id = row['lexeme_id']
    lexeme_string = row['lexeme_string']
    timestamp = int(row['timestamp'])
    user_id = row['user_id']
    seen = int(row['history_seen'])
    right = int(row['history_correct'])
    wrong = seen - right
    right_this = int(row['session_correct'])
    wrong_this = int(row['session_seen']) - right_this
    # feature vector is a list of (feature, value) tuples
    fv = []
    
    fv.append((intern('right'), math.sqrt(1+right)))
    fv.append((intern('wrong'), math.sqrt(1+wrong)))

ValueError: math domain error

In [93]:
model = SpacedRepetitionModel( method = 'hlr' )

In [81]:
model.train(trainset)

KeyError: 2

In [77]:
# Ejemplo de uso de la clase HalfLifeRegressionModel

# Número de características (excluyendo 'p', 't' y 'h')
num_features = 3

# Crear una instancia del modelo
model = HalfLifeRegressionModel(num_features=num_features)

# Preparar el conjunto de entrenamiento (aquí usamos todo el DataFrame como ejemplo)
trainset = df

# Entrenar el modelo
model.train(trainset)

# Realizar predicciones (esto es solo un ejemplo, normalmente lo harías con un conjunto de prueba)
for _, row in df.iterrows():
    features = row[['feature1', 'feature2']].values  # Extraer solo las características
    predicted_p, predicted_h = model.predict(features)
    print(f"Predicted p: {predicted_p}, Predicted h: {predicted_h}")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [44]:
data        = pd.read_csv( 'subset_1000.csv' )
dummies     = pd.get_dummies( data[ 'lexeme' ], prefix = 'cat', dtype=float )
dummies_col = dummies.columns.to_list()
df          = pd.concat( [ data, dummies ], axis = 1 )

pred_vars = [ 'right', 'wrong', 'bias' ]
dummies_  = dummies_col + pred_vars

In [45]:
np.random.seed(42)

num_datos = len(df)

# Generar índices aleatorios
indices_aleatorios = np.random.permutation(num_datos)
tamano_entrenamiento = int(0.7 * num_datos)
indices_entrenamiento = indices_aleatorios[:tamano_entrenamiento]
indices_prueba = indices_aleatorios[tamano_entrenamiento:]

# Crear conjuntos de entrenamiento y prueba usando iloc
X_train = df.iloc[indices_entrenamiento]
X_test = df.iloc[indices_prueba]

In [46]:
features = [var for var in dummies_ if var not in ['t', 'p']]

In [47]:
n_feat = len( features )
n_feat

373