## 1. Load libraries and data

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## 2. Define functions

https://github.com/duolingo/halflife-regression

Funciones

1. **estimate_h_hat**: Permite estimar h_hat dados los valores de theta y x. Ecuación N. 2. 
2. **estimate_p_hat**: Permite estimar p_hat dados los valores de delta y h_hat. Ecuación N. 4.
3. **hh_loss_function**: Permite estimar el costo de la función tomando en cuenta el valor de p, p_hat, h_hat, delta y lambda. Ecuación N. 9. 
4. **gradient_partial**: Permite estimar el gradiente de la función de costo. Ecuación N. 10.
5. **adagrad_update**: Permite actualizar el valor de theta con cada iteración utilizando el algoritmo SGD adagrad. Ecuación N. 11.
6. **half_life_regression**: Modelo HLR que integra todas las funciones anteriores. 

In [130]:
def estimate_h_hat( theta, x ):
    
    '''
    Objetivo:
        - Estimar hat{h} a través de la ecuación N. 2:
          \hat{h} = 2^{Theta \cdot x}
          
    Input:
        - theta : valor de los coeficientes de x
        - x     : variables predictoras
        
    Output:
        - estimated_half_life: hat{h}
    '''
    estimated_h = 2 ** np.dot( theta.T, x )
    
    return estimated_h

In [131]:
def estimate_p_hat( delta, estimated_h ):
    
    '''
    Objetivo:
        - Estimar  hat{p} a través de la ecuación N. 4:
          \hat{p}_{\Theta} = 2^{-\Delta/\hat{h}_{\Theta}}
   
   Input:
       - delta       : tiempo transcurrido desde 
                       la última práctica
       - estimated_h : valor estimado de la capacidad
                       de memoria o half_life ( hat{h} ).
   
   Output:
       - predicted_p : valor estimado de la probabilidad
                       de recordar( hat{p} )
        
    '''
    
    predicted_p = 2 ** ( - delta / estimated_h )
    
    return predicted_p

In [132]:
def hh_loss_function( p, predicted_p, estimated_h, delta, theta, regularization, lambda_param = 0.1, alpha_param = 0.01 ):
    
    '''
    Objetivo: 
        - Calcular el valor de pérdida del modelo Half Life Regression
          Ecuación N. 9.
    
    Input:
        - p            : probabilidad de recordar real
        - predicted_p  : probabilidad de recordar predicha
                         mediante la función predict_recall_probability
        - estimated_h  : capacidad de memoria estimada
        - delta        : tiempo transcurrido desde la última práctica
        - theta        : valor de los coeficientes de x
        - lambda_param : parámetro lambda de importancia relativa de la
                         semivida en la función de pérdida
        - alpha_param  : parámetro de regularización L2
        
    Output:
        - Valor de pérdida de la función Half Life Regression
    '''
    
    loss_p              = np.square( p - predicted_p )
    # loss_p = np.square(p - predicted_p.reshape(p.shape))
    loss_h              = np.square( ( -delta / np.log2( p ) ) - estimated_h )
    
    if regularization == 'l2': 
    
        regularization_term = lambda_param * np.sum( np.square( theta ) )
        
    elif regularization == 'l1':
        
        regularization_term = lambda_param * np.sum( np.abs( theta ) )        
    
    loss = loss_p + alpha_param * loss_h + regularization_term  

    return loss

In [133]:
def gradient_partial( p, predicted_p, estimated_h, delta, x, theta, lambda_param = 0.1, alpha_param = 0.01 ):
    
    '''
    Calcula la derivada parcial de la función de pérdida con respecto a cada peso theta_k.

    Input:
    - p            : probabilidad de recordar real
    - predicted_p  : probabilidad de recordar predicha
    - estimated_h  : capacidad de memoria estimada
    - delta        : tiempo transcurrido desde la última práctica
    - x            : vector de características
    - theta        : vector de pesos
    - lambda_param : parámetro lambda de importancia relativa de la semivida en la función de pérdida
    - alpha_param  : parámetro de regularización L2

    Output:
    - gradient : vector de derivadas parciales con respecto a cada theta_k
    '''
    
    term1 = 2 * ( predicted_p - p ) * np.log( 2 ) * predicted_p * ( 2**( -delta / estimated_h ) ) * x
    term2 = 2 * alpha_param * ( estimated_h + delta / np.log2( p ) ) * np.log( 2 ) * estimated_h * x
    term3 = 2 * lambda_param * theta

    gradient = term1 + term2 + term3
    
    return gradient

In [134]:
def adagrad_update( theta, gradient, learning_rate, csg ):
    
    '''
    Actualiza los pesos utilizando el algoritmo AdaGrad.

    Input:
    - theta         : vector de pesos
    - gradient      : vector de derivadas parciales con respecto a cada theta_k
    - learning_rate : tasa de aprendizaje
    - csg           : acumulación de los cuadrados de los gradientes anteriores

    Output:
    - theta_updated : vector de pesos actualizado
    '''
    
    csg_updated = csg + gradient**2
    theta_updated = theta - ( learning_rate / np.sqrt( csg_updated + 1e-8 ) ) * gradient

    return theta_updated, csg_updated

In [138]:
def half_life_regression( X, y, delta, regularization, learning_rate=0.01, lambda_param=0.1, alpha_param=0.01, num_iterations = 1000 ):
    '''
    Implementa el modelo de Half-Life Regression.

    Input:
    - X               : Matriz de características (dimensiones: m x n)
    - y               : Vector de etiquetas (dimensiones: m x 1)
    - theta_init      : Vector de pesos iniciales (dimensiones: n x 1)
    - learning_rate   : Tasa de aprendizaje para el algoritmo de optimización (por defecto: 0.01)
    - lambda_param    : Parámetro lambda de importancia relativa de la semivida en la función de pérdida (por defecto: 0.1)
    - alpha_param     : Parámetro de regularización L2 (por defecto: 0.01)
    - num_iterations  : Número de iteraciones para el algoritmo de optimización (por defecto: 1000)

    Output:
    - theta_optimized : Vector de pesos optimizados
    '''

    # Inicialización de variables
    
    m     = X.shape[ 1 ]   # n_rows
    n     = X.shape[ 0 ]   # n_columns
    theta = np.zeros( ( n, 1 ) ) # weights: matriz vacía. Coeficientes. 
    
    # theta = np.random.rand(X.shape[1])
    csg = np.zeros_like(theta)

    cost_list = []
    # Iteraciones de optimización
    for iteration in range(num_iterations):
        # Predicción de la semivida y probabilidad de recordar
        estimated_h = estimate_h_hat(theta, X)       
        predicted_p = estimate_p_hat(delta, estimated_h)

        # Cálculo de la pérdida y el gradiente
        loss     = hh_loss_function(y, predicted_p, estimated_h, delta, theta, regularization, lambda_param, alpha_param)
        gradient = gradient_partial(y, predicted_p, estimated_h, delta, X, theta, lambda_param, alpha_param)

        # Actualización de pesos con AdaGrad
        theta, csg = adagrad_update(theta, gradient, learning_rate, csg)

        # Mostrar la pérdida en cada 100 iteraciones
        if iteration % 100 == 0:
            print(f"Iteration {iteration}, Loss: {loss}")
            
        cost_list.append( loss )

    return theta, cost_list

## Primer intento con datos de Duolingo

In [143]:
data      = pd.read_csv( 'subset_1000.csv' )
pred_vars = [ 'right', 'wrong', 'bias', 't' ]

X_train, X_test, Y_train, Y_test = train_test_split( data[ pred_vars ], 
                                                     data[ 'p' ], 
                                                     test_size    = 0.30,
                                                     random_state = 2023 )

t_train = X_train[ 't' ].values
X_train = X_train.values
Y_train = Y_train.values
X_test = X_test.values
Y_test = Y_test.values

X_train = X_train.T
Y_train = Y_train.reshape( 1, X_train.shape[ 1 ] )
t_train = t_train.reshape( 1, X_train.shape[ 1 ] )

X_test = X_test.T
Y_test = Y_test.reshape( 1, X_test.shape[ 1 ] )

In [144]:
theta_optimized, cost_list = half_life_regression(X_train, Y_train, t_train, regularization = 'l2' )

Iteration 0, Loss: [[2.69441250e+08 7.37541200e+00 1.70376395e+01 9.46436424e+07
  2.66597746e+00 7.46281460e+00 1.07364127e+09 1.70844538e+06
  2.53931836e+08 6.87525405e+01 4.03974965e+05 5.20247570e+09
  1.48721321e+06 3.60024115e+05 1.69106448e+07 1.43552562e+02
  7.44384413e+05 1.22296973e+02 1.06474837e+07 3.09851748e+04
  4.29282259e+05 1.21306124e+07 2.85250853e+00 4.57752001e+07
  4.38341600e+07 3.67765055e+02 8.13474452e+00 1.89424062e+01
  6.57388697e+06 4.10413696e+00 1.08991573e+01 7.09760966e+05
  2.65508407e+00 1.37983563e+05 3.27345502e+00 1.34438432e+00
  1.00638487e+00 1.16662616e-01 8.29537605e-01 4.00346583e+08
  2.11423371e+02 1.01702545e+05 2.59114016e+01 8.12607173e-02
  2.44621820e+01 6.94612159e+00 3.63106314e+00 3.38637360e+06
  5.82654744e+04 6.50238241e+05 4.93104388e-01 1.45411296e-01
  4.42341576e+05 6.78389621e-02 3.16918566e+06 1.90915319e+06
  1.01138800e-02 8.43020637e+01 2.26710879e+01 6.69472540e+00
  5.40132636e+07 1.20449317e+09 1.56118272e+01 1.94

ValueError: operands could not be broadcast together with shapes (700,700) (4,700) 

In [None]:
def optimize_theta(D, alpha, lambda_, eta, theta, X, p, Delta, max_iter=1000):
    """
    Optimize the values of theta for the spaced repetition model with safeguards against overflow and underflow.

    Parameters:
    - D (int): Number of data instances.
    - alpha (float): Weight for the half-life term in the loss function.
    - lambda_ (float): Regularization parameter.
    - eta (float): Learning rate.
    - theta (np.array): Initial theta values.
    - X (np.array): Feature vectors for each data instance.
    - p (np.array): Observed recall rates.
    - Delta (np.array): Lag times since each item was last practiced.
    - max_iter (int): Maximum number of iterations for the optimization.

    Returns:
    - np.array: Optimized theta values.
    """
    for iteration in range(max_iter):
        grad_theta = np.zeros_like(theta)
        for t in range(D):
            # Safe computation of predicted half-life and probability
            theta_x = np.clip(theta.dot(X[t]), -10, 10)  # Prevent overflow in exponent
            h_hat = 2 ** theta_x
            p_hat = 2 ** np.clip(-Delta[t] / h_hat, -10, 10)  # Prevent underflow/overflow

            # Compute theoretical half-life
            if p[t] > 0:
                h = -Delta[t] / np.log2(p[t])
            else:
                h = 0  # Assign a default value when p is 0

            # Compute the gradients
            for k in range(len(theta)):
                term1 = 2 * (p[t] - p_hat) * np.log(2) * p_hat * (2 ** (-Delta[t] / h_hat)) * X[t][k]
                term2 = 2 * alpha * (h_hat + Delta[t] / np.log2(p[t])) * np.log(2) * h_hat * X[t][k]
                term3 = 2 * lambda_ * theta[k]
                grad_theta[k] += term1 + term2 + term3

        # Update theta
        theta -= eta * grad_theta / D  # Average gradient over all instances

    return theta

In [197]:
def optimize_theta_adagrad(D, alpha, lambda_, eta, theta, X, p, Delta, max_iter=1000, epsilon=1e-8):
    """
    Optimize the values of theta for the spaced repetition model using AdaGrad.

    Parameters:
    - D (int): Number of data instances.
    - alpha (float): Weight for the half-life term in the loss function.
    - lambda_ (float): Regularization parameter.
    - eta (float): Learning rate.
    - theta (np.array): Initial theta values.
    - X (np.array): Feature vectors for each data instance.
    - p (np.array): Observed recall rates.
    - Delta (np.array): Lag times since each item was last practiced.
    - max_iter (int): Maximum number of iterations for the optimization.
    - epsilon (float): Small constant to prevent division by zero.

    Returns:
    - np.array: Optimized theta values.
    """
    # Initialize gradient accumulation
    grad_accumulation = np.zeros_like(theta)

    for iteration in range(max_iter):
        grad_theta = np.zeros_like(theta)
        for t in range(D):
            # Safe computation of predicted half-life and probability
            theta_x = np.clip(theta.dot(X[t]), -10, 10)  # Prevent overflow in exponent
            h_hat = 2 ** theta_x
            p_hat = 2 ** np.clip(-Delta[t] / h_hat, -10, 10)  # Prevent underflow/overflow

            # Compute theoretical half-life
            if p[t] > 0:
                h = -Delta[t] / np.log2(p[t])
            else:
                h = 0  # Assign a default value when p is 0

            # Compute the gradients
            for k in range(len(theta)):
                term1 = 2 * (p[t] - p_hat) * np.log(2) * p_hat * (2 ** (-Delta[t] / h_hat)) * X[t][k]
                term2 = 2 * alpha * (h_hat + Delta[t] / np.log2(p[t])) * np.log(2) * h_hat * X[t][k]
                term3 = 2 * lambda_ * theta[k]
                grad_theta[k] += term1 + term2 + term3

        # Accumulate the square of gradients
        grad_accumulation += grad_theta ** 2

        # Update theta using AdaGrad adjustment
        adjusted_eta = eta / (np.sqrt(grad_accumulation) + epsilon)
        theta -= adjusted_eta * grad_theta / D

    return theta

In [205]:
# ESTA ES LA REAL FUNCION

def optimize_theta_adagrad(D, alpha, lambda_, eta, theta, X, p, Delta, max_iter=1000, epsilon=1e-8):
    """
    Optimize the values of theta for the spaced repetition model using AdaGrad,
    and calculate the cost function explicitly at each iteration.

    Parameters:
    - D (int): Number of data instances.
    - alpha (float): Weight for the half-life term in the loss function.
    - lambda_ (float): Regularization parameter.
    - eta (float): Learning rate.
    - theta (np.array): Initial theta values.
    - X (np.array): Feature vectors for each data instance.
    - p (np.array): Observed recall rates.
    - Delta (np.array): Lag times since each item was last practiced.
    - max_iter (int): Maximum number of iterations for the optimization.
    - epsilon (float): Small constant to prevent division by zero.

    Returns:
    - np.array: Optimized theta values.
    - list: Cost per iteration.
    """
    # Initialize gradient accumulation and cost tracking
    grad_accumulation = np.zeros_like(theta)
    cost_history = []

    for iteration in range(max_iter):
        grad_theta = np.zeros_like(theta)
        cost = 0

        for t in range(D):
            # Safe computation of predicted half-life and probability
            theta_x = np.clip(theta.dot(X[t]), -10, 10)  # Prevent overflow in exponent
            h_hat = 2 ** theta_x
            p_hat = 2 ** np.clip(-Delta[t] / h_hat, -10, 10)  # Prevent underflow/overflow

            # Compute theoretical half-life
            if p[t] > 0:
                h = -Delta[t] / np.log2(p[t])
            else:
                h = 0  # Assign a default value when p is 0

            # Update cost
            cost += (p[t] - p_hat) ** 2 + alpha * (h - h_hat) ** 2

            # Compute the gradients
            for k in range(len(theta)):
                term1 = 2 * (p[t] - p_hat) * np.log(2) * p_hat * (2 ** (-Delta[t] / h_hat)) * X[t][k]
                term2 = 2 * alpha * (h_hat + Delta[t] / np.log2(p[t])) * np.log(2) * h_hat * X[t][k]
                term3 = 2 * lambda_ * theta[k]
                grad_theta[k] += term1 + term2 + term3

        # Add regularization term to cost
        cost += lambda_ * np.sum(theta ** 2)
        cost_history.append(cost / D)  # Average cost per instance

        # Accumulate the square of gradients
        grad_accumulation += grad_theta ** 2

        # Update theta using AdaGrad adjustment
        adjusted_eta = eta / (np.sqrt(grad_accumulation) + epsilon)
        theta -= adjusted_eta * grad_theta / D

    return theta, cost_history

In [207]:
# Ejemplp pequeño

# Creating a sample DataFrame to use with the optimize_theta function
data = {
    "feature1": [0.5, 0.6, 0.7, 0.8, 0.9],
    "feature2": [0.1, 0.2, 0.3, 0.4, 0.5],
    "feature3": [0.2, 0.3, 0.4, 0.5, 0.6],
    "recall_rate": [0.9, 0.8, 0.7, 0.6, 0.5],
    "lag_time": [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)

# Converting DataFrame columns to numpy arrays
X = df[["feature1", "feature2", "feature3"]].values
p = df["recall_rate"].values
Delta = df["lag_time"].values

# Set the parameters for optimization
D = len(df)  # Number of data instances
alpha = 0.5
lambda_ = 0.1
eta = 0.01
theta = np.random.randn(3)  # Initial theta values for a 3-feature model

# Using the optimize_theta function
optimized_theta = optimize_theta(D, alpha, lambda_, eta, theta, X, p, Delta)

In [208]:
optimized_theta, costo_h = optimize_theta_adagrad(D, alpha, lambda_, eta, theta, X, p, Delta)
optimized_theta

array([ 3.12879734, -0.65156256,  0.41833509])

In [209]:
costo_h

[2.687350451498462,
 2.6860919635366534,
 2.685190738215561,
 2.6844529674772537,
 2.6838135541449026,
 2.683241595510166,
 2.6827195932191605,
 2.6822365122155922,
 2.68178486651717,
 2.6813593021584743,
 2.6809558333626287,
 2.680571398022611,
 2.6802035831752513,
 2.679850447216697,
 2.6795104002073695,
 2.679182120632304,
 2.678864495902597,
 2.678556578812707,
 2.678257555016295,
 2.6779667182930114,
 2.6776834514397745,
 2.6774072112978984,
 2.677137516871791,
 2.6768739397929737,
 2.676616096587156,
 2.676363642344311,
 2.6761162654925723,
 2.6758736834493866,
 2.675635638976315,
 2.675401897103128,
 2.675172242516113,
 2.6749464773277074,
 2.6747244191615236,
 2.6745058994998905,
 2.6742907622512093,
 2.674078862502392,
 2.6738700654279635,
 2.673664245332424,
 2.6734612848064945,
 2.6732610739811107,
 2.673063509865654,
 2.672868495759105,
 2.672675940724475,
 2.672485759118463,
 2.6722978701693436,
 2.6721121975971998,
 2.671928669271427,
 2.6717472169011165,
 2.6715677757545

In [201]:
data      = pd.read_csv( 'subset_1000.csv' )
pred_vars = [ 'right', 'wrong', 'bias', 't' ]

X_train, X_test, Y_train, Y_test = train_test_split( data[ pred_vars ], 
                                                     data[ 'p' ], 
                                                     test_size    = 0.30,
                                                     random_state = 2023 )

delta = X_train[ 't' ].values
X_train = X_train.drop( columns = [ 't' ] )
X_train = X_train.values
Y_train = Y_train.values
X_test = X_test.values
Y_test = Y_test.values

# X_train = X_train.T
Y_train = Y_train.reshape( 700, )
delta = delta.reshape( 700, )

# X_test = X_test.T
Y_test = Y_test.reshape( 300, )

In [202]:
delta.shape

(700,)

In [204]:
# Set the parameters for optimization
D = 700  # Number of data instances
alpha = 0.5
lambda_ = 0.1
eta = 0.01
theta = np.random.randn(3)  # Initial theta values for a 3-feature model

# Using the optimize_theta function
optimized_theta = optimize_theta_adagrad(D, alpha, lambda_, eta, theta, X_train, Y_train, delta )
optimized_theta

array([-1.95120453, -0.15090183,  0.58920047])