# **Importamos librerías**

In [None]:

! pip install git+https://github.com/maciejkula/spotlight.git

!pip install optuna

Collecting git+https://github.com/maciejkula/spotlight.git
  Cloning https://github.com/maciejkula/spotlight.git to /tmp/pip-req-build-fl2fwkl4
  Running command git clone --filter=blob:none --quiet https://github.com/maciejkula/spotlight.git /tmp/pip-req-build-fl2fwkl4
  Resolved https://github.com/maciejkula/spotlight.git to commit 75f4c8c55090771b52b88ef1a00f75bb39f9f2a9
  Preparing metadata (setup.py) ... [?25ldone


In [None]:

import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from google.colab import drive
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import precision_recall_score
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions

# **Extraemos la data**

In [None]:
# Mount Google Drive into the runtime's file system.
drive.mount('/content/drive')

# Folder holding the project's data files (path is relative to the working directory).
dir_data_kaggle = './drive/MyDrive/alicorp/data'

# Read df_user.csv from that folder into a DataFrame.
df_user = pd.read_csv(f'{dir_data_kaggle}/df_user.csv')


Downloading...
From (uriginal): https://drive.google.com/uc?id=1kVpYajCPZva4UAWa90p4Q2JHOx6RzN_-
From (redirected): https://drive.google.com/uc?id=1kVpYajCPZva4UAWa90p4Q2JHOx6RzN_-&confirm=t&uuid=c9b0391f-7826-482c-a82a-a825cdea1dc8
To: /kaggle/working/df_user.csv
100%|█████████████████████████████████████████| 163M/163M [00:00<00:00, 166MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-9Bjyvc9vzzcQGJvPS3YoZbhF9EFzPmg
To: /kaggle/working/df_product.csv
100%|██████████████████████████████████████| 31.1k/31.1k [00:00<00:00, 81.0MB/s]


# **Preliminares sobre la data**

In [None]:
# Seed for the hold-out split, so re-running the notebook reproduces the same partition.
SPLIT_SEED = 42

# Number of rows in the interactions table.
len_data = len(df_user)

# One uniform draw per row: draws below 0.8 go to the cross-validation set,
# the remaining rows form the hold-out validation set.
rand_mask = np.random.default_rng(SPLIT_SEED).random(len_data)
cross_mask = rand_mask < 0.8
val_mask = rand_mask >= 0.8

# Id columns as plain arrays.
# assumes both columns already hold non-negative integer ids — TODO confirm
all_user_ids = df_user['customer_id'].to_numpy()
all_product_ids = df_user['product_id'].to_numpy()

cross_user_array = all_user_ids[cross_mask]
cross_product_array = all_product_ids[cross_mask]
val_user_array = all_user_ids[val_mask]
val_product_array = all_product_ids[val_mask]

# Size both matrices from the FULL dataset. Without explicit sizes each
# Interactions object derives its own dimensions from the ids it receives, so the
# two matrices could disagree and a model fitted on one could not score the other.
total_users = int(all_user_ids.max()) + 1
total_items = int(all_product_ids.max()) + 1

# Interactions used for hyperparameter search / training.
cross_interaction_matrix = Interactions(user_ids=cross_user_array,
                                        item_ids=cross_product_array,
                                        num_users=total_users,
                                        num_items=total_items)

# Interactions held out for the final evaluation.
val_interaction_matrix = Interactions(user_ids=val_user_array,
                                      item_ids=val_product_array,
                                      num_users=total_users,
                                      num_items=total_items)



# **Construimos la función objetivo**

In [None]:
def objective(trial, split_seed=42):
    """Optuna objective: mean recall@30 of an implicit factorization model.

    Samples one hyperparameter configuration from ``trial``, splits
    ``cross_interaction_matrix`` into train/test (80/20), fits an
    ``ImplicitFactorizationModel`` with the ``adaptive_hinge`` loss on the train
    part and scores it on the test part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.
    split_seed : int, optional
        Seed for the train/test split. Every trial uses the same split, so
        differences between trials come from the hyperparameters and the
        training itself rather than from which rows were held out.

    Returns
    -------
    float
        Mean of the per-user recall at k=30 on the test part.
    """
    # Search space.
    embedding_dim = trial.suggest_int('embedding_dim', 8, 256, step=8)
    n_iter = trial.suggest_int('n_iter', 5, 100, step=5)
    batch_size = trial.suggest_int('batch_size', 32, 512, step=32)
    # Log-uniform ranges; note that 10e-24 == 1e-23 and 10e-1 == 1.0.
    l2 = trial.suggest_float('l2', 10e-24, 10e-8, log=True)
    learning_rate = trial.suggest_float('learning_rate', 10e-8, 10e-1, log=True)
    num_negative_samples = trial.suggest_int('num_negative_samples', 5, 50, step=5)

    # Split the cross-validation interactions with a fixed seed.
    cross_train, cross_test = random_train_test_split(
        interactions=cross_interaction_matrix,
        test_percentage=0.2,
        random_state=np.random.RandomState(split_seed),
    )

    # Build the model for this configuration (use_cuda=True needs a GPU runtime).
    model = ImplicitFactorizationModel(
        loss='adaptive_hinge',
        embedding_dim=embedding_dim,
        n_iter=n_iter,
        batch_size=batch_size,
        l2=l2,
        learning_rate=learning_rate,
        num_negative_samples=num_negative_samples,
        use_cuda=True,
    )

    # Fit on the train part.
    model.fit(interactions=cross_train, verbose=False)

    # precision_recall_score returns (precision, recall); index 1 is the recall array.
    cross_recall = precision_recall_score(model, cross_test, k=30)[1]

    return cross_recall.mean()


# **Iniciamos el proceso de búsqueda de parámetros**

In [None]:
# Starting point for the CMA-ES sampler, saved by an earlier run.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files that this project itself produced.
x0_params = joblib.load(dir_data_kaggle+'/alicorp_params_x0.sav')

# Create the study. The sampler is seeded so the sequence of suggested
# configurations is repeatable for a given history of trial results.
study_alicorp = optuna.create_study(
    study_name='opt_alicorp',
    direction='maximize',
    sampler=optuna.samplers.CmaEsSampler(x0=x0_params, seed=42),
)

# Run the search: stop after 75 trials or 43200 s (12 h), whichever comes first.
study_alicorp.optimize(
    func=objective,
    n_trials=75,
    timeout=43200,
    gc_after_trial=True,
    show_progress_bar=True,
)




[I 2023-10-21 04:49:26,321] A new study created in memory with name: opt_alicorp


  0%|          | 0/75 [00:00<?, ?it/s]

[I 2023-10-21 04:56:18,730] Trial 0 finished with value: 0.7090775217156632 and parameters: {'embedding_dim': 136, 'n_iter': 35, 'batch_size': 416, 'l2': 1.7185945209483771e-10, 'learning_rate': 5.298852952719548e-05, 'num_negative_samples': 35}. Best is trial 0 with value: 0.7090775217156632.
[I 2023-10-21 05:01:35,390] Trial 1 finished with value: 0.7185679244448173 and parameters: {'embedding_dim': 120, 'n_iter': 20, 'batch_size': 192, 'l2': 9.706059694479411e-11, 'learning_rate': 0.0020335667317380934, 'num_negative_samples': 15}. Best is trial 1 with value: 0.7185679244448173.
[I 2023-10-21 05:06:21,846] Trial 2 finished with value: 0.6101113493986847 and parameters: {'embedding_dim': 48, 'n_iter': 20, 'batch_size': 192, 'l2': 1.5413625068704725e-12, 'learning_rate': 9.037203487990745e-06, 'num_negative_samples': 15}. Best is trial 1 with value: 0.7185679244448173.
[I 2023-10-21 05:13:13,843] Trial 3 finished with value: 0.6638923448442501 and parameters: {'embedding_dim': 24, 'n_

# **Entrenamos y Evaluamos el modelo**

In [None]:
# Show the best objective value found and the hyperparameters that produced it.
best_trial = study_alicorp.best_trial
print(best_trial.value)
print(best_trial.params)

0.752738937972121
{'embedding_dim': 144, 'n_iter': 10, 'batch_size': 128, 'l2': 3.6782894638223925e-14, 'learning_rate': 0.00019739783297380233, 'num_negative_samples': 10}


In [None]:
 # definimos el modelo

# Best hyperparameters found by the study; their keys are exactly the model's
# keyword arguments (embedding_dim, n_iter, batch_size, l2, learning_rate,
# num_negative_samples).
best_params = study_alicorp.best_params

model = ImplicitFactorizationModel(
    loss='adaptive_hinge',
    use_cuda=True,  # needs a GPU runtime
    **best_params,
)

# Train on the cross-validation interactions ONLY. Fitting on
# val_interaction_matrix and then scoring on that same matrix would report an
# in-sample recall instead of a hold-out one.
# The training matrix is sized to cover every id present in either split, so the
# fitted model can score all users/items of the hold-out set.
final_train_interactions = Interactions(
    user_ids=cross_user_array,
    item_ids=cross_product_array,
    num_users=max(cross_interaction_matrix.num_users,
                  val_interaction_matrix.num_users),
    num_items=max(cross_interaction_matrix.num_items,
                  val_interaction_matrix.num_items),
)

model.fit(interactions=final_train_interactions, verbose=True)

Epoch 0: loss 0.919125403704352
Epoch 1: loss 0.7428832556266425
Epoch 2: loss 0.6817310158395247
Epoch 3: loss 0.6488860024748808
Epoch 4: loss 0.6225375763150787
Epoch 5: loss 0.5989675943779009
Epoch 6: loss 0.5780077375785971
Epoch 7: loss 0.5593672758553646
Epoch 8: loss 0.5406348689286959
Epoch 9: loss 0.5232980774897465


In [None]:
# Score the fitted model on the validation interactions at k=30.
# precision_recall_score returns a (precision, recall) pair; only recall is reported.
val_precision, val_recall = precision_recall_score(model, val_interaction_matrix, k=30)

mean_val_recall = np.round(val_recall.mean(), 4)
print(f'val_recall={mean_val_recall}')

val_recall=0.7919


In [None]:
# Persist the best hyperparameters of the study to the data folder.
params = study_alicorp.best_params
params_path = f'{dir_data_kaggle}/alicorp_params_x1.sav'

joblib.dump(value=params, filename=params_path)