In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_train_test_split
from xgboost import XGBRegressor
import optuna
import shap

In [None]:
# Path to the cloned repository. Set the BANORTE_TEST_DIR environment variable
# to run this notebook from another checkout; the fallback is the location
# that was previously hard-coded here.
REPO_DIR = os.environ.get("BANORTE_TEST_DIR", "/home/cesar/Documentos/banorte_test")
os.chdir(REPO_DIR)
# Load the three '::'-delimited tables. A multi-character separator is only
# handled by pandas' python parsing engine, hence engine='python'. The files
# have no header row, so column names are supplied explicitly.
ratings = pd.read_csv('./data/ratings.dat', sep='::', engine='python', header=None,
                      names=['UserID','MovieID','Rating','Timestamp'])
users = pd.read_csv('./data/users.dat', sep='::', engine='python', header=None,
                      names=['UserID','Gender','Age','Occupation','Zip-code'])
# movies.dat is read as latin1
movies = pd.read_csv('./data/movies.dat', sep='::', engine='python', header=None, encoding='latin1',
                      names=['MovieID','Title','Genres'])

In [None]:
### Exploration
# Percentage of missing values per column, printed one table at a time
for tabla in (ratings, users, movies):
    print(tabla.isna().mean() * 100)

In [None]:
# Number of distinct zip codes in the users table
n_zip_codes = users['Zip-code'].nunique()
print(n_zip_codes)
# Drop repeated rows per key (first occurrence is kept). The table/key pairs
# are evaluated before any name is rebound, so each table is de-duplicated
# from its current contents.
ratings, users, movies = (
    tabla.drop_duplicates(subset=claves)
    for tabla, claves in (
        (ratings, ['UserID', 'MovieID']),
        (users, ['UserID']),
        (movies, ['MovieID']),
    )
)

In [None]:
# Build the working frame: every rating, left-joined with the attributes of
# its user and of its movie. Timestamp, Zip-code and Title are dropped before
# joining.
df = (
    ratings.drop(columns=['Timestamp'])
    .merge(users.drop(columns=['Zip-code']), on='UserID', how='left')
    .merge(movies.drop(columns=['Title']), on='MovieID', how='left')
)

In [None]:
# Expand the '|'-separated genre string into one column per position
# (genero_1, genero_2, ...); the number of columns follows the data.
generos = df['Genres'].str.split('|', expand=True)
generos.columns = [f'genero_{i+1}' for i in range(generos.shape[1])]
# Positions past a movie's last genre come back missing; label them explicitly
generos = generos.fillna('ninguno')
# Swap the raw Genres column for the expanded ones (column order is unchanged)
df = pd.concat([df.drop(columns=['Genres']), generos], axis=1)
# Store identifiers and coded demographics as text labels rather than numbers
df[['UserID','MovieID','Age','Occupation']] = df[['UserID','MovieID','Age','Occupation']].astype(str)

In [None]:
# Ratings per user
def rating_por_genero(user: str):
    """Plot and save one user's mean rating per first-listed genre.

    Filters the module-level ``df`` to the rows of ``user``, averages
    ``Rating`` within each ``genero_1`` value, draws the result as a bar
    chart, writes it to ``figures/rating.png`` and shows it.

    Args:
        user: User identifier. It is converted with ``str`` before the
            comparison because ``UserID`` is stored as text in ``df``.

    Raises:
        ValueError: If ``df`` contains no ratings for ``user``.
    """
    user_id = str(user)
    dat = df[df['UserID'] == user_id]
    if dat.empty:
        # Fail with a clear message instead of an opaque plotting error
        raise ValueError(f"No ratings found for user {user_id!r}")
    # Select the column explicitly rather than relying on agg('Rating')
    # resolving to an attribute of the groupby object.
    mean_rating = dat.groupby('genero_1')['Rating'].mean()
    plt.figure(figsize=(10, 6))
    mean_rating.plot(kind='bar', color='blue', edgecolor='black')
    plt.title('Rating promedio por género', fontsize=14)
    plt.xlabel('Género', fontsize=12)
    plt.ylabel('Rating promedio', fontsize=12)
    plt.xticks(rotation=90, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    # savefig does not create missing directories, so make sure it exists
    ruta_figura = 'figures/rating.png'
    os.makedirs(os.path.dirname(ruta_figura), exist_ok=True)
    plt.savefig(ruta_figura)
    # Show the chart
    plt.show()

# Example: draw and save the chart for the user whose ID is '2'
rating_por_genero('2')

In [None]:
### Forecasting model
df['Rating'] = df['Rating'].astype(float)
reader = Reader(rating_scale=(1, 5))  # Ratings go from 1 to 5
# SVD only captures the relationship between users and movies
data = Dataset.load_from_df(df[['UserID','MovieID','Rating']], reader)
# Only the 80% training part is used below; the held-out part is discarded
trainset, _ = surprise_train_test_split(data, test_size=0.2, random_state=42)
# Seed the model as well as the split: with an unseeded SVD the learned
# factors, and every prediction derived from them, change on each run.
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# Generar pronósticos para cada fila del DataFrame
def predecir_rating(row):
    """Return the fitted SVD model's estimated rating for one row.

    Args:
        row: Any object indexable by ``'UserID'`` and ``'MovieID'`` (for
            example a DataFrame row); both values are passed to
            ``svd_model.predict``.

    Returns:
        The ``est`` field of the prediction returned by ``svd_model``.
    """
    return svd_model.predict(row['UserID'], row['MovieID']).est

In [None]:
# Add the SVD estimate as a feature of the original frame. Iterating over the
# two id columns gives the same values as df.apply(predecir_rating, axis=1)
# without building a Series for every row.
df['svd_prediction'] = [
    predecir_rating({'UserID': uid, 'MovieID': mid})
    for uid, mid in zip(df['UserID'], df['MovieID'])
]
df.head(10)

In [None]:
# One-hot encoding
# Identifier columns are left out of the feature matrix
df_F = df.drop(columns=['UserID', 'MovieID'])
# Every column except the target and the SVD estimate gets one-hot encoded
categorical_columns = df_F.columns.drop(['Rating', 'svd_prediction'])

# dtype=int produces 0/1 integer indicators directly; with `columns` given,
# the default prefix of each indicator is already its source column name.
df_encoded = pd.get_dummies(df_F[categorical_columns], columns=categorical_columns, dtype=int)
# Final table: target and SVD estimate first, then the indicator columns
df_final = pd.concat([df_F[['Rating', 'svd_prediction']], df_encoded], axis=1)
# Note: the row index is written too, as the first (unnamed) CSV column
df_final.to_csv("./data/df_final.csv")