In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [7]:
# Load the raw sales sample into a DataFrame
table_01 = pd.read_csv(filepath_or_buffer='..//data//raw//dataset_sample_1.csv')

# Print the (rows, columns) shape, then preview the first rows as the cell output
print(table_01.shape)
table_01.head()

(231000, 7)


Unnamed: 0,UUID_CLIENTE_CONSUMIDOR,PEDIDO,FECHA_SOLUCION,COD_PRODUCTO,CATEGORIA,UNIDADES_BRUTAS,VENTA_BRUTA_CON_IVA
0,5F333C92C61098CC840A180313615250,39562883,2023-09-28,26605,Jabones,1,10043.0
1,323C3C3B1404F866097F000001615250,39758414,2023-10-14,28308,Cuidado Capilar F,1,23859.0
2,6C2FA988251C4F35BD0A180313615250,39107912,2023-08-23,23610,Carnes Frias,1,20152.0
3,37908B3B6309B1549E7F000001615250,39197173,2023-08-30,18460,Cuidado Capilar F,1,21500.0
4,85C5EF6E09B085614D0A180327615250,36658943,2023-01-18,28485,Cuidado Oral,1,10100.0


In [4]:
# Keep only the 1000 product codes that appear in the most rows
top_products = table_01['COD_PRODUCTO'].value_counts().index[:1000]
table_01 = table_01.loc[table_01['COD_PRODUCTO'].isin(top_products)]

# Keep only customers with more than 5 remaining rows
# (counted after the product filter above, so the order of the two steps matters)
user_counts = table_01['UUID_CLIENTE_CONSUMIDOR'].value_counts()
active_users = user_counts.loc[user_counts.gt(5)].index
table_01 = table_01.loc[table_01['UUID_CLIENTE_CONSUMIDOR'].isin(active_users)]

print(table_01.shape)
table_01.head()

(111004, 7)


Unnamed: 0,UUID_CLIENTE_CONSUMIDOR,PEDIDO,FECHA_SOLUCION,COD_PRODUCTO,CATEGORIA,UNIDADES_BRUTAS,VENTA_BRUTA_CON_IVA
0,5F333C92C61098CC840A180313615250,39562883,2023-09-28,26605,Jabones,1,10043.0
1,323C3C3B1404F866097F000001615250,39758414,2023-10-14,28308,Cuidado Capilar F,1,23859.0
12,86E0CE2C200BF0C2A10A180327615250,38267249,2023-06-08,32830,Maquillaje,1,19000.0
14,5E5F010D2A5E54EE990A180313615250,39349457,2023-09-11,96954,Colonias,1,51578.0
16,7F31482E2C4CF149860A180326615250,39621969,2023-10-04,36317,Cuidado Capilar P,1,14191.0


In [5]:
# Build the user x product matrix: each cell is the mean VENTA_BRUTA_CON_IVA
# for that (customer, product) pair; pairs with no rows are filled with 0.
pivot = pd.pivot_table(
    table_01,
    values='VENTA_BRUTA_CON_IVA',
    index='UUID_CLIENTE_CONSUMIDOR',
    columns='COD_PRODUCTO',
    aggfunc='mean',
    fill_value=0,
)

print(pivot.shape)
pivot.head()

(8278, 1000)


COD_PRODUCTO,2,10,19,24,25,64,65,70,82,87,...,97722,97800,97807,97811,97812,97819,97820,97826,97902,97943
UUID_CLIENTE_CONSUMIDOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3232AECD7274E2EDF37F000001615250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11520.0,0.0,13376.0,0.0,0.0,0.0
3232B5702778C5B6947F000001615250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3232B5AB3F718C6AF77F000001615250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3232BC55E3327C718F7F000001615250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14310.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3232CEBED8451AEC547F000001615250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Train the NMF model: factorise the user x product matrix into 10 latent components

nmf = NMF(
    n_components=10,
    max_iter=150,
    random_state=42,  # fixed seed so the factorisation is reproducible
)
user_factors = nmf.fit_transform(pivot)  # one row of component weights per pivot row
item_factors = nmf.components_           # one column of component weights per pivot column
(user_factors.shape, item_factors.shape)

((8278, 10), (10, 1000))

In [7]:
# Rebuild the predicted score matrix as (user factors) x (item factors),
# then label it with the same customers and product codes as `pivot`.
pred_matrix = user_factors @ item_factors
pred_df = pd.DataFrame(
    data=pred_matrix,
    index=pivot.index,
    columns=pivot.columns,
)
pred_df.head()

COD_PRODUCTO,2,10,19,24,25,64,65,70,82,87,...,97722,97800,97807,97811,97812,97819,97820,97826,97902,97943
UUID_CLIENTE_CONSUMIDOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3232AECD7274E2EDF37F000001615250,4.157549,1.006589,0.164496,1234.519114,1.140988,26.822805,42.479288,3.37902,0.613124,6.562646,...,0.100818,0.567287,0.705889,3.814528,0.560297,4.410803,1.527425,0.202912,5.317005,0.190685
3232B5702778C5B6947F000001615250,0.121099,0.015265,0.001988,6.475864,0.025116,0.146827,0.228171,0.01872,0.012368,0.036516,...,0.000894,0.01009,0.060879,0.067929,0.0073,0.039385,0.028136,0.003795,0.02984,0.004365
3232B5AB3F718C6AF77F000001615250,0.049581,0.007385,0.001191,6.26473,0.010975,0.111577,0.221522,0.016695,0.005801,0.029392,...,0.000611,0.004835,0.020824,0.032153,0.003983,0.029523,0.013576,0.001795,0.0226,0.001946
3232BC55E3327C718F7F000001615250,0.332508,0.245831,0.02571,42.437525,0.25892,1.387385,1.783947,0.130025,0.199155,0.292032,...,0.018637,0.121534,0.140255,0.769418,0.103675,0.688144,0.332105,0.063896,0.308894,0.060176
3232CEBED8451AEC547F000001615250,2.38555,0.735908,0.052346,484.800878,0.690575,23.867578,14.132662,1.722907,0.32102,4.764542,...,0.035253,0.322402,0.568613,2.189243,0.279569,0.437207,0.775594,0.109464,4.498984,0.088661


In [8]:
# Generate personalised recommendations

def recomendar_productos(usuario, n=5):
    """Return the ``n`` highest-scored products the user has no recorded purchase of.

    Reads two notebook-level DataFrames: ``pred_df`` (predicted scores) and
    ``pivot`` (observed user x product matrix). A product is treated as
    "not purchased" when the user's ``pivot`` entry for it equals 0.

    Parameters
    ----------
    usuario
        Row label of the customer in both ``pred_df`` and ``pivot``;
        an unknown label raises ``KeyError`` from the ``.loc`` lookup.
    n : int, default 5
        Number of products to return.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by product code, sorted from highest to lowest.
    """
    puntuaciones = pred_df.loc[usuario]
    # Boolean mask: True where the observed matrix holds 0 for this user
    sin_compra = pivot.loc[usuario].eq(0)
    candidatos = puntuaciones[sin_compra]
    ranking = candidatos.sort_values(ascending=False)
    return ranking.head(n)


In [None]:
# Top-5 recommendations for one sample customer; the values are the predicted affinity scores
cod_products = recomendar_productos(usuario='5F333C92C61098CC840A180313615250', n=5)
cod_products

COD_PRODUCTO
18410    5713.780415
18409    5324.149910
19347    3889.229032
96952    1135.773499
23756     966.515210
Name: 5F333C92C61098CC840A180313615250, dtype: float64

In [21]:
# Look up the category of each recommended product (one row per product code)
(
    table_01
    .loc[table_01['COD_PRODUCTO'].isin(cod_products.index), ['COD_PRODUCTO', 'CATEGORIA']]
    .drop_duplicates(subset=["COD_PRODUCTO"])
)

Unnamed: 0,COD_PRODUCTO,CATEGORIA
42,18409,Colonias
283,23756,Golosinas
802,19347,Cuidado Capilar F
3025,96952,Colonias
5572,18410,Colonias


In [22]:
import joblib

# Save the fitted model, the pivot labels and the full prediction matrix in one file.
# The call stays last in the cell so the list of written files is displayed.
joblib.dump(
    dict(
        nmf=nmf,
        pivot_columns=pivot.columns,
        pivot_index=pivot.index,
        pred_df=pred_df,
    ),
    "..//models//recommender_full.pkl",
)

['..//models//recommender_full.pkl']

In [4]:
# Load the saved artefacts.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
import joblib

data = joblib.load("..//models//recommender_full.pkl")
# Only the model and the prediction matrix are restored here; `pivot` is not rebuilt,
# although the saved 'pivot_columns' and 'pivot_index' labels are available in `data`.
nmf, pred_df = data['nmf'], data['pred_df']

In [10]:
# Precision@K

# Map each customer to the set of product codes that appear in their rows of table_01
actuals = {
    cliente: set(codigos)
    for cliente, codigos in table_01.groupby('UUID_CLIENTE_CONSUMIDOR')['COD_PRODUCTO']
}

In [12]:
# Precision@K metric

def precision_at_k(pred_df, actuals, k=5):
    """Mean Precision@K over the users present in both ``actuals`` and ``pred_df``.

    For every user, the ``k`` highest-scored columns of that user's row in
    ``pred_df`` are taken as the recommendations. The user's precision is the
    number of those columns found in the user's ``actuals`` entry, divided by
    ``k`` (always by ``k``, even if ``pred_df`` has fewer than ``k`` columns).

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Score matrix with one row per user and one column per item.
    actuals : dict
        Maps each user to an iterable of the item ids that user really has.
        Users missing from ``pred_df.index`` are skipped.
    k : int, default 5
        Number of top-scored items evaluated per user; must be at least 1.

    Returns
    -------
    float
        Average of the per-user precisions, or NaN when no user in
        ``actuals`` is present in ``pred_df``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # k == 0 would divide by zero and a negative k would make head() drop rows
    # from the end instead of selecting the top ones, so reject both up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    precisions = []
    for user, true_items in actuals.items():
        if user not in pred_df.index:
            continue
        # Top-k recommended items for this user
        top_k = pred_df.loc[user].sort_values(ascending=False).head(k).index
        # Number of recommended items the user really has
        hits = len(set(top_k).intersection(true_items))
        precisions.append(hits / k)

    # np.mean([]) would emit a RuntimeWarning; return the undefined result explicitly.
    if not precisions:
        return float("nan")
    return np.mean(precisions)


In [13]:
# Report Precision@K for several cut-off sizes
for k in (5, 10, 20):
    p = precision_at_k(pred_df, actuals, k)
    print(f"Precision@{k}: {p:.4f}")

Precision@5: 0.1134
Precision@10: 0.0748
Precision@20: 0.0562


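Precision@5 = 0.1134 significa que, en promedio, el 11% de las 5 recomendaciones con mayor puntuación estaban entre los productos que el usuario realmente compró. Como las recomendaciones se evalúan contra las mismas compras con las que se entrenó el modelo (no hay un conjunto de prueba separado), este valor mide el ajuste dentro de la muestra y no la capacidad de predecir compras futuras.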