In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Carga de datos con pandas
table_01 = pd.read_csv('..//data//raw//dataset_sample_1.csv')

print(table_01.shape)
table_01.head()

(231000, 7)


Unnamed: 0,UUID_CLIENTE_CONSUMIDOR,PEDIDO,FECHA_SOLUCION,COD_PRODUCTO,CATEGORIA,UNIDADES_BRUTAS,VENTA_BRUTA_CON_IVA
0,5F333C92C61098CC840A180313615250,39562883,2023-09-28,26605,Jabones,1,10043.0
1,323C3C3B1404F866097F000001615250,39758414,2023-10-14,28308,Cuidado Capilar F,1,23859.0
2,6C2FA988251C4F35BD0A180313615250,39107912,2023-08-23,23610,Carnes Frias,1,20152.0
3,37908B3B6309B1549E7F000001615250,39197173,2023-08-30,18460,Cuidado Capilar F,1,21500.0
4,85C5EF6E09B085614D0A180327615250,36658943,2023-01-18,28485,Cuidado Oral,1,10100.0


In [3]:
# Mantener solo top 1000 productos más vendidos
top_products = table_01['COD_PRODUCTO'].value_counts().head(1000).index
table_01 = table_01[table_01['COD_PRODUCTO'].isin(top_products)]

# Mantener solo clientes con más de 5 compras
user_counts = table_01['UUID_CLIENTE_CONSUMIDOR'].value_counts()
active_users = user_counts[user_counts > 5].index
table_01 = table_01[table_01['UUID_CLIENTE_CONSUMIDOR'].isin(active_users)]

print(table_01.shape)
table_01.head()

(111004, 7)


Unnamed: 0,UUID_CLIENTE_CONSUMIDOR,PEDIDO,FECHA_SOLUCION,COD_PRODUCTO,CATEGORIA,UNIDADES_BRUTAS,VENTA_BRUTA_CON_IVA
0,5F333C92C61098CC840A180313615250,39562883,2023-09-28,26605,Jabones,1,10043.0
1,323C3C3B1404F866097F000001615250,39758414,2023-10-14,28308,Cuidado Capilar F,1,23859.0
12,86E0CE2C200BF0C2A10A180327615250,38267249,2023-06-08,32830,Maquillaje,1,19000.0
14,5E5F010D2A5E54EE990A180313615250,39349457,2023-09-11,96954,Colonias,1,51578.0
16,7F31482E2C4CF149860A180326615250,39621969,2023-10-04,36317,Cuidado Capilar P,1,14191.0


In [13]:
basket = (
    table_01.groupby(['PEDIDO', 'CATEGORIA'])['UNIDADES_BRUTAS']
    .sum().unstack().fillna(0)
)

# Convertir a binario (1 = comprado, 0 = no comprado)
basket = basket.map(lambda x: 1 if x > 0 else 0)

print(basket.shape)
basket.head()

(40716, 52)


CATEGORIA,Aceites,Afeitado,Alimentos Larga Vida Y Conservas,Aromaterapia,Aseo Del Hogar,Aseo Mascotas,Baby Food,Bebidas Listas Para Consumir,Bebidas Para Preparar,Bolsos,...,Productos Congelados,Protección Solar F,Protección Solar P,Proteína Vegetal,Quesos,Reposteria,"Sal, Salsas, Aderezos Y Especias",Snacks Y Frutos Secos,Styling,"Vitaminas, Minerales Y Suplementos"
PEDIDO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36256229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36256288,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
36256416,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
36256666,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36256681,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
frequent_itemsets = apriori(basket, min_support=0.005, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5).sort_values('confidence', ascending=False)

frequent_itemsets.shape, rules.shape



((199, 2), (424, 14))

In [18]:
rules[['antecedents', 'consequents', 'confidence']].head(20)

Unnamed: 0,antecedents,consequents,confidence
413,"(Esparcibles, Golosinas)",(Galletas),0.524313
420,"(Golosinas, Snacks Y Frutos Secos)",(Galletas),0.509214
282,"(Golosinas, Alimentos Larga Vida Y Conservas)",(Galletas),0.504216
346,"(Esparcibles, Bebidas Para Preparar)",(Galletas),0.503467
311,"(Bebidas Listas Para Consumir, Golosinas)",(Galletas),0.501048
364,"(Bebidas Para Preparar, Pastas)",(Galletas),0.49916
353,"(Bebidas Para Preparar, Golosinas)",(Galletas),0.497358
408,"(Golosinas, Cuidado Oral)",(Galletas),0.495652
395,"(Carnes Frias, Snacks Y Frutos Secos)",(Galletas),0.494432
299,"(Bebidas Para Preparar, Bebidas Listas Para Co...",(Galletas),0.492519


In [19]:
rules[['antecedents', 'consequents', 'confidence']].head(20).to_csv("..//data//processed//mba.csv", index=False)