In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/Groceries_dataset.csv')
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


Antes de realizar el análisis de la cesta de la compra, necesitamos convertir estos datos a un formato que el algoritmo Apriori pueda procesar fácilmente. En otras palabras, necesitamos convertirlos en una estructura tabular compuesta por unos y ceros

In [3]:
df["single_transaction"] = df["Member_number"].astype(str)+'_'+df["Date"].astype(str)
df.head()

Unnamed: 0,Member_number,Date,itemDescription,single_transaction
0,1808,21-07-2015,tropical fruit,1808_21-07-2015
1,2552,05-01-2015,whole milk,2552_05-01-2015
2,2300,19-09-2015,pip fruit,2300_19-09-2015
3,1187,12-12-2015,other vegetables,1187_12-12-2015
4,3037,01-02-2015,whole milk,3037_01-02-2015


In [4]:
df2 = pd.crosstab(df['single_transaction'], df['itemDescription'])
df2.head()

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
single_transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000_15-03-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1000_24-06-2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000_24-07-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000_25-11-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000_27-05-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


La tabla resultante nos indica cuántas veces se ha comprado cada artículo en una transacción

Hay más de cien columnas, aunque la mayoría de las personas solo compran dos o tres artículos, por lo que esta tabla es escasa y está compuesta principalmente de ceros.

El paso final de preprocesamiento de datos implica codificar todos los valores en el marco de datos anterior en 0 y 1.

Esto significa que incluso si hay varios ejemplares del mismo artículo en la misma transacción, el valor se codificará en 1, ya que el análisis de la cesta de compra no tiene en cuenta la frecuencia de compra.

In [5]:
def encode(item_freq):
    res = 0
    if item_freq > 0:
        res = 1
    return res
    
basket_input = df2.applymap(encode)

In [6]:
basket_input.value_counts

<bound method DataFrame.value_counts of itemDescription     Instant food products  UHT-milk  abrasive cleaner  \
single_transaction                                                      
1000_15-03-2015                         0         0                 0   
1000_24-06-2014                         0         0                 0   
1000_24-07-2015                         0         0                 0   
1000_25-11-2015                         0         0                 0   
1000_27-05-2015                         0         0                 0   
...                                   ...       ...               ...   
4999_24-01-2015                         0         0                 0   
4999_26-12-2015                         0         0                 0   
5000_09-03-2014                         0         0                 0   
5000_10-02-2015                         0         0                 0   
5000_16-11-2014                         0         0                 0   

itemDescri

### Construir el algoritmo Apriori para el análisis de la cesta de compra

In [7]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [8]:
frequent_itemsets = apriori(basket_input, min_support=0.001, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift")
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(bottled water),(UHT-milk),0.060683,0.021386,0.001069,0.017621,0.823954,1.0,-0.000228,0.996168,-0.185312,0.013201,-0.003847,0.033811
1,(UHT-milk),(bottled water),0.021386,0.060683,0.001069,0.05,0.823954,1.0,-0.000228,0.988755,-0.179204,0.013201,-0.011373,0.033811
2,(UHT-milk),(other vegetables),0.021386,0.122101,0.002139,0.1,0.818993,1.0,-0.000473,0.975443,-0.184234,0.01513,-0.025175,0.058758
3,(other vegetables),(UHT-milk),0.122101,0.021386,0.002139,0.017515,0.818993,1.0,-0.000473,0.99606,-0.201119,0.01513,-0.003956,0.058758
4,(sausage),(UHT-milk),0.060349,0.021386,0.001136,0.018826,0.880298,1.0,-0.000154,0.997391,-0.126418,0.014096,-0.002616,0.035976


Aquí, las columnas “antecedentes” y “consecuentes” muestran los artículos que frecuentemente se compran juntos.

In [9]:
rules.sort_values(["support", "confidence","lift"],axis = 0, ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
622,(rolls/buns),(whole milk),0.110005,0.157923,0.013968,0.126974,0.804028,1.0,-0.003404,0.964550,-0.214986,0.055000,-0.036752,0.107711
623,(whole milk),(rolls/buns),0.157923,0.110005,0.013968,0.088447,0.804028,1.0,-0.003404,0.976350,-0.224474,0.055000,-0.024222,0.107711
694,(yogurt),(whole milk),0.085879,0.157923,0.011161,0.129961,0.822940,1.0,-0.002401,0.967861,-0.190525,0.047975,-0.033206,0.100317
695,(whole milk),(yogurt),0.157923,0.085879,0.011161,0.070673,0.822940,1.0,-0.002401,0.983638,-0.203508,0.047975,-0.016634,0.100317
550,(soda),(other vegetables),0.097106,0.122101,0.009691,0.099794,0.817302,1.0,-0.002166,0.975219,-0.198448,0.046252,-0.025410,0.089579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,(rolls/buns),(detergent),0.110005,0.008621,0.001002,0.009113,1.057037,1.0,0.000054,1.000496,0.060629,0.008523,0.000496,0.062696
616,(rolls/buns),(soft cheese),0.110005,0.010025,0.001002,0.009113,0.909052,1.0,-0.000100,0.999080,-0.101053,0.008422,-0.000921,0.054557
536,(other vegetables),(pot plants),0.122101,0.007819,0.001002,0.008210,1.049991,1.0,0.000048,1.000394,0.054233,0.007776,0.000394,0.068208
545,(other vegetables),(semi-finished bread),0.122101,0.009490,0.001002,0.008210,0.865133,1.0,-0.000156,0.998710,-0.150796,0.007677,-0.001292,0.056922


La tabla resultante muestra que las cuatro combinaciones de productos más populares que se compran frecuentemente juntas son:

    Panecillos y leche
    Yogur y leche
    Embutidos y leche
    Refresco y verduras

Una razón para esto podría ser que la tienda de comestibles realizó una promoción de estos artículos juntos o los exhibió en la misma línea de visión para mejorar las ventas.