# 0.0. IMPORTS 

In [89]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import seaborn           as sns 

from mlxtend.preprocessing     import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

## 0.1. Helper Functions

In [55]:
def removeAllzero( _list, value ):
    """Return a new list with every element of ``_list`` that is not equal to ``value``.

    Despite the name, any ``value`` can be filtered out, not only zero.
    The input list is left untouched.
    """
    return [ item for item in _list if item != value ]

## 0.2. Loading Data

In [33]:
# Load the groceries dataset from the CSV file.
df_raw = pd.read_csv( filepath_or_buffer = 'groceries - groceries.csv' )

# Preview the first rows.
df_raw.head( n = 5 )

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


# 1.0. DESCRIBE DATA

In [34]:
# Work on a copy so the raw dataset stays untouched.
df1 = df_raw.copy( deep = True )

## 1.1. Data Dimensions 

In [35]:
# Check the dataset dimensions as a (rows, columns) tuple.
df1.shape

(9835, 33)

## 1.2. Number of NA

In [36]:
# Count the missing (NaN) values in each column.
df1.isna().sum( axis = 0 )

Item(s)       0
Item 1        0
Item 2     2159
Item 3     3802
Item 4     5101
Item 5     6106
Item 6     6961
Item 7     7606
Item 8     8151
Item 9     8589
Item 10    8939
Item 11    9185
Item 12    9367
Item 13    9484
Item 14    9562
Item 15    9639
Item 16    9694
Item 17    9740
Item 18    9769
Item 19    9783
Item 20    9797
Item 21    9806
Item 22    9817
Item 23    9821
Item 24    9827
Item 25    9828
Item 26    9828
Item 27    9829
Item 28    9830
Item 29    9831
Item 30    9834
Item 31    9834
Item 32    9834
dtype: int64

## 1.3. Fill NA

In [82]:
# Turn the NaN cells into 0 so the empty slots are easier to handle later on.
df1.replace( to_replace = np.nan, value = 0, inplace = True )
# Discard the 'Item(s)' column (it appears to hold the item count per row, not an item).
df1.drop( columns = ['Item(s)'], inplace = True )
df1.head( n = 5 )



Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,Item 10,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,citrus fruit,semi-finished bread,margarine,ready soups,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tropical fruit,yogurt,coffee,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,whole milk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,pip fruit,yogurt,cream cheese,meat spreads,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,other vegetables,whole milk,condensed milk,long life bakery product,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 2.0. DATA PREPARATION

In [86]:
# Working copy for the data-preparation section.
df2 = df1.copy( deep = True )

In [83]:
# Convert the dataset into a list of lists (one inner list per transaction) so it can be
# fed to the encoder / apriori steps below.
# Each row becomes a plain Python list and the 0 placeholders are stripped from it.
# The rows come from df2, the working copy created for this section; `.values.tolist()`
# yields the same row lists as `iterrows()` without building a Series for every row.
list_all = [ removeAllzero( row, 0 ) for row in df2.values.tolist() ]


# Show the first transaction.
print( list_all[0] )

['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups']


In [85]:
# Turn the transaction lists back into a DataFrame: every distinct item becomes
# a boolean column telling whether the transaction contains it.
te = TransactionEncoder()
te.fit( list_all )
te_ary = te.transform( list_all )
df = pd.DataFrame( data = te_ary, columns = te.columns_ )

df.head( n = 5 )

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [93]:
# Run apriori with a minimum support of 0.01, labelling itemsets with the column names.
frequent_itemsets = apriori( df, use_colnames = True, min_support = 0.01 )

# List the itemsets from the highest support to the lowest.
frequent_itemsets.sort_values( by = 'support', ascending = False )

Unnamed: 0,support,itemsets
86,0.255516,(whole milk)
55,0.193493,(other vegetables)
66,0.183935,(rolls/buns)
75,0.174377,(soda)
87,0.139502,(yogurt)
...,...,...
178,0.010066,"(frankfurter, sausage)"
306,0.010066,"(yogurt, curd, whole milk)"
160,0.010066,"(rolls/buns, curd)"
212,0.010066,"(tropical fruit, napkins)"


In [99]:
# Derive the association rules from the frequent itemsets, keeping only the rules
# whose confidence is at least 0.5 (the 'best recommendations').
rules = association_rules( frequent_itemsets, metric = 'confidence', min_threshold = 0.5 )

# Show the rules with the highest lift first. The displayed columns are selected
# explicitly instead of dropping the unwanted ones, so the table keeps the same five
# columns even if the installed mlxtend version returns additional metric columns.
rules.sort_values( by = 'lift', ascending = False )[ ['antecedents', 'consequents', 'support', 'confidence', 'lift'] ]

Unnamed: 0,antecedents,consequents,support,confidence,lift
1,"(citrus fruit, root vegetables)",(other vegetables),0.010371,0.586207,3.029608
6,"(tropical fruit, root vegetables)",(other vegetables),0.012303,0.584541,3.020999
5,"(rolls/buns, root vegetables)",(other vegetables),0.012201,0.502092,2.59489
7,"(yogurt, root vegetables)",(other vegetables),0.012913,0.5,2.584078
2,"(yogurt, curd)",(whole milk),0.010066,0.582353,2.279125
0,"(butter, other vegetables)",(whole milk),0.01149,0.573604,2.244885
11,"(tropical fruit, root vegetables)",(whole milk),0.011998,0.570048,2.230969
12,"(yogurt, root vegetables)",(whole milk),0.01454,0.562992,2.203354
3,"(domestic eggs, other vegetables)",(whole milk),0.012303,0.552511,2.162336
14,"(yogurt, whipped/sour cream)",(whole milk),0.01088,0.52451,2.052747
