# Implémentation de l’algorithme Apriori en Python

## Ressources

* https://www.geeksforgeeks.org/apriori-algorithm/

## Importation des bibliothèques requises

In [None]:
# !pip install -U --user mlxtend

In [6]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules 

## Chargement et exploration des données

In [15]:
df = pd.read_excel('Online Retail2.xlsx') 
df.head(2) 

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [17]:
len(df)

541909

In [19]:
df.columns[:5]

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate'], dtype='object')

In [21]:
pays = list(df.Country.unique())
pays[:6]

['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany', 'Norway']

## Nettoyage des données

In [22]:
df.Description = df.Description.str.strip()

df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)

df.InvoiceNo = df.InvoiceNo.astype('str')

df = df[~df['InvoiceNo'].str.contains('C')] 

In [23]:
len(df)

532621

## fractionnement des données en fonction de la région de transaction

In [24]:
panier_France = (df[df['Country'] =="France"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('InvoiceNo')) 
  
panier_UK = (df[df['Country'] =="United Kingdom"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('InvoiceNo')) 
  
panier_Portugal = (df[df['Country'] =="Portugal"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('InvoiceNo')) 

panier_Suisse = (df[df['Country'] =="Sweden"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('InvoiceNo')) 

In [25]:
panier_France.iloc[1:].columns

Index(['10 COLOUR SPACEBOY PEN', '12 COLOURED PARTY BALLOONS',
       '12 EGG HOUSE PAINTED WOOD', '12 MESSAGE CARDS WITH ENVELOPES',
       '12 PENCIL SMALL TUBE WOODLAND', '12 PENCILS SMALL TUBE RED RETROSPOT',
       '12 PENCILS SMALL TUBE SKULL', '12 PENCILS TALL TUBE POSY',
       '12 PENCILS TALL TUBE RED RETROSPOT', '12 PENCILS TALL TUBE WOODLAND',
       ...
       'WRAP VINTAGE PETALS  DESIGN', 'YELLOW COAT RACK PARIS FASHION',
       'YELLOW GIANT GARDEN THERMOMETER', 'YELLOW SHARK HELICOPTER',
       'ZINC  STAR T-LIGHT HOLDER', 'ZINC FOLKART SLEIGH BELLS',
       'ZINC HERB GARDEN CONTAINER', 'ZINC METAL HEART DECORATION',
       'ZINC T-LIGHT HOLDER STAR LARGE', 'ZINC T-LIGHT HOLDER STARS SMALL'],
      dtype='object', name='Description', length=1563)

## encodage à chaud des données

In [26]:
def hot_encode(x):
    resultat = 0
    if x <= 0:
        resultat = 0
    else:
        resultat = 1
    return resultat

In [27]:
panier_encodeur = panier_France.applymap(hot_encode)
panier_France = panier_encodeur

panier_encodeur = panier_UK.applymap(hot_encode)
panier_UK = panier_encodeur


panier_encodeur = panier_Portugal.applymap(hot_encode)
panier_Portugal = panier_encodeur

panier_encodeur = panier_Suisse.applymap(hot_encode)
panier_Suisse = panier_encodeur

### a) France

In [31]:
frq_items1 = apriori(panier_France, min_support = 0.05, use_colnames = True) 
  
rules1 = association_rules(frq_items1, metric ="lift", min_threshold = 1) 
rules1 = rules1.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules1.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
45,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf
260,"(PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf
272,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf
301,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
302,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796


In [34]:
col = ['antecedents', 'consequents', 'support', 'confidence']

In [35]:
rules1[col].head()

Unnamed: 0,antecedents,consequents,support,confidence
45,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,1.0
260,"(PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...",(POSTAGE),0.05102,1.0
272,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,1.0
301,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.09949,0.975
302,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.09949,0.975


## b) La Grande Brétagne

In [37]:
frq_items2 = apriori(panier_UK, min_support = 0.05, use_colnames = True) 
  
rules2 = association_rules(frq_items2, metric ="lift", min_threshold = 1) 
rules2 = rules2.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules2.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


## c) Le Portugal

In [39]:
frq_items3 = apriori(panier_Portugal, min_support = 0.05, use_colnames = True) 
  
rules3 = association_rules(frq_items3, metric ="lift", min_threshold = 1) 
rules3 = rules3.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules3.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1170,(SET 12 COLOUR PENCILS SPACEBOY),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1171,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET 12 COLOUR PENCILS SPACEBOY),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1172,(SET OF 4 KNICK KNACK TINS LONDON),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1173,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET OF 4 KNICK KNACK TINS LONDON),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1174,(SET OF 4 KNICK KNACK TINS POPPIES),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf


## d) La Suisse

In [40]:
frq_items4 = apriori(panier_Suisse, min_support = 0.05, use_colnames = True) 
  
rules4 = association_rules(frq_items4, metric ="lift", min_threshold = 1) 
rules4 = rules4.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules4.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
1,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
4,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
5,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
180,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY CIRCUS PARADE),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
