## **Fouille de données (2023-2024)**

### **Travaux pratiques n°3**

---

**Importation des bibliothèques**

In [None]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

**Assemblage Google Drive dans Colaboratory**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<font color=green>**Définition de la fonction $gener\_regles(df\_rules, cond=$"antecedents", $res=$"consequents"$)$**</font>

In [None]:
def gener_regles(df_rules, cond="antecedents", res="consequents"):
  ant=map(lambda x : list(map(lambda y: y, x)), df_rules[cond])
  conseq=map(lambda x : list(map(lambda y: y, x)), df_rules[res])
  return "\n".join(map(lambda x, y : ", ".join(x)+"-->"+", ".join(y) ,ant, conseq))

**1/** Charger le jeu de données **market.csv** dans une variable nommée $df\_market$. Afficher les $5$ premières lignes de $df\_market$

In [None]:
df_market=pd.read_csv('/content/drive/MyDrive/market.csv', sep=",")
df_market.head(5)

Unnamed: 0,ticketNumber,itemDescription
0,1808,tropical fruit
1,2552,whole milk
2,1187,other vegetables
3,3037,whole milk
4,4941,rolls/buns


**2/** Transformer la colonne **itemDescription** du DataFrame $df\_market$ en $10$ <u>**colonnes binaires**</u> distinctes portant les mêmes noms que les articles correspondants


In [None]:
df_market=pd.get_dummies(df_market, columns=['itemDescription'],prefix='',prefix_sep='')
df_market

Unnamed: 0,ticketNumber,bottled water,citrus fruit,other vegetables,rolls/buns,root vegetables,sausage,soda,tropical fruit,whole milk,yogurt
0,1808,0,0,0,0,0,0,0,1,0,0
1,2552,0,0,0,0,0,0,0,0,1,0
2,1187,0,0,1,0,0,0,0,0,0,0
3,3037,0,0,0,0,0,0,0,0,1,0
4,4941,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
13731,3820,0,0,0,0,0,0,1,0,0,0
13732,1176,1,0,0,0,0,0,0,0,0,0
13733,3082,0,0,0,0,0,0,0,0,1,0
13734,4586,1,0,0,0,0,0,0,0,0,0


**3/** Créer un nouveau DataFrame $trans$ à partir de $df\_market$, où les lignes sont regroupées par numéro de ticket (**ticketNumber**), et <u>les valeurs dans les colonnes binaires sont additionnées pour chaque groupe</u>

In [None]:

trans=df_market.groupby(['ticketNumber']).sum().reset_index()
#print(df_market.groupby(['ticketNumber']).sum())
trans

Unnamed: 0,ticketNumber,bottled water,citrus fruit,other vegetables,rolls/buns,root vegetables,sausage,soda,tropical fruit,whole milk,yogurt
0,1000,0,0,0,0,0,2,1,0,2,1
1,1001,0,0,0,1,0,1,2,0,2,0
2,1002,0,0,1,0,0,0,0,1,1,0
3,1003,0,0,0,3,1,1,0,0,0,0
4,1004,0,0,2,2,1,0,0,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...
3651,4996,1,0,0,1,0,0,1,1,0,0
3652,4997,0,0,0,0,0,0,0,1,1,0
3653,4998,0,0,0,1,0,0,0,0,0,0
3654,4999,1,0,2,0,0,0,0,1,0,1


**4/** Supprimer la colonne **ticketNumber** à partir de $trans$ et modifier les autres colonnes de manière à attribuer la valeur $1$ si un article apparaît plus d’une fois dans le ticket, et $0$ pour les autres occurrences

In [None]:
trans.applymap(lambda x: 1 if x >= 1 else 0 )

Unnamed: 0,ticketNumber,bottled water,citrus fruit,other vegetables,rolls/buns,root vegetables,sausage,soda,tropical fruit,whole milk,yogurt
0,1,0,0,0,0,0,1,1,0,1,1
1,1,0,0,0,1,0,1,1,0,1,0
2,1,0,0,1,0,0,0,0,1,1,0
3,1,0,0,0,1,1,1,0,0,0,0
4,1,0,0,1,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
3651,1,1,0,0,1,0,0,1,1,0,0
3652,1,0,0,0,0,0,0,0,1,1,0
3653,1,0,0,0,1,0,0,0,0,0,0
3654,1,1,0,1,0,0,0,0,1,0,1


**5/** Afficher le type de chaque attribut dans $trans$ et les modifier en un type **booléen** (**bool**)

In [None]:

trans=trans.astype("bool")

**6/** Déterminer les $k$-items fréquents ($k≥1$) en appliquant l’algorithme **Apriori**, avec une <u>**valeur minimale de support**</u> égale à $0.05$. Enregistrer les résultats dans un DataFrame nommé $f\_items$

In [None]:
f_items= apriori(trans, min_support=1,use_colnames=True)
f_items

Unnamed: 0,support,itemsets
0,1.0,(ticketNumber)


**7/** Afficher l’ensemble des $k$-items fréquents ($k≥2$) contenant l’item **'whole milk'** à partir de $f\_items$. <font color=blue>***Utiliser l’une des fonctions de comparaison vectorisées des DataFrames (eq(==), ge(≥), ou gt(>))***</font>

In [None]:
f_items[f_items['itemsets'].gt({'whole milk'})]

Unnamed: 0,support,itemsets
19,0.488512,"(ticketNumber, whole milk)"
27,0.119803,"(bottled water, whole milk)"
32,0.098468,"(citrus fruit, whole milk)"
39,0.204048,"(other vegetables, whole milk)"
45,0.190372,"(rolls/buns, whole milk)"
50,0.120624,"(root vegetables, whole milk)"
54,0.114059,"(sausage, whole milk)"
57,0.161105,"(soda, whole milk)"
59,0.124179,"(tropical fruit, whole milk)"
61,0.160558,"(yogurt, whole milk)"


**8/** Déterminer toutes les <u>**règles d’association pertinentes**</u> à partir de $f\_items$ sachant que la confiance minimale est fixée à $0.2$. Enregistrer le résultat dans un DataFrame nommé $rules\_p$ et afficher-le

In [None]:
rules_p= association_rules(f_items, metric="confidence",min_threshold=0.2)
rules_p

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ticketNumber),(bottled water),1.000000,0.227845,0.227845,0.227845,1.000000,0.000000,1.000000,0.000000
1,(bottled water),(ticketNumber),0.227845,1.000000,0.227845,1.000000,1.000000,0.000000,inf,0.000000
2,(citrus fruit),(ticketNumber),0.197757,1.000000,0.197757,1.000000,1.000000,0.000000,inf,0.000000
3,(ticketNumber),(other vegetables),1.000000,0.401532,0.401532,0.401532,1.000000,0.000000,1.000000,0.000000
4,(other vegetables),(ticketNumber),0.401532,1.000000,0.401532,1.000000,1.000000,0.000000,inf,0.000000
...,...,...,...,...,...,...,...,...,...,...
465,"(soda, yogurt, whole milk)",(ticketNumber),0.057987,1.000000,0.057987,1.000000,1.000000,0.000000,inf,0.000000
466,"(ticketNumber, yogurt, whole milk)",(soda),0.160558,0.334245,0.057987,0.361158,1.080520,0.004321,1.042128,0.088773
467,"(soda, yogurt)","(ticketNumber, whole milk)",0.103939,0.488512,0.057987,0.557895,1.142029,0.007212,1.156937,0.138791
468,"(soda, whole milk)","(ticketNumber, yogurt)",0.161105,0.301696,0.057987,0.359932,1.193030,0.009382,1.090984,0.192870


**9/** Identifier et afficher les $5$ règles d’association <u>**les plus intéressantes**</u> en termes de <font color=red>**lift**</font> à partir de $rules\_p$. Une règle doit avoir la forme $item → item$, $items → item$ ou $items → item$. <font color=blue>***Utiliser la fonction donnée $generer\_regles()$***</font>

In [None]:
def gener_regle11(df_rules):
  listofrows=df_rules[['antecedents','consequents']].apply(lambda row: row.tolist(),axis=1)
  rowsaslists=listofrows.tolist()
  for row in rowsaslists:
    antec=','.join(row[0])
    cons=','.join(row[1])
    rule=f"{antec}-->{cons}"
    print(rule)
rules_p.sort_values(by="lift",ascending=False,inplace=True)
print(gener_regle11(rules_p.head()))



bottled water-->ticketNumber,other vegetables,whole milk
other vegetables,whole milk-->bottled water
ticketNumber,other vegetables,whole milk-->bottled water
bottled water-->other vegetables,whole milk
ticketNumber,bottled water-->other vegetables,whole milk
None


In [None]:
rules_p.sort_values(by="lift",ascending=False,inplace=True)
print(gener_regles(rules_p.head()))

bottled water, ticketNumber-->other vegetables, whole milk
other vegetables, whole milk-->bottled water
bottled water-->other vegetables, ticketNumber, whole milk
other vegetables, ticketNumber, whole milk-->bottled water
other vegetables, whole milk-->bottled water, ticketNumber


**10/** Identifier et afficher toutes les règles d’association qui <u>**comportent dans l’antécédent l’article le plus fréquent**</u>, à partir de $rules\_p$, , sous la forme $item→item$, $items→item$ ou $items→item$

In [None]:
_1_items1 =f_items[f_items['itemsets'].apply(lambda x: len(x)==1)]
_1_items1

Unnamed: 0,support,itemsets
0,1.0,(ticketNumber)
1,0.227845,(bottled water)
2,0.197757,(citrus fruit)
3,0.401532,(other vegetables)
4,0.372812,(rolls/buns)
5,0.245897,(root vegetables)
6,0.219639,(sausage)
7,0.334245,(soda)
8,0.249179,(tropical fruit)
9,0.488512,(whole milk)


In [None]:
_1_items1 =f_items[f_items['itemsets'].apply(lambda x: len(x)==1)]
item_f=_1_items1[_1_items1['support']==_1_items1['support'].max()]['itemsets'].reset_index(drop=True)[0]
item_f

frozenset({'ticketNumber'})

In [None]:
print(gener_regles(rules_p[rules_p['antecedents'].ge({list(item_f)[0]})]))

bottled water, ticketNumber-->other vegetables, whole milk
other vegetables, ticketNumber, whole milk-->bottled water
bottled water, ticketNumber, whole milk-->other vegetables
ticketNumber, yogurt-->other vegetables, whole milk
other vegetables, ticketNumber, whole milk-->yogurt
ticketNumber, whole milk, rolls/buns-->sausage
ticketNumber, sausage-->whole milk, rolls/buns
other vegetables, bottled water, ticketNumber-->whole milk
ticketNumber, whole milk, rolls/buns-->yogurt
ticketNumber, yogurt-->whole milk, rolls/buns
ticketNumber, whole milk, sausage-->rolls/buns
other vegetables, ticketNumber, yogurt-->whole milk
ticketNumber, sausage-->yogurt
ticketNumber, yogurt-->sausage
ticketNumber, yogurt, rolls/buns-->whole milk
rolls/buns, ticketNumber, sausage-->whole milk
other vegetables, ticketNumber, whole milk-->sausage
ticketNumber, sausage-->other vegetables, whole milk
soda, ticketNumber, whole milk-->yogurt
ticketNumber, whole milk, yogurt-->other vegetables
other vegetables, tick

**11/** Identifier toutes les règles d’association présentant une <u>**forte association positive entre l’antécédent et le conséquent**</u>, où <u>le conséquent comprend au moins deux articles</u>, en termes de <font color=red>**levrage**</font>, qui doit être <u>strictement positif</u>, et une <font color=red>**conviction**</font> <u>supérieure ou égale à $1.05$</u>, à partir de $rules\_p$. Enregistrer le résultat dans un DataFrame nommé $rules\_f$

In [None]:
_2_items =rules_p[rules_p['consequents'].apply(lambda x: len(x)>=2)]
rules_f=_2_items[(_2_items['leverage']>0) & (_2_items['conviction']>1.05)].reset_index(drop=True)
rules_f

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(bottled water),"(other vegetables, ticketNumber, whole milk)",0.227845,0.204048,0.059902,0.262905,1.288447,0.01341,1.07985,0.289931
1,(bottled water),"(other vegetables, whole milk)",0.227845,0.204048,0.059902,0.262905,1.288447,0.01341,1.07985,0.289931
2,"(bottled water, ticketNumber)","(other vegetables, whole milk)",0.227845,0.204048,0.059902,0.262905,1.288447,0.01341,1.07985,0.289931
3,"(other vegetables, whole milk)","(bottled water, ticketNumber)",0.204048,0.227845,0.059902,0.293566,1.288447,0.01341,1.093032,0.281263
4,"(bottled water, whole milk)","(other vegetables, ticketNumber)",0.119803,0.401532,0.059902,0.5,1.245232,0.011797,1.196937,0.223741
5,"(other vegetables, whole milk)","(ticketNumber, yogurt)",0.204048,0.301696,0.076586,0.375335,1.244084,0.015026,1.117886,0.246492
6,(yogurt),"(other vegetables, ticketNumber, whole milk)",0.301696,0.204048,0.076586,0.253853,1.244084,0.015026,1.06675,0.280961
7,"(ticketNumber, yogurt)","(other vegetables, whole milk)",0.301696,0.204048,0.076586,0.253853,1.244084,0.015026,1.06675,0.280961
8,(yogurt),"(other vegetables, whole milk)",0.301696,0.204048,0.076586,0.253853,1.244084,0.015026,1.06675,0.280961
9,"(ticketNumber, sausage)","(whole milk, rolls/buns)",0.219639,0.190372,0.051969,0.236613,1.242897,0.010156,1.060573,0.250433


**12/**	Étudier <u>**le choix du support minimal**</u> de manière à obtenir <u>**une ou plusieurs règles d’association intéressante(s)**</u> ayant un <font color=red>**lift**</font> supérieur à $1$

In [None]:
c=np.arange(1,0,-0.1)
for i in c:
  try:
    rules=association_rules(apriori(trans,min_support=i,use_colnames=True),metric="lift",min_threshold=1)
    if len(rules)>=1:
      print("s_min={:.2f}".format(i))
      break

  except:
    pass
else:
  print(None)




s_min=0.40
