In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mlxtend
from mlxtend.frequent_patterns import association_rules, apriori

## Simple Coding Apriori Algorithm (For Learning)

In [21]:
# Load the faceplate data and use its "Transaction" column as the row index.
fp_df = pd.read_csv("./DMBA/dataset/Faceplate.csv", index_col="Transaction")

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
fp_df.info()
fp_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 1 to 10
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Red     10 non-null     int64
 1   White   10 non-null     int64
 2   Blue    10 non-null     int64
 3   Orange  10 non-null     int64
 4   Green   10 non-null     int64
 5   Yellow  10 non-null     int64
dtypes: int64(6)
memory usage: 560.0 bytes
None


Unnamed: 0_level_0,Red,White,Blue,Orange,Green,Yellow
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,1,0
2,0,1,0,1,0,0
3,0,1,1,0,0,0
4,1,1,0,1,0,0
5,1,0,1,0,0,0


In [22]:
# Cast every column to bool in one step before handing the frame to apriori
fp_df = fp_df.astype('bool')

# Frequent itemsets with a minimum support of 0.2, labelled by column name
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)

# Association rules filtered on a minimum confidence of 0.5
rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)

# Six highest-lift rules, without the support/conviction helper columns
(
    rules
    .sort_values(by=['lift'], ascending=False)
    .drop(columns=['antecedent support', 'consequent support', 'conviction'])
    .head(6)
)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,zhangs_metric
13,"(Red, White)",(Green),0.2,0.5,2.5,0.12,1.0
15,(Green),"(Red, White)",0.2,1.0,2.5,0.12,0.75
4,(Green),(Red),0.2,1.0,1.666667,0.08,0.5
14,"(Green, White)",(Red),0.2,1.0,1.666667,0.08,0.5
7,(Orange),(White),0.2,1.0,1.428571,0.06,0.375
8,(Green),(White),0.2,1.0,1.428571,0.06,0.375


In [24]:
# Display the frequent itemsets (support and member items) returned by apriori
itemsets

Unnamed: 0,support,itemsets
0,0.6,(Red)
1,0.7,(White)
2,0.6,(Blue)
3,0.2,(Orange)
4,0.2,(Green)
5,0.4,"(Red, White)"
6,0.4,"(Red, Blue)"
7,0.2,"(Red, Green)"
8,0.4,"(White, Blue)"
9,0.2,"(Orange, White)"


In [23]:
# Display up to the first 50 association rules with every metric column
rules.head(50)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Red),(White),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,-0.111111
1,(White),(Red),0.7,0.6,0.4,0.571429,0.952381,-0.02,0.933333,-0.142857
2,(Red),(Blue),0.6,0.6,0.4,0.666667,1.111111,0.04,1.2,0.25
3,(Blue),(Red),0.6,0.6,0.4,0.666667,1.111111,0.04,1.2,0.25
4,(Green),(Red),0.2,0.6,0.2,1.0,1.666667,0.08,inf,0.5
5,(White),(Blue),0.7,0.6,0.4,0.571429,0.952381,-0.02,0.933333,-0.142857
6,(Blue),(White),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,-0.111111
7,(Orange),(White),0.2,0.7,0.2,1.0,1.428571,0.06,inf,0.375
8,(Green),(White),0.2,0.7,0.2,1.0,1.428571,0.06,inf,0.375
9,"(Red, White)",(Blue),0.4,0.6,0.2,0.5,0.833333,-0.04,0.8,-0.25


Interpreting results:
- Support ==> Indicates the rule's impact in terms of overall size.
> How many transactions are affected? If only a small number of transactions are affected, the rule may be of little use (unless the consequent is very valuable and/or the rule is very efficient in finding it).

- Lift ==> Indicates how efficient the rule is at finding consequents, compared to random selection.

NOTE: A very efficient rule that has very low support may not be as desirable as a less efficient rule with much greater support.

- Confidence ==> The confidence tells us at what rate consequents will be found, and is useful in determining the business or operational usefulness of a rule.

> A rule with low confidence may find consequents at too low a rate to be worth the cost of (say) promoting the consequent in all the transactions that involve the antecedent.

## Example: Rules for Similar Book Purchases

In [13]:
# Load dataset
all_books_df = pd.read_csv('./DMBA/dataset/CharlesBookClub.csv')

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
all_books_df.info()
all_books_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Seq#              4000 non-null   int64
 1   ID#               4000 non-null   int64
 2   Gender            4000 non-null   int64
 3   M                 4000 non-null   int64
 4   R                 4000 non-null   int64
 5   F                 4000 non-null   int64
 6   FirstPurch        4000 non-null   int64
 7   ChildBks          4000 non-null   int64
 8   YouthBks          4000 non-null   int64
 9   CookBks           4000 non-null   int64
 10  DoItYBks          4000 non-null   int64
 11  RefBks            4000 non-null   int64
 12  ArtBks            4000 non-null   int64
 13  GeogBks           4000 non-null   int64
 14  ItalCook          4000 non-null   int64
 15  ItalAtlas         4000 non-null   int64
 16  ItalArt           4000 non-null   int64
 17  Florence          4000 non-null  

Unnamed: 0,Seq#,ID#,Gender,M,R,F,FirstPurch,ChildBks,YouthBks,CookBks,...,ItalCook,ItalAtlas,ItalArt,Florence,Related Purchase,Mcode,Rcode,Fcode,Yes_Florence,No_Florence
0,1,25,1,297,14,2,22,0,1,1,...,0,0,0,0,0,5,4,2,0,1
1,2,29,0,128,8,2,10,0,0,0,...,0,0,0,0,0,4,3,2,0,1
2,3,46,1,138,22,7,56,2,1,2,...,1,0,0,0,2,4,4,3,0,1
3,4,47,1,228,2,1,2,0,0,0,...,0,0,0,0,0,5,1,1,0,1
4,5,51,1,257,10,1,10,0,0,0,...,0,0,0,0,0,5,3,1,0,1


In [19]:
# Create the binary incidence matrix
# Columns that are not book-category purchase counts are excluded.
ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',
          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']

# A cell is True when the count in that column is greater than zero.
# Comparing directly yields a boolean frame, so no separate "set to 1" step
# or per-column cast is needed.
count_books = all_books_df.drop(columns=ignore).gt(0)

# Create frequent itemsets and rules
# Minimum support corresponds to 200 rows; dividing by the actual row count
# keeps that threshold correct if the dataset size changes.
itemsets = apriori(count_books, min_support=200 / len(count_books), use_colnames=True)
rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)

# Display 25 rules with highest lift
display_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']
rules.sort_values(by=['lift'], ascending=False).head(25)[display_cols]

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
64,"(RefBks, YouthBks)","(ChildBks, CookBks)",0.05525,0.68,2.809917,0.035588
73,"(DoItYBks, RefBks)","(ChildBks, CookBks)",0.06125,0.662162,2.736207,0.038865
60,"(DoItYBks, YouthBks)","(ChildBks, CookBks)",0.067,0.64891,2.681448,0.042014
80,"(RefBks, GeogBks)","(ChildBks, CookBks)",0.05025,0.614679,2.539995,0.030467
69,"(YouthBks, GeogBks)","(ChildBks, CookBks)",0.06325,0.605263,2.501087,0.037961
77,"(DoItYBks, GeogBks)","(ChildBks, CookBks)",0.0605,0.59901,2.475248,0.036058
65,"(ChildBks, CookBks, GeogBks)",(YouthBks),0.06325,0.577626,2.424452,0.037162
72,"(RefBks, ChildBks, CookBks)",(DoItYBks),0.06125,0.591787,2.323013,0.034883
47,"(DoItYBks, GeogBks)",(YouthBks),0.0545,0.539604,2.264864,0.030437
61,"(RefBks, ChildBks, CookBks)",(YouthBks),0.05525,0.533816,2.240573,0.030591
