# Task 1 Pattern Mining

### Data Processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Download raw data
data = pd.read_csv('data/Groceries data train.csv')

In [3]:
data

Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,3021,30/01/2015,frankfurter,2015,1,30,4
1,1292,24/10/2015,pork,2015,10,24,5
2,4206,4/04/2014,root vegetables,2014,4,4,4
3,4369,25/08/2015,onions,2015,8,25,1
4,1522,1/07/2014,waffles,2014,7,1,1
...,...,...,...,...,...,...,...
26995,4206,14/01/2014,rolls/buns,2014,1,14,1
26996,1660,15/06/2014,domestic eggs,2014,6,15,6
26997,1595,10/05/2015,liver loaf,2015,5,10,6
26998,3231,31/03/2015,sausage,2015,3,31,1


In [4]:
data.columns

Index(['Member_number', 'Date', 'itemDescription', 'year', 'month', 'day',
       'day_of_week'],
      dtype='object')

In [5]:
# Group the data by 'Member_number' and 'Date', then aggregate the 'itemDescription' into lists
transactions_data = data.groupby(['Member_number'])['itemDescription'].apply(list).reset_index()

# Extract the list of transactions
transactions = transactions_data['itemDescription'].tolist()

sorted_transactions = [sorted(list(set(transaction))) for transaction in transactions]

transactions = sorted_transactions

transactions = [transaction for transaction in transactions if len(transaction) > 1]

# Display the first transaction as an example
transactions[0]

['canned beer',
 'misc. beverages',
 'pastry',
 'pickled vegetables',
 'sausage',
 'soda',
 'yogurt']

In [6]:
print(type(transactions))

<class 'list'>


### Frequent Patterns Mining by Apriori Algorithm

In [7]:
pip install apyori

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
from apyori import apriori
from mlxtend.frequent_patterns import association_rules
rules = apriori(transactions, min_support=0.002, min_confidence=0.8,min_lift=1, min_length=1)
Results = list(rules)
Results

[RelationRecord(items=frozenset({'whole milk', 'bottled water', 'canned vegetables'}), support=0.002425222312045271, ordered_statistics=[OrderedStatistic(items_base=frozenset({'bottled water', 'canned vegetables'}), items_add=frozenset({'whole milk'}), confidence=0.8181818181818182, lift=2.2760665121984465)]),
 RelationRecord(items=frozenset({'spices', 'whole milk', 'bottled water'}), support=0.002425222312045271, ordered_statistics=[OrderedStatistic(items_base=frozenset({'spices', 'bottled water'}), items_add=frozenset({'whole milk'}), confidence=0.8181818181818182, lift=2.2760665121984465)]),
 RelationRecord(items=frozenset({'whole milk', 'butter', 'dishes'}), support=0.002155753166262463, ordered_statistics=[OrderedStatistic(items_base=frozenset({'butter', 'dishes'}), items_add=frozenset({'whole milk'}), confidence=1.0, lift=2.7818590704647677)]),
 RelationRecord(items=frozenset({'whole milk', 'canned fish', 'newspapers'}), support=0.002425222312045271, ordered_statistics=[OrderedSt

In [9]:
df_results = pd.DataFrame(Results)
df_results

Unnamed: 0,items,support,ordered_statistics
0,"(whole milk, bottled water, canned vegetables)",0.002425,"[((bottled water, canned vegetables), (whole m..."
1,"(spices, whole milk, bottled water)",0.002425,"[((spices, bottled water), (whole milk), 0.818..."
2,"(whole milk, butter, dishes)",0.002156,"[((butter, dishes), (whole milk), 1.0, 2.78185..."
3,"(whole milk, canned fish, newspapers)",0.002425,"[((canned fish, newspapers), (whole milk), 0.8..."
4,"(white bread, whole milk, napkins)",0.004581,"[((white bread, napkins), (whole milk), 0.8095..."
...,...,...,...
81,"(bottled water, pip fruit, other vegetables, y...",0.002695,"[((other vegetables, yogurt, bottled water, pi..."
82,"(bottled water, rolls/buns, whole milk, soda, ...",0.002425,"[((whipped/sour cream, soda, bottled water, ro..."
83,"(rolls/buns, shopping bags, whole milk, soda, ...",0.002695,"[((shopping bags, soda, canned beer, rolls/bun..."
84,"(rolls/buns, frozen vegetables, other vegetabl...",0.002425,"[((sausage, yogurt, other vegetables, frozen v..."


In [10]:
# keep support in a separate data
support = df_results.support

In [11]:
#all four empty list which will contain lhs, rhs, confidance and lift respectively.
first_values = []
second_values = []
third_values = []
fourth_value = []

# loop number of rows time and append 1 by 1 value in a separate list
# first and second element was frozenset which need to be converted in list
for i in range(df_results.shape[0]):
    single_list = df_results['ordered_statistics'][i][0]
    first_values.append(list(single_list[0]))
    second_values.append(list(single_list[1]))
    third_values.append(single_list[2])
    fourth_value.append(single_list[3])

In [12]:
# convert all four list into dataframe for further operation
lhs = pd.DataFrame(first_values)
rhs = pd.DataFrame(second_values)

confidance=pd.DataFrame(third_values,columns=['Confidance'])

lift=pd.DataFrame(fourth_value,columns=['lift'])

In [13]:
# concat all list together in a single dataframe
df_final = pd.concat([lhs,rhs,support,confidance,lift], axis=1)
df_final
df_final.fillna(value=' ', inplace=True)
df_final.head()

Unnamed: 0,0,1,2,3,0.1,support,Confidance,lift
0,bottled water,canned vegetables,,,whole milk,0.002425,0.818182,2.276067
1,spices,bottled water,,,whole milk,0.002425,0.818182,2.276067
2,butter,dishes,,,whole milk,0.002156,1.0,2.781859
3,canned fish,newspapers,,,whole milk,0.002425,0.818182,2.276067
4,white bread,napkins,,,whole milk,0.004581,0.809524,2.251981


In [14]:
#set column name
df_final.columns = ['lhs',1,2,3,'rhs','support','confidance','lift']
df_final.head()

Unnamed: 0,lhs,1,2,3,rhs,support,confidance,lift
0,bottled water,canned vegetables,,,whole milk,0.002425,0.818182,2.276067
1,spices,bottled water,,,whole milk,0.002425,0.818182,2.276067
2,butter,dishes,,,whole milk,0.002156,1.0,2.781859
3,canned fish,newspapers,,,whole milk,0.002425,0.818182,2.276067
4,white bread,napkins,,,whole milk,0.004581,0.809524,2.251981


In [15]:
# add all three column to lhs itemset only
df_final['lhs'] = df_final['lhs'] + str(", ")  + df_final[1] + str(", ") + df_final[2] + df_final[3]

df_final['rhs'] = df_final['rhs'] + str(", ")

In [16]:
df_final.drop(columns=[1,2,3],inplace=True)

In [17]:
df_final.sort_values('lift', ascending=False).head(10)

Unnamed: 0,lhs,rhs,support,confidance,lift
27,"whipped/sour cream, waffles, bottled water","tropical fruit,",0.002156,0.888889,5.29481
21,"specialty chocolate, bottled water, canned beer","root vegetables,",0.002156,0.888889,5.051557
32,"whole milk, candy, pork","yogurt,",0.002156,0.888889,4.175527
77,"whole milk, berries, other vegetablestropical ...","yogurt,",0.002425,0.818182,3.843383
6,"soda, frankfurter, UHT-milk","rolls/buns,",0.002156,1.0,3.666996
59,"sausage, frankfurter, pastry","soda,",0.002425,0.818182,3.362428
35,"shopping bags, canned beer, domestic eggs","soda,",0.002425,0.818182,3.362428
84,"sausage, yogurt, other vegetablesfrozen vegeta...","rolls/buns,",0.002425,0.9,3.300296
44,"whole milk, frozen meals, citrus fruit","rolls/buns,",0.002695,0.833333,3.05583
60,"whole milk, frozen meals, frozen vegetables","other vegetables,",0.002156,0.888889,3.037446


Note: The code above applies the Apriori algorithm for pattern mining on all transactions and stores the results in a list named Results. These results can be visualized in a tabular format for better analysis and interpretation of the association rules.

### Prediction Functions Based on Discovered Patterns (Ready for Import to Task 2)

In [19]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [20]:
df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3706,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3707,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3708,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3709,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [21]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd

frequent_itemsets = apriori(df, min_support=0.002, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Instant food products),(root vegetables),0.011318,0.175963,0.004581,0.404762,2.300263,0.002589,1.384382,0.571738
1,(Instant food products),(soda),0.011318,0.243331,0.004850,0.428571,1.761272,0.002097,1.324171,0.437176
2,(Instant food products),(whole milk),0.011318,0.359472,0.004581,0.404762,1.125991,0.000513,1.076087,0.113174
3,(UHT-milk),(whole milk),0.054702,0.359472,0.023713,0.433498,1.205929,0.004049,1.130671,0.180646
4,(artif. sweetener),(other vegetables),0.005659,0.292643,0.002425,0.428571,1.464483,0.000769,1.237874,0.318970
...,...,...,...,...,...,...,...,...,...,...
5438,"(sausage, whole milk, soda, root vegetables)",(yogurt),0.004850,0.212881,0.002156,0.444444,2.087764,0.001123,1.416815,0.523558
5439,"(sausage, yogurt, whipped/sour cream, whole milk)",(soda),0.005389,0.243331,0.002695,0.500000,2.054817,0.001383,1.513339,0.516120
5440,"(sausage, yogurt, soda, whipped/sour cream)",(whole milk),0.003503,0.359472,0.002695,0.769231,2.139892,0.001435,2.775622,0.534559
5441,"(whole milk, yogurt, soda, whipped/sour cream)",(sausage),0.005389,0.153867,0.002695,0.500000,3.249562,0.001865,1.692266,0.696017


In [22]:
# frequent_itemsets = fpgrowth(df, min_support=0.002, use_colnames=True)
# frequent_itemsets

# from mlxtend.frequent_patterns import association_rules
# rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
# rules

In [23]:
def predict_items(purchased_items):
    # create a list to store the predicted items
    predicted_items = []
    
    # iterate over the top rules
    for index, row in rules.iterrows():
        # get the items in the antecedent and consequent of the rule
        antecedent = row["antecedents"]
        consequent = row["consequents"]
        
        # check if all the items in the antecedent are in the purchased items
        if antecedent.issubset(purchased_items):
            # add the items in the consequent to the predicted items
            for item in consequent:
                if item not in purchased_items and item not in predicted_items:
                    predicted_items.append(item)
    
    # return the list of predicted items
    return predicted_items

In [24]:
# make a prediction for a sample set of purchased items
purchased_items = {"yogurt", "soda"}
predicted_items = predict_items(purchased_items)

# print the predicted items to the console
print(predicted_items)

['whole milk']


In [25]:
# make a prediction for a sample set of purchased items
purchased_items = {'canned beer',
 'misc. beverages',
 'pastry',
 'pickled vegetables',
 'sausage',
 'soda',
 'yogurt'}
predicted_items = predict_items(purchased_items)

# print the predicted items to the console
print(predicted_items)

['whole milk', 'rolls/buns', 'other vegetables', 'pip fruit']


Note: The code above prepares prediction functions based on discovered patterns, which can be used for Task 2. It includes implementations of both the Apriori and FP-Growth algorithms for pattern mining (Apriori is the default algorithm, and the part of the code for FP-Growth is currently commented out with a hash symbol). These functions can be imported and utilized as needed.