# **Setup**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# **Data Import**

In [4]:
df = pd.read_csv('market_basket_dataset.csv')

print(df.shape)
df.head()

(500, 5)


Unnamed: 0,BillNo,Itemname,Quantity,Price,CustomerID
0,1000,Apples,5,8.3,52299
1,1000,Butter,4,6.06,11752
2,1000,Eggs,4,2.66,16415
3,1000,Potatoes,4,8.1,22889
4,1004,Oranges,2,7.26,52255


# **Data Validation**

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   BillNo      500 non-null    int64  
 1   Itemname    500 non-null    object 
 2   Quantity    500 non-null    int64  
 3   Price       500 non-null    float64
 4   CustomerID  500 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 19.7+ KB


In [10]:
df.describe()

Unnamed: 0,BillNo,Quantity,Price,CustomerID
count,500.0,500.0,500.0,500.0
mean,1247.442,2.978,5.61766,54229.8
std,144.483097,1.426038,2.572919,25672.122585
min,1000.0,1.0,1.04,10504.0
25%,1120.0,2.0,3.57,32823.5
50%,1246.5,3.0,5.43,53506.5
75%,1370.0,4.0,7.92,76644.25
max,1497.0,5.0,9.94,99162.0


In [11]:
df.isna().sum()

BillNo        0
Itemname      0
Quantity      0
Price         0
CustomerID    0
dtype: int64

In [12]:
# Unique Items
df['Itemname'].unique()

array(['Apples', 'Butter', 'Eggs', 'Potatoes', 'Oranges', 'Milk',
       'Onions', 'Cereal', 'Tomatoes', 'Bananas', 'Pasta', 'Bread',
       'Coffee', 'Sugar', 'Chicken', 'Cheese', 'Tea', 'Yogurt', 'Juice'],
      dtype=object)

In [14]:
# Unique Customer IDs
len(df['CustomerID'].unique())

499

# **Data Exploration**

## **Item Wise Sales Quantity**

In [27]:
px.histogram(df,x='Itemname',y='Quantity')

## **Price Consistency Check**

In [28]:
df.head()

Unnamed: 0,BillNo,Itemname,Quantity,Price,CustomerID
0,1000,Apples,5,8.3,52299
1,1000,Butter,4,6.06,11752
2,1000,Eggs,4,2.66,16415
3,1000,Potatoes,4,8.1,22889
4,1004,Oranges,2,7.26,52255


In [30]:
df[df['Itemname']=='Oranges'][:5]

Unnamed: 0,BillNo,Itemname,Quantity,Price,CustomerID
4,1004,Oranges,2,7.26,52255
20,1013,Oranges,1,6.47,32140
22,1021,Oranges,3,1.13,80292
114,1112,Oranges,1,9.41,44262
126,1120,Oranges,2,6.38,71663


## **Top 10 Most Popular Items**

In [41]:
df_most_popular = df.groupby('Itemname')['Quantity'].sum().sort_values(ascending=False).reset_index()[:10]
display(df_most_popular)

px.histogram(df_most_popular,x='Itemname',y='Quantity',title='Top 10 Most Popular Items')

Unnamed: 0,Itemname,Quantity
0,Bananas,119
1,Cheese,99
2,Coffee,88
3,Cereal,87
4,Milk,86
5,Oranges,81
6,Tomatoes,79
7,Sugar,79
8,Potatoes,78
9,Bread,76


## **CustomerID Wise Mean Order QTY & Total Order Value** 

In [46]:
df_customer_data = df.groupby('CustomerID').agg({'Quantity':'mean','Price':'sum'}).reset_index()
df_customer_data.columns = ['CustomerID','Avg_Basket_Qty','Total_Basket_Value']
df_customer_data.head(10)

Unnamed: 0,CustomerID,Avg_Basket_Qty,Total_Basket_Value
0,10504,1.0,2.04
1,10588,5.0,5.5
2,10826,1.0,5.67
3,11113,3.0,8.84
4,11267,1.0,8.87
5,11373,2.0,6.69
6,11430,3.0,4.85
7,11644,5.0,4.67
8,11752,4.0,6.06
9,11754,3.0,1.18


In [48]:
px.scatter(df_customer_data,x='CustomerID',y='Total_Basket_Value')

# **Data Analysis - Apriori**

In [49]:
df.head()

Unnamed: 0,BillNo,Itemname,Quantity,Price,CustomerID
0,1000,Apples,5,8.3,52299
1,1000,Butter,4,6.06,11752
2,1000,Eggs,4,2.66,16415
3,1000,Potatoes,4,8.1,22889
4,1004,Oranges,2,7.26,52255


In [56]:
df_basket = df.groupby('BillNo')['Itemname'].apply(list).reset_index()
df_basket.head()

Unnamed: 0,BillNo,Itemname
0,1000,"[Apples, Butter, Eggs, Potatoes]"
1,1004,[Oranges]
2,1005,"[Milk, Onions, Cereal]"
3,1008,"[Tomatoes, Potatoes, Cereal]"
4,1011,[Bananas]


In [59]:
basket_encoded = df_basket['Itemname'].str.join('|').str.get_dummies(sep='|')
basket_encoded

Unnamed: 0,Apples,Bananas,Bread,Butter,Cereal,Cheese,Chicken,Coffee,Eggs,Juice,Milk,Onions,Oranges,Pasta,Potatoes,Sugar,Tea,Tomatoes,Yogurt
0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,0,1
149,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
150,0,1,1,1,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0
151,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0


In [62]:
from mlxtend.frequent_patterns import apriori, association_rules

frequent_items = apriori(basket_encoded, min_support=0.001, use_colnames=True)
rules = association_rules(frequent_items, metric="lift", min_threshold=0.5)
rules


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bread),(Apples),0.150327,0.163399,0.045752,0.304348,1.862609,0.021188,1.202614,0.545055
1,(Apples),(Bread),0.163399,0.150327,0.045752,0.280000,1.862609,0.021188,1.180102,0.553571
2,(Apples),(Butter),0.163399,0.163399,0.026144,0.160000,0.979200,-0.000555,0.995954,-0.024762
3,(Butter),(Apples),0.163399,0.163399,0.026144,0.160000,0.979200,-0.000555,0.995954,-0.024762
4,(Cereal),(Apples),0.202614,0.163399,0.019608,0.096774,0.592258,-0.013499,0.926237,-0.463343
...,...,...,...,...,...,...,...,...,...,...
2384801,(Onions),"(Yogurt, Milk, Coffee, Juice, Potatoes, Pasta,...",0.150327,0.006536,0.006536,0.043478,6.652174,0.005553,1.038622,1.000000
2384802,(Eggs),"(Yogurt, Milk, Coffee, Juice, Potatoes, Pasta,...",0.176471,0.006536,0.006536,0.037037,5.666667,0.005383,1.031674,1.000000
2384803,(Tomatoes),"(Yogurt, Milk, Coffee, Juice, Potatoes, Pasta,...",0.176471,0.006536,0.006536,0.037037,5.666667,0.005383,1.031674,1.000000
2384804,(Tea),"(Yogurt, Milk, Coffee, Juice, Potatoes, Pasta,...",0.150327,0.006536,0.006536,0.043478,6.652174,0.005553,1.038622,1.000000


# **Conclusion**

In [66]:
rules[['antecedents','consequents','support','confidence','lift']].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(Bread),(Apples),0.045752,0.304348,1.862609
1,(Apples),(Bread),0.045752,0.28,1.862609
2,(Apples),(Butter),0.026144,0.16,0.9792
3,(Butter),(Apples),0.026144,0.16,0.9792
4,(Cereal),(Apples),0.019608,0.096774,0.592258
5,(Apples),(Cereal),0.019608,0.12,0.592258
6,(Apples),(Cheese),0.039216,0.24,1.311429
7,(Cheese),(Apples),0.039216,0.214286,1.311429
8,(Apples),(Chicken),0.03268,0.2,1.53
9,(Chicken),(Apples),0.03268,0.25,1.53
