# Packages

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer

# Data Preprocessing

In [4]:
df = pd.read_excel(
    "/Users/yola.kamalita/Documents/Project/market-basket-analysis/dataset/online_retail_II.xlsx",
    engine="openpyxl"
)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


In [6]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [7]:
df["StockCode"].nunique()

4632

In [8]:
df["Description"].nunique()

4681

In [9]:
# Remove NAs

df_prep = df.dropna()

In [10]:
# Remove Quantity < 0

df_prep = df_prep[df_prep['Quantity'] > 0]

In [11]:
# Remove Trailing Spaces

df_prep['Description'] = df_prep['Description'].apply(lambda x: x.rstrip())

In [12]:
# Check rown with remaining Trailing Spaces

df_prep[df_prep['Description'].str.endswith(" ")]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country


In [13]:
# Create List of Products for each Invoice

df_trx = df_prep.groupby("Invoice")["Description"].apply(list).reset_index()

In [14]:
df_trx.head(10)

Unnamed: 0,Invoice,Description
0,489434,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
1,489435,"[CAT BOWL, DOG BOWL , CHASING BALL DESIGN, HEA..."
2,489436,"[DOOR MAT BLACK FLOCK, LOVE BUILDING BLOCK WOR..."
3,489437,"[CHRISTMAS CRAFT HEART DECORATIONS, CHRISTMAS ..."
4,489438,"[DINOSAURS WRITING SET, SET OF MEADOW FLOWER..."
5,489439,"[CHRISTMAS PUDDING TRINKET POT, BAKING SET 9 P..."
6,489440,"[CAT BOWL, DOG BOWL , CHASING BALL DESIGN]"
7,489441,"[BIRD DECORATION RED SPOT, BAKING SET 9 PIECE ..."
8,489442,"[UNION JACK GUNS & ROSES DOORMAT, SCOTTIE DOG..."
9,489443,"[RETRO RED SPOTTY WASHING UP GLOVES, SET/2 RED..."


In [15]:
df_trx = df_trx.set_index('Invoice')

df_trx.head()

Unnamed: 0_level_0,Description
Invoice,Unnamed: 1_level_1
489434,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
489435,"[CAT BOWL, DOG BOWL , CHASING BALL DESIGN, HEA..."
489436,"[DOOR MAT BLACK FLOCK, LOVE BUILDING BLOCK WOR..."
489437,"[CHRISTMAS CRAFT HEART DECORATIONS, CHRISTMAS ..."
489438,"[DINOSAURS WRITING SET, SET OF MEADOW FLOWER..."


In [16]:
# Encode List of Products for each Invoice

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform the column
df_trx_encode = mlb.fit_transform(df_trx["Description"])

# Convert to DataFrame with proper column names
df_trx_encode = pd.DataFrame(df_trx_encode, columns=mlb.classes_, index=df_trx.index)

In [17]:
df_trx_encode.head()

Unnamed: 0_level_0,DOORMAT UNION JACK GUNS AND ROSES,3 STRIPEY MICE FELTCRAFT,4 PURPLE FLOCK DINNER CANDLES,ANIMAL STICKERS,BLACK PIRATE TREASURE CHEST,BROWN PIRATE TREASURE CHEST,Bank Charges,CAMPHOR WOOD PORTOBELLO MUSHROOM,CHERRY BLOSSOM DECORATIVE FLASK,FAIRY CAKE CANDLES,...,ZINC HEART LATTICE CHARGER LARGE,ZINC HEART LATTICE CHARGER SMALL,ZINC HEART LATTICE DOUBLE PLANTER,ZINC HEART LATTICE PLANTER BOWL,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC HEART LATTICE TRAY OVAL,ZINC METAL HEART DECORATION,ZINC POLICE BOX LANTERN,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Number of transactions per product
df_trx_encode.sum().head(30).sort_values(ascending=False)

10 COLOUR SPACEBOY PEN                 247
 WHITE CHERRY LIGHTS                   215
 SET 2 TEA TOWELS I LOVE LONDON        169
 RED/WHITE DOT MINI CASES              138
 3 STRIPEY MICE FELTCRAFT              112
  DOORMAT UNION JACK GUNS AND ROSES     51
 FLAMINGO LIGHTS                        47
 PEACE WOODEN BLOCK LETTERS             47
 SILVER CHERRY LIGHTS                   35
 HOME SWEET HOME  BLACKBOARD            33
 CHERRY BLOSSOM  DECORATIVE FLASK       26
 IVORY PAPER CUP CAKE CASES             23
 VINTAGE DESIGN GIFT TAGS               20
 4 PURPLE FLOCK DINNER CANDLES          17
 PAINT YOUR OWN CANVAS SET              17
 SILVER T-LIGHT SETTING                 15
 BLACK PIRATE TREASURE CHEST            13
 RIDGED GLASS T-LIGHT HOLDER            12
 ANIMAL STICKERS                        12
 OVAL WALL MIRROR DIAMANTE              10
 CAMPHOR WOOD PORTOBELLO MUSHROOM        7
 WHITE BAMBOO RIBS LAMPSHADE             7
 BROWN  PIRATE TREASURE CHEST            7
 STAR  T-LI

# Modelling

In [19]:
from mlxtend.frequent_patterns import apriori

In [28]:
frequent_itemsets = apriori(df_trx_encode, min_support=0.025, use_colnames=True) 



In [29]:
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
104,0.157221,(WHITE HANGING HEART T-LIGHT HOLDER)
86,0.087796,(REGENCY CAKESTAND 3 TIER)
97,0.069477,(STRAWBERRY CERAMIC TRINKET BOX)
4,0.069373,(ASSORTED COLOUR BIRD ORNAMENT)
30,0.061098,(HOME BUILDING BLOCK WORD)
...,...,...
64,0.025501,(PAPER CHAIN KIT RETRO SPOT)
43,0.025449,(KNITTED UNION FLAG HOT WATER BOTTLE)
55,0.025137,(PACK 20 ENGLISH ROSE PAPER NAPKINS)
27,0.025137,(HEART IVORY TRELLIS LARGE)


In [30]:
frequent_itemsets[frequent_itemsets["itemsets"].apply(len) >= 2]

Unnamed: 0,support,itemsets
111,0.025553,"(HEART OF WICKER LARGE, HEART OF WICKER SMALL)"
112,0.027166,"(HOME BUILDING BLOCK WORD, LOVE BUILDING BLOCK..."
113,0.037575,"(WHITE HANGING HEART T-LIGHT HOLDER, RED HANGI..."
114,0.032371,"(STRAWBERRY CERAMIC TRINKET BOX, SWEETHEART CE..."
115,0.028832,"(WOODEN FRAME ANTIQUE WHITE, WOODEN PICTURE FR..."


In [33]:
from mlxtend.frequent_patterns import association_rules

In [43]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7) 

In [44]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(RED HANGING HEART T-LIGHT HOLDER),(WHITE HANGING HEART T-LIGHT HOLDER),0.05173,0.157221,0.037575,0.726358,4.619984,1.0,0.029442,3.079862,0.826294,0.219253,0.67531,0.482676
1,(SWEETHEART CERAMIC TRINKET BOX),(STRAWBERRY CERAMIC TRINKET BOX),0.04205,0.069477,0.032371,0.769802,11.079959,1.0,0.029449,4.042272,0.949682,0.408941,0.752614,0.61786


Evaluation:
- Grouping similar products. How?