# Packages

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer

# Data Preprocessing

In [2]:
df = pd.read_excel(
    "/Users/yola.kamalita/Documents/Project/market-basket-analysis/dataset/online_retail_II.xlsx",
    engine="openpyxl"
)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


In [4]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [5]:
df["StockCode"].nunique()

4632

In [6]:
df["Description"].nunique()

4681

In [8]:
# Remove NAs

df_prep = df.dropna()

In [9]:
# Remove Quantity <= 0

df_prep = df_prep[df_prep['Quantity'] > 0]

In [10]:
# Remove Trailing Spaces

df_prep['Description'] = df_prep['Description'].apply(lambda x: x.rstrip())

In [11]:
# Check rown with remaining Trailing Spaces

df_prep[df_prep['Description'].str.endswith(" ")]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country


## Create new product groups (more general)

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yola.kamalita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yola.kamalita/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yola.kamalita/nltk_data...


True

In [13]:
# Load BERT-based model
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')

# Product names
products = df_prep['Description'].unique().tolist()

# Convert product names into embeddings
embeddings = model.encode(products)


In [26]:
# Compute cosine similarity
cosine_sim = cosine_similarity(embeddings)

# Perform clustering
clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='complete', distance_threshold=0.5)
labels = clustering.fit_predict(1 - cosine_sim)  # Convert similarity to distance

# Print grouped products
product_groups = {}
for i, label in enumerate(labels):
    product_groups.setdefault(label, []).append(products[i])

# Stopwords to ignore
stop_words = set(stopwords.words("english")).union({"by", "for", "with", "the", "and", "of", "a", "to", "on", "in"})

# Initialize NLTK's WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text (tokenization, stopword removal, lemmatization)
def preprocess_text(text):
    words = word_tokenize(text.lower())  # Convert to lowercase & tokenize
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]  # Lemmatize & remove stopwords
    return " ".join(words)

# Function to get representative name using TF-IDF
def get_representative_name(product_list):
    processed_products = [preprocess_text(p) for p in product_list]  # Preprocess product names
    
    # Compute TF-IDF scores
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_products)
    
    # Find the highest scoring words
    feature_names = vectorizer.get_feature_names_out()
    scores = np.array(tfidf_matrix.sum(axis=0)).flatten()  # Sum TF-IDF scores per word
    top_words = [feature_names[i] for i in scores.argsort()[-3:]]  # Select top 3 words
    
    return " ".join(top_words).title()  # Format as title case

# Get representative names
representative_names = {group: get_representative_name(items) for group, items in product_groups.items()}



In [28]:
# Convert dictionary to DataFrame
product_groups_df = pd.DataFrame(list(product_groups.items()), columns=['Description Group', 'Description'])
representative_names_df = pd.DataFrame(list(representative_names.items()), columns=['Description Group', 'Description New'])

# Explode list into separate rows
product_groups_df = product_groups_df.explode('Description').reset_index(drop=True)
product_groups_df = pd.merge(product_groups_df, representative_names_df, on="Description Group", how="inner") 

In [29]:
product_groups_df

Unnamed: 0,Description Group,Description,Description New
0,428,15CM CHRISTMAS GLASS BALL 20 LIGHTS,Flock Ball Christmas
1,428,4 PINK FLOCK CHRISTMAS BALLS,Flock Ball Christmas
2,428,4 GOLD FLOCK CHRISTMAS BALLS,Flock Ball Christmas
3,307,PINK CHERRY LIGHTS,Flamingo Cherry Light
4,307,WHITE CHERRY LIGHTS,Flamingo Cherry Light
...,...,...,...
4425,683,BAKING MOULD CHOCOLATE CUPCAKES,Baking Mould Chocolate
4426,683,BAKING MOULD ROSE WHITE CHOCOLATE,Baking Mould Chocolate
4427,683,BAKING MOULD ROSE MILK CHOCOLATE,Baking Mould Chocolate
4428,683,BAKING MOULD CHOCOLATE CUP CAKES,Baking Mould Chocolate


## List of Products (New)

In [30]:
# Create List of Products for each Invoice

df_prep_merged = pd.merge(df_prep, product_groups_df, on="Description", how="inner") 
df_trx = df_prep_merged.groupby("Invoice")["Description New"].apply(list).reset_index()

In [31]:
df_trx.head(10)

Unnamed: 0,Invoice,Description New
0,489434,"[Flock Ball Christmas, Flamingo Cherry Light, ..."
1,489435,"[White Cat Bowl, Ball Design Dog, Heart Measur..."
2,489436,"[Mat Black Flock, Block Building Word, Block B..."
3,489437,"[Christmas Heart Decoration, Christmas Heart D..."
4,489438,"[Dinosaur Set Writing, Flower Animal Sticker, ..."
5,489439,"[Heart Pot Trinket, Baking Retrospot Set, Pc T..."
6,489440,"[White Cat Bowl, Ball Design Dog]"
7,489441,"[Spot Bird Decoration, Baking Retrospot Set, L..."
8,489442,"[Jack Rose Union, Water Hot Bottle, Heart Ivor..."
9,489443,"[Blue Washing Glove, Red Towel Tea, Record Sin..."


In [32]:
df_trx = df_trx.set_index('Invoice')

df_trx.head()

Unnamed: 0_level_0,Description New
Invoice,Unnamed: 1_level_1
489434,"[Flock Ball Christmas, Flamingo Cherry Light, ..."
489435,"[White Cat Bowl, Ball Design Dog, Heart Measur..."
489436,"[Mat Black Flock, Block Building Word, Block B..."
489437,"[Christmas Heart Decoration, Christmas Heart D..."
489438,"[Dinosaur Set Writing, Flower Animal Sticker, ..."


In [34]:
# Encode List of Products for each Invoice

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform the column
df_trx_encode = mlb.fit_transform(df_trx["Description New"])

# Convert to DataFrame with proper column names
df_trx_encode = pd.DataFrame(df_trx_encode, columns=mlb.classes_, index=df_trx.index)

In [35]:
df_trx_encode.head()

Unnamed: 0_level_0,Acrylic Bangle Faceted,Acrylic Geometric Lamp,Aid First Tin,Airline Vintage Bag,Alarm Bakelike Clock,Alphabet Iron Patch,Animal Crocheted Japanese,Animal Farm Felt,Ant Bracelet Boudicca,Antique Edwardian Dresser,...,Woven Cover Cushion,Wrap Apple Red,Wrap Dolly Girl,Wrap London Love,Wrap Red Retrospot,Wreath Gingham Heart,Writing Balloon Set,Yellow Birdfeeder Chalet,Zinc Metal Heart,Zinc Stick Candle
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Number of transactions per product
df_trx_encode.sum().head(30).sort_values(ascending=False)

Antique Wood White            2117
Aid First Tin                  579
Assorted Fridge Magnet         532
Airline Vintage Bag            507
Assorted Frutti Tutti          490
Assorted Cone Party            486
Babushka Doorstop Gingham      368
Alarm Bakelike Clock           223
Assorted Colour Flower         219
Assorted Colour Teaspoon       193
Animal Crocheted Japanese      168
As Col Sand                    157
Assorted Crawlies Creepy       148
Asstd Design Pen               132
Assorted Circular Mobile       131
Assorted Floral Secateurs      121
Alphabet Iron Patch             94
Ant Bracelet Boudicca           59
Animal Farm Felt                58
Acrylic Geometric Lamp          45
Assorted Bucket Farmyard        32
Baby Bib Carousel               30
Antique Edwardian Dresser       23
Army Camo Tape                  17
Acrylic Bangle Faceted          16
Artiifcial Flower Foxglove      13
Art Canvas Picture              13
Asstd Col                        7
Arboretum English La

# Modelling

In [37]:
from mlxtend.frequent_patterns import apriori

In [38]:
frequent_itemsets = apriori(df_trx_encode, min_support=0.025, use_colnames=True) 



In [39]:
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
70,0.291491,(Glass Hanging Holder)
101,0.167005,(Pack Case Cake)
177,0.166276,(Water Hot Bottle)
99,0.148686,(Oval Trinket Box)
178,0.129846,(Way Sign Metal)
...,...,...
217,0.025085,"(Chocolate Hand Sign, Laundry Metal Sign)"
184,0.025085,(Wrap London Love)
200,0.025085,"(Bag Red Retrospot, Pack Paper Napkin)"
210,0.025085,"(Blue Car Lunch, Spotty Red Bag)"


In [40]:
frequent_itemsets[frequent_itemsets["itemsets"].apply(len) >= 2]

Unnamed: 0,support,itemsets
188,0.025449,"(Antique Wood White, Block Building Word)"
189,0.058964,"(Glass Hanging Holder, Antique Wood White)"
190,0.026646,"(Antique Wood White, Laundry Metal Sign)"
191,0.026958,"(Antique Wood White, Water Hot Bottle)"
192,0.028832,"(Way Sign Metal, Antique Wood White)"
...,...,...
288,0.035597,"(Sympathy Tea Water, Water Hot Bottle)"
289,0.035337,"(Way Sign Metal, Tea Metal Sign)"
290,0.034244,"(Way Sign Metal, Water Hot Bottle)"
291,0.026698,"(Bag Red Retrospot, Design Bag Suki, Spotty Re..."


In [41]:
from mlxtend.frequent_patterns import association_rules

In [42]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5) 

In [43]:
rules.sort_values(by='support', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
24,(Laundry Metal Sign),(Way Sign Metal),0.117929,0.129846,0.067031,0.568402,4.377496,1.0,0.051718,2.016123,0.874713,0.370861,0.503998,0.542317
23,(Way Sign Metal),(Laundry Metal Sign),0.129846,0.117929,0.067031,0.516232,4.377496,1.0,0.051718,1.823337,0.886693,0.370861,0.451555,0.542317
0,(Antique Wood White),(Glass Hanging Holder),0.110174,0.291491,0.058964,0.535191,1.836047,1.0,0.02685,1.524302,0.511731,0.172058,0.343962,0.368738
25,(Paper Set Doily),(Pack Case Cake),0.075618,0.167005,0.04866,0.643496,3.853157,1.0,0.036031,2.336567,0.801046,0.250872,0.572022,0.467432
4,(Block Building Word),(Glass Hanging Holder),0.087119,0.291491,0.047983,0.550777,1.889515,1.0,0.022589,1.577186,0.51569,0.145128,0.365959,0.357695
18,(Star Heart Wicker),(Glass Hanging Holder),0.086911,0.291491,0.047671,0.548503,1.881715,1.0,0.022337,1.569244,0.51317,0.144138,0.36275,0.356023
30,(Shopper Bag Strawberry),(Spotty Red Bag),0.072339,0.126464,0.040958,0.566187,4.477072,1.0,0.031809,2.013624,0.837202,0.259479,0.503383,0.445028
14,(Flower Hanging Heart),(Glass Hanging Holder),0.064533,0.291491,0.039552,0.612903,2.102649,1.0,0.020742,1.830315,0.560586,0.124979,0.453646,0.374297
2,(Bag Design Spaceboy),(Design Bag Suki),0.066667,0.108405,0.038876,0.583138,5.379261,1.0,0.031649,2.138826,0.872251,0.285441,0.532454,0.470878
1,(Bag Design Spaceboy),(Bag Red Retrospot),0.066667,0.122508,0.036794,0.551913,4.505098,1.0,0.028627,1.958304,0.833603,0.241462,0.489354,0.426126


In [47]:
product_groups_df[product_groups_df['Description New'].isin(['Way Sign Metal','Laundry Metal Sign'])]

Unnamed: 0,Description Group,Description,Description New
118,168,AREA PATROLLED METAL SIGN,Laundry Metal Sign
119,168,BATHROOM METAL SIGN,Laundry Metal Sign
120,168,LAUNDRY 15C METAL SIGN,Laundry Metal Sign
121,168,"AIRLINE LOUNGE,METAL SIGN",Laundry Metal Sign
122,168,NO JUNK MAIL METAL SIGN,Laundry Metal Sign
123,168,POTTERING IN THE SHED METAL SIGN,Laundry Metal Sign
124,168,OPEN CLOSED METAL SIGN,Laundry Metal Sign
125,168,GARDEN METAL SIGN,Laundry Metal Sign
126,168,HOT BATHS METAL SIGN,Laundry Metal Sign
127,168,KITCHEN METAL SIGN,Laundry Metal Sign
