# Group Project - Recommender Systems

### Reading the data and importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, pairwise_distances
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from sklearn.decomposition import NMF
import re

In [47]:
# Reading the dataset
# The dataset is from Online Retail, which contains transactions from a online retail store.
df = pd.read_csv("OnlineRetail.csv", sep=",", encoding="ISO-8859-1")
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


## Data preparation

In [48]:
dataframe = df.copy()

# Check structure and null values
print(dataframe.info())
print(dataframe.isnull().sum())

# Drop rows with missing CustomerID (we need it for recommendations)
dataframe = dataframe.dropna(subset=["CustomerID"])

# Keep only positive transactions
dataframe = dataframe[dataframe['Quantity'] > 0]

# Convert InvoiceDate to datetime
dataframe['InvoiceDate'] = pd.to_datetime(dataframe['InvoiceDate'], format='%m/%d/%Y %H:%M')

# Create a new column for total price
dataframe['TotalPrice'] = dataframe['Quantity'] * dataframe['UnitPrice']

# Create a new column for the date without time
dataframe['Date'] = dataframe['InvoiceDate'].dt.date

# Create a new column for the month
dataframe['Month'] = dataframe['InvoiceDate'].dt.to_period('M')

# Create a new column for the year
dataframe['Year'] = dataframe['InvoiceDate'].dt.year

# Create a new column for the hour
dataframe['Hour'] = dataframe['InvoiceDate'].dt.hour

# Create a new column for the day of the week
dataframe['DayOfWeek'] = dataframe['InvoiceDate'].dt.day_name()

# Convert CustomerID to string
dataframe['CustomerID'] = dataframe['CustomerID'].astype(int).astype(str)
dataframe


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
None
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Date,Month,Year,Hour,DayOfWeek
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.30,2010-12-01,2010-12,2010,8,Wednesday
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12-01,2010-12,2010,8,Wednesday
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.00,2010-12-01,2010-12,2010,8,Wednesday
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12-01,2010-12,2010,8,Wednesday
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12-01,2010-12,2010,8,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France,10.20,2011-12-09,2011-12,2011,12,Friday
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680,France,12.60,2011-12-09,2011-12,2011,12,Friday
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France,16.60,2011-12-09,2011-12,2011,12,Friday
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,16.60,2011-12-09,2011-12,2011,12,Friday


In [49]:
# Number of unique customers
num_customers = dataframe['CustomerID'].nunique()
print(f"Unique customers: {num_customers}")

# Number of unique products (items)
num_items = dataframe['StockCode'].nunique()
print(f"Unique products: {num_items}")


Unique customers: 4339
Unique products: 3665


In [None]:
#Hand encoding the categories based on keywords in the description

def contains_word(text, words):
    return any(re.search(rf'\b{word}\b', text) for word in words)

def assign_category_keyword(description):
    desc = description.lower()

    if contains_word(desc, ['light', 'lantern', 'lamp', 'candle']):
        return 'Lighting'
    elif contains_word(desc, ['mug', 'cup', 'plate', 'spoon', 'fork', 'kitchen', 'bottle']):
        return 'Kitchen'
    elif contains_word(desc, ['cushion', 'blanket', 'hottie', 'wool', 'scarf', 'hot water bottle', 'throw']):
        return 'Textiles'
    elif contains_word(desc, ['pen', 'notebook', 'pencil', 'stationery', 'sharpener', 'diary']):
        return 'Stationery'
    elif contains_word(desc, ['toy', 'game', 'puzzle', 'ball', 'doll', 'playhouse']):
        return 'Toys'
    elif contains_word(desc, ['plant', 'garden', 'pot', 'shovel', 'watering']):
        return 'Garden'
    elif contains_word(desc, ['hanger', 'coat', 'rack', 'storage', 'box', 'drawer']):
        return 'Storage'
    elif contains_word(desc, ['soap', 'bath', 'towel', 'sponge', 'shampoo', 'lotion']):
        return 'Bath'
    elif contains_word(desc, ['frame', 'mirror', 'decorative', 'ornament', 'heart', 'sign', 'plaque']):
        return 'House Decoration'
    elif contains_word(desc, ['wrap', 'wrapping', 'gift wrap', 'roll', 'paper', 'tag', 'ribbon']):
        return 'Gift Wrapping'
    elif contains_word(desc, ['bag', 'gift bag', 'tote', 'jute']):
        return 'Gift Bags & Packaging'
    elif contains_word(desc, ['necklace', 'pendant', 'choker', 'bead', 'strand', 'bracelet', 'bangle', 'cuff', 'ring']):
        return 'Jewelryy'
    elif contains_word(desc, ['keyring', 'key ring', 'trinket']):
        return 'Keychains & Trinkets'
    elif contains_word(desc, ['christmas', 'tree', 'xmas', 'reindeer', 'snowflake']):
        return 'Christmas'
    elif contains_word(desc, ['party', 'balloon', 'bunting', 'confetti', 'birthday']):
        return 'Party Supplies'
    elif contains_word(desc, ['clock', 'wall art', 'print', 'poster']):
        return 'Wall Art & Clocks'
    else:
        return 'Other'


dataframe['Category'] = dataframe['Description'].apply(assign_category_keyword)

dataframe

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Date,Month,Year,Hour,DayOfWeek,Category
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.30,2010-12-01,2010-12,2010,8,Wednesday,Lighting
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12-01,2010-12,2010,8,Wednesday,Lighting
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.00,2010-12-01,2010-12,2010,8,Wednesday,Storage
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12-01,2010-12,2010,8,Wednesday,Kitchen
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12-01,2010-12,2010,8,Wednesday,Textiles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France,10.20,2011-12-09,2011-12,2011,12,Friday,Other
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680,France,12.60,2011-12-09,2011-12,2011,12,Friday,Other
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France,16.60,2011-12-09,2011-12,2011,12,Friday,Other
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,16.60,2011-12-09,2011-12,2011,12,Friday,Other


In [51]:
dataframe.to_csv("dataframe_with_cat.csv", index=False)

# Personalized recommender 
### (Defining CustomerID x StockCode Matrix)

In [52]:
# Creating a pivot table for the customer-item matrix using Quantity
customer_item_matrix = dataframe.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum',
    fill_value=0
)

# Display the shape of the customer-item matrix 
print(f"Customer-Item Matrix Shape: {customer_item_matrix.shape}")

# Display the first few rows of the customer-item matrix
customer_item_matrix.head()


Customer-Item Matrix Shape: (4339, 3665)


StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
12349,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Item based

### Computing similarites using Cosine, Euclidean and Manhattan

In [53]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

# Transpose the matrix to have products as rows
item_matrix = customer_item_matrix.T

# Calculate similarities
cosine_sim = cosine_similarity(item_matrix)
euclidean_sim = 1 / (1 + euclidean_distances(item_matrix))
manhattan_sim = 1 / (1 + manhattan_distances(item_matrix))

# Convert to DataFrames
cosine_df = pd.DataFrame(cosine_sim, index=item_matrix.index, columns=item_matrix.index)
euclidean_df = pd.DataFrame(euclidean_sim, index=item_matrix.index, columns=item_matrix.index)
manhattan_df = pd.DataFrame(manhattan_sim, index=item_matrix.index, columns=item_matrix.index)

In [54]:
def recommend_items_for_user(
    customer_id,
    customer_item_matrix,
    similarity_df,
    dataframe,
    top_n=5,
    unit_price_tolerance=10,
    filter_country=False,
    filter_time=False,
    filter_category=None
):
    if customer_id not in customer_item_matrix.index:
        return f"Customer {customer_id} not found."

    # Get the products the user has interacted with (i.e., bought)
    user_row = customer_item_matrix.loc[customer_id]
    purchased_items = user_row[user_row > 0].index.tolist()
    
    if not purchased_items:
        return f"Customer {customer_id} has no purchases."
    
    # Get user info for filtering
    user_country = dataframe[dataframe['CustomerID'] == customer_id]['Country'].iloc[0]
    user_avg_price = dataframe[(dataframe['CustomerID'] == customer_id) & (dataframe['Quantity'] > 0)]['UnitPrice'].mean()
    user_days = dataframe[(dataframe['CustomerID'] == customer_id) & (dataframe['Quantity'] > 0)]['DayOfWeek'].mode()

    # Dictionary to store aggregated scores for similar items
    scores = {}

    for item in purchased_items:
        # Get top similar items to this one
        similar_items = similarity_df[item].drop(index=item)
        
        for similar_item, score in similar_items.items():
            if similar_item in purchased_items:
                continue  # Skip items the user already bought
            scores[similar_item] = scores.get(similar_item, 0) + score  # Aggregate similarity scores

    # Sort scores in descending order
    ranked_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    # Filter recommendations by unit price, country, and optionally time
    recommended_items = []
    for item, score in ranked_items:
        item_df = dataframe[dataframe['StockCode'] == item]
        # Unit price filter
        item_avg_price = item_df['UnitPrice'].mean()
        if abs(item_avg_price - user_avg_price) > unit_price_tolerance:
            continue
        
        # Country filter
        if filter_country:
            item_countries = item_df['Country'].unique()
            if user_country not in item_countries:
                continue

        # Time filter (optional, e.g., recommend items bought at similar hour/day)
        if filter_time:
            item_days = item_df['DayOfWeek'].mode()
            if not (any(d in user_days.values for d in item_days.values)):
                continue

        # Category filter
        if filter_category is not None:
            item_categories = item_df['Category'].unique()
            if filter_category not in item_categories:
                continue


        # Get the description and stock code for the item
        description = item_df['Description'].iloc[0] if not item_df.empty else ""
        recommended_items.append({"StockCode": item, "Description": description})
        if len(recommended_items) == top_n:
            break

    return recommended_items


In [55]:
recommendations = recommend_items_for_user(
    customer_id='12347',
    customer_item_matrix=customer_item_matrix,
    similarity_df=cosine_df,  
    dataframe=dataframe,
    top_n=5,                  # number of recommendations
    unit_price_tolerance=10,   
    filter_country=False,   #too restrictive when set to True
    filter_time=True,
    filter_category='Kitchen'
)

print("Recommended items:", recommendations)

Recommended items: [{'StockCode': '21244', 'Description': 'BLUE POLKADOT PLATE '}, {'StockCode': '21240', 'Description': 'BLUE POLKADOT CUP'}, {'StockCode': '84817', 'Description': 'DANISH ROSE DECORATIVE PLATE'}, {'StockCode': '23183', 'Description': "MOTHER'S KITCHEN SPOON REST "}, {'StockCode': '22522', 'Description': 'CHILDS GARDEN FORK BLUE '}]


In [56]:
recommendations = recommend_items_for_user(
    customer_id='12347',
    customer_item_matrix=customer_item_matrix,
    similarity_df=euclidean_df, 
    dataframe=dataframe,
    top_n=5,                  
    unit_price_tolerance=10,
    filter_country=False,      
    filter_time=True,
    filter_category='House Decoration'
)

print("Recommended items:", recommendations)


Recommended items: [{'StockCode': '16207B', 'Description': 'PINK HEART RED HANDBAG'}, {'StockCode': '90003E', 'Description': 'GREEN PAIR HEART HAIR SLIDES'}, {'StockCode': '90197B', 'Description': 'BLACK GLASS BRACELET W HEART CHARMS'}, {'StockCode': '20820', 'Description': 'SILVER LOOKING MIRROR'}, {'StockCode': '21228', 'Description': 'POCKET MIRROR "GLAMOROUS"'}]


In [57]:
recommendations = recommend_items_for_user(
    customer_id='12347',
    customer_item_matrix=customer_item_matrix,
    similarity_df=manhattan_df, 
    dataframe=dataframe,
    top_n=5,                  
    unit_price_tolerance=10,  
    filter_country=False,      
    filter_time=True,
    filter_category='Lighting'
)

print("Recommended items:", recommendations)


Recommended items: [{'StockCode': '84499', 'Description': 'BLACK FLOWER CANDLE PLATE'}, {'StockCode': '85170A', 'Description': 'SET/6 IVORY BIRD T-LIGHT CANDLES'}, {'StockCode': '47016', 'Description': 'LIGHT DECORATION BATTERY OPERATED'}, {'StockCode': '79157B', 'Description': 'UBO-LIGHT TRIOBASE BLUE'}, {'StockCode': '79157V', 'Description': 'UBO-LIGHT TRIOBASE PURPLE'}]


### Evaluating model

In [58]:
import random
def train_test_split_per_user(matrix, test_size=0.2, seed=42):
    train = matrix.copy()
    test = pd.DataFrame(0, index=matrix.index, columns=matrix.columns)
    random.seed(seed)

    for user in matrix.index:
        purchased_items = matrix.loc[user]
        purchased_items = purchased_items[purchased_items > 0].index.tolist()

        if len(purchased_items) < 2:
            continue  # Skip users with too few purchases

        test_items = random.sample(purchased_items, max(1, int(len(purchased_items) * test_size)))

        for item in test_items:
            train.loc[user, item] = 0
            test.loc[user, item] = matrix.loc[user, item]

    return train, test


In [59]:
train_matrix, test_matrix = train_test_split_per_user(customer_item_matrix, test_size=0.2)

In [60]:
def recommend_from_train(customer_id, train_matrix, similarity_df, top_n=10):
    if customer_id not in train_matrix.index:
        return []

    user_row = train_matrix.loc[customer_id]
    purchased_items = user_row[user_row > 0].index.tolist()

    scores = {}

    for item in purchased_items:
        # Get similar items to the one the user bought
        similar_items = similarity_df[item].drop(index=item)

        for similar_item, score in similar_items.items():
            if similar_item in purchased_items:
                continue  # Skip items already purchased
            scores[similar_item] = scores.get(similar_item, 0) + score

    # Sort by aggregated similarity score
    ranked_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    return [item for item, _ in ranked_items[:top_n]]


In [None]:
from tqdm import tqdm

def precision_recall_at_k(train_matrix, test_matrix, similarity_df, k=5, max_users=100):
    precisions = []
    recalls = []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating Precision & Recall"):
        # Ground truth
        true_items = test_matrix.loc[user]
        true_items = set(true_items[true_items > 0].index)

        if not true_items:
            continue

        # Recommendations
        recommended = recommend_from_train(user, train_matrix, similarity_df, top_n=k)
        recommended_set = set(recommended)

        hits = recommended_set & true_items

        precision = len(hits) / k
        recall = len(hits) / len(true_items)

        precisions.append(precision)
        recalls.append(recall)

    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)

    return avg_precision, avg_recall



In [62]:
precision_cosine, recall_cosine = precision_recall_at_k(
    train_matrix, test_matrix, cosine_df, k=5, max_users=500
)

print(f"Cosine Similarity - Precision: {precision_cosine:.4f}, Recall: {recall_cosine:.4f}")


Evaluating Precision & Recall: 100%|██████████| 500/500 [04:54<00:00,  1.70it/s]

Cosine Similarity - Precision: 0.0859, Recall: 0.0672





In [63]:
precision_euclidean, recall_euclidean = precision_recall_at_k(train_matrix, test_matrix, euclidean_df, k=5, max_users=500)
print(f"Euclidean Similarity - Precision: {precision_euclidean:.4f}, Recall: {recall_euclidean:.4f}")


Evaluating Precision & Recall: 100%|██████████| 500/500 [04:21<00:00,  1.91it/s]

Euclidean Similarity - Precision: 0.0173, Recall: 0.0136





In [64]:
precision_manhattan, recall_manhattan = precision_recall_at_k(train_matrix, test_matrix, manhattan_df, k=5, max_users=500)
print(f"Manhattan Similarity - Precision: {precision_manhattan:.4f}, Recall: {recall_manhattan:.4f}")


Evaluating Precision & Recall: 100%|██████████| 500/500 [05:21<00:00,  1.56it/s]

Manhattan Similarity - Precision: 0.0077, Recall: 0.0069





## User based

### Computing similarites using Cosine, Euclidean and Manhattan

In [65]:
# Use the customer-item matrix where rows are users and columns are items
user_matrix = customer_item_matrix

# Compute cosine similarity between users
cosine_sim_users = cosine_similarity(user_matrix)
cosine_users_df = pd.DataFrame(cosine_sim_users, index=user_matrix.index, columns=user_matrix.index)

# Compute Euclidean similarity (1 / (1 + distance))
euclidean_sim_users = 1 / (1 + euclidean_distances(user_matrix))
euclidean_users_df = pd.DataFrame(euclidean_sim_users, index=user_matrix.index, columns=user_matrix.index)

# Compute Manhattan similarity (1 / (1 + distance))
manhattan_sim_users = 1 / (1 + manhattan_distances(user_matrix))
manhattan_users_df = pd.DataFrame(manhattan_sim_users, index=user_matrix.index, columns=user_matrix.index)


In [66]:
def recommend_user_based(
    customer_id,
    train_matrix,
    similarity_df,
    dataframe,
    top_k_neighbors=5,
    top_n_items=5,
    unit_price_tolerance=10,
    filter_country=False,
    filter_time=False,
    filter_category=None
):
    if customer_id not in train_matrix.index:
        return []

    # Get similarity scores for the target user
    similar_users = similarity_df[customer_id].drop(index=customer_id)
    top_users = similar_users.sort_values(ascending=False).head(top_k_neighbors).index

    # Filter similar users by country
    user_country = dataframe[dataframe['CustomerID'] == customer_id]['Country'].iloc[0]
    if filter_country:
        top_users = [u for u in top_users if dataframe[dataframe['CustomerID'] == u]['Country'].iloc[0] == user_country]

    # Filter by day of week only
    if filter_time:
        user_days = dataframe[(dataframe['CustomerID'] == customer_id) & (dataframe['Quantity'] > 0)]['DayOfWeek'].mode()
        filtered_users = []
        for u in top_users:
            u_days = dataframe[(dataframe['CustomerID'] == u) & (dataframe['Quantity'] > 0)]['DayOfWeek'].mode()
            if any(d in user_days.values for d in u_days.values):
                filtered_users.append(u)
        top_users = filtered_users

    # Filter by unit price similarity
    user_avg_price = dataframe[(dataframe['CustomerID'] == customer_id) & (dataframe['Quantity'] > 0)]['UnitPrice'].mean()
    if unit_price_tolerance is not None:
        filtered_users = []
        for u in top_users:
            u_avg_price = dataframe[(dataframe['CustomerID'] == u) & (dataframe['Quantity'] > 0)]['UnitPrice'].mean()
            if abs(u_avg_price - user_avg_price) <= unit_price_tolerance:
                filtered_users.append(u)
        top_users = filtered_users

    # Aggregate the purchases from top similar users
    if not top_users:
        return []
    similar_users_purchases = train_matrix.loc[top_users]
    user_purchase_vector = train_matrix.loc[customer_id]

    # Weighted sum of items (optionally use similarity scores as weights)
    item_scores = similar_users_purchases.sum(axis=0)

    # Remove items the user has already purchased
    items_already_bought = user_purchase_vector[user_purchase_vector > 0].index
    item_scores = item_scores.drop(labels=items_already_bought, errors='ignore')

    # Prepare recommendations with filters and output format
    recommended_items = []
    for item in item_scores.sort_values(ascending=False).index:
        item_df = dataframe[dataframe['StockCode'] == item]
        item_avg_price = item_df['UnitPrice'].mean()
        if abs(item_avg_price - user_avg_price) > unit_price_tolerance:
            continue
        if filter_country:
            item_countries = item_df['Country'].unique()
            if user_country not in item_countries:
                continue
        if filter_time:
            item_days = item_df['DayOfWeek'].mode()
            if not any(d in user_days.values for d in item_days.values):
                continue
        if filter_category is not None:
            item_categories = item_df['Category'].unique()
            if filter_category not in item_categories:
                continue
        description = item_df['Description'].iloc[0] if not item_df.empty else ""
        recommended_items.append({"StockCode": item, "Description": description})
        if len(recommended_items) == top_n_items:
            break

    return recommended_items

In [67]:
recommend_user_based(
    customer_id='12347',
    train_matrix=train_matrix,
    similarity_df=cosine_users_df,
    dataframe=dataframe,
    top_k_neighbors=5,
    top_n_items=5,
    unit_price_tolerance=10,
    filter_country=False,
    filter_time=False,
    filter_category='Kitchen'
)

[{'StockCode': '23419', 'Description': 'HOME SWEET HOME BOTTLE '},
 {'StockCode': '22441', 'Description': 'GROW YOUR OWN BASIL IN ENAMEL MUG'},
 {'StockCode': '37413', 'Description': 'ICON MUG REVOLUTIONARY'},
 {'StockCode': '37423', 'Description': 'WHITE WITH BLACK CATS PLATE'},
 {'StockCode': '37444A', 'Description': 'YELLOW BREAKFAST CUP AND SAUCER'}]

In [68]:
recommend_user_based(
    customer_id='12347',
    train_matrix=train_matrix,
    similarity_df=euclidean_users_df,
    dataframe=dataframe,
    top_k_neighbors=5,
    top_n_items=5,
    unit_price_tolerance=10,
    filter_country=False,
    filter_time=False,
    filter_category='House Decoration'
)

[{'StockCode': '22469', 'Description': 'HEART OF WICKER SMALL'},
 {'StockCode': '22297', 'Description': 'HEART IVORY TRELLIS SMALL'},
 {'StockCode': '84988', 'Description': 'SET OF 72 PINK HEART PAPER DOILIES'},
 {'StockCode': '70006', 'Description': 'LOVE HEART POCKET WARMER'},
 {'StockCode': '82482', 'Description': 'WOODEN PICTURE FRAME WHITE FINISH'}]

In [69]:
recommend_user_based(
    customer_id='12347',
    train_matrix=train_matrix,
    similarity_df=manhattan_users_df,
    dataframe=dataframe,
    top_k_neighbors=5,
    top_n_items=5,
    unit_price_tolerance=10,
    filter_country=False,
    filter_time=False,
    filter_category='Lighting'
)

[{'StockCode': '37491B', 'Description': 'BLUE/YELLOW CERAMIC CANDLE HOLDER'},
 {'StockCode': '37491C', 'Description': 'GREEN/BLUE CERAMIC CANDLE HOLDER'},
 {'StockCode': '37491D', 'Description': 'PURPLE/BLUE CERAMIC CANDLE HOLDER'},
 {'StockCode': '37495', 'Description': 'FAIRY CAKE BIRTHDAY CANDLE SET'},
 {'StockCode': '35966', 'Description': 'FOLKART CHRISTMAS TREE T-LIGHT HOLD'}]

**NOTE:** Cosine and Euclidean similarities produced almost identical recommendations for user `12347`, sharing items `22492`, `23077`, `22418`, and `22693`. In contrast, Manhattan similarity suggested a completely different set of items.


### Evaluating model

In [72]:
from tqdm import tqdm

def precision_recall_at_k_user_based(train_matrix, test_matrix, similarity_df, recommender_func, k=5, max_users=100):
    precisions = []
    recalls = []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating user-based Precision & Recall"):
        # Ground truth from test matrix
        true_items = test_matrix.loc[user]
        true_items = set(true_items[true_items > 0].index)

        if not true_items:
            continue

        # Get recommendations from user-based recommender
        recommended = recommender_func(
            customer_id=user,
            train_matrix=train_matrix,
            similarity_df=similarity_df,
            dataframe=dataframe,
            top_k_neighbors=5,
            top_n_items=k
        )

        recommended_set = set([(rec["StockCode"], rec["Description"]) for rec in recommended])
        hits = recommended_set & true_items

        precision = len(hits) / k
        recall = len(hits) / len(true_items)

        precisions.append(precision)
        recalls.append(recall)

    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)

    return avg_precision, avg_recall


In [73]:
precision_user_cosine, recall_user_cosine = precision_recall_at_k_user_based(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    similarity_df=cosine_users_df,
    recommender_func=recommend_user_based,
    k=5,
    max_users=4339
)

print(f"User-Based (Cosine) - Precision: {precision_user_cosine:.4f}, Recall: {recall_user_cosine:.4f}")


Evaluating user-based Precision & Recall: 100%|██████████| 4339/4339 [1:15:29<00:00,  1.04s/it]  

User-Based (Cosine) - Precision: 0.0000, Recall: 0.0000





In [74]:
precision_user_euclidean, recall_user_euclidean = precision_recall_at_k_user_based(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    similarity_df=euclidean_users_df,
    recommender_func=recommend_user_based,
    k=5,
    max_users=4339
)

print(f"User-Based (Euclidean) - Precision: {precision_user_euclidean:.4f}, Recall: {recall_user_euclidean:.4f}")


Evaluating user-based Precision & Recall: 100%|██████████| 4339/4339 [37:55<00:00,  1.91it/s]    

User-Based (Euclidean) - Precision: 0.0000, Recall: 0.0000





In [75]:
precision_user_manhattan, recall_user_manhattan = precision_recall_at_k_user_based(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    similarity_df=manhattan_users_df,
    recommender_func=recommend_user_based,
    k=5,
    max_users=2000
)

print(f"User-Based (Manhattan) - Precision: {precision_user_manhattan:.4f}, Recall: {recall_user_manhattan:.4f}")


Evaluating user-based Precision & Recall: 100%|██████████| 2000/2000 [12:23<00:00,  2.69it/s] 

User-Based (Manhattan) - Precision: 0.0000, Recall: 0.0000





## Matrix Factorization

In [117]:
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Define the number of latent features
n_components = 50

# Fit the SVD model on the training matrix
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd_matrix = svd.fit_transform(train_matrix)

# Reconstruct the approximated user-item matrix
reconstructed_matrix = np.dot(svd_matrix, svd.components_)

# Convert the result to a DataFrame (to match index/columns with train_matrix)
reconstructed_df = pd.DataFrame(
    reconstructed_matrix,
    index=train_matrix.index,
    columns=train_matrix.columns
)

# Function to compute RMSE using test data
def compute_rmse_svd(test_matrix, predicted_matrix):
    actuals = []
    predictions = []

    for user in test_matrix.index:
        for item in test_matrix.columns:
            true_val = test_matrix.loc[user, item]
            if true_val > 0:
                pred_val = predicted_matrix.loc[user, item]
                actuals.append(true_val)
                predictions.append(pred_val)

    return np.sqrt(mean_squared_error(actuals, predictions))

# Calculate RMSE
rmse_svd = compute_rmse_svd(test_matrix, reconstructed_df)
print(f"Truncated SVD - RMSE: {rmse_svd:.4f}")


Truncated SVD - RMSE: 98.8392


In [124]:
from tqdm import tqdm

def precision_recall_at_k_svd(train_matrix, test_matrix, predicted_matrix, k=5, max_users=100):
    precisions = []
    recalls = []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating Precision & Recall (SVD)"):
        # True purchased items in test set
        true_items = test_matrix.loc[user]
        true_items = set(true_items[true_items > 0].index)

        if not true_items:
            continue

        # Items the user has already seen in training
        known_items = set(train_matrix.loc[user][train_matrix.loc[user] > 0].index)

        user_predictions = predicted_matrix.loc[user].drop(labels=known_items, errors='ignore')
        recommended_items = user_predictions.sort_values(ascending=False).index

        # Country filter
        user_country = dataframe[dataframe['CustomerID'] == user]['Country'].iloc[0]
        filtered_items = []
        for item in recommended_items:
            item_countries = dataframe[dataframe['StockCode'] == item]['Country'].unique()
            if user_country in item_countries:
                filtered_items.append(item)
            if len(filtered_items) == k:
                break

        recommended_set = set(filtered_items)

        # Calculate hits
        hits = recommended_set & true_items

        # Precision and Recall
        precision = len(hits) / k
        recall = len(hits) / len(true_items)

        precisions.append(precision)
        recalls.append(recall)

    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)

    return avg_precision, avg_recall


In [125]:
precision_svd, recall_svd = precision_recall_at_k_svd(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    predicted_matrix=reconstructed_df,
    k=5,
    max_users=100
)

print(f"SVD - Precision: {precision_svd:.4f}, Recall: {recall_svd:.4f}")

Evaluating Precision & Recall (SVD): 100%|██████████| 100/100 [02:30<00:00,  1.50s/it]

SVD - Precision: 0.1354, Recall: 0.0898





# Saving results and dataframes for streamlit

In [126]:
import os
os.makedirs("outputs", exist_ok=True)

# Save matrices and important objects
customer_item_matrix.to_pickle("outputs/customer_item_matrix.pkl")
train_matrix = customer_item_matrix.copy()  # just to keep naming consistent
train_matrix.to_pickle("outputs/train_matrix.pkl")

# Save cosine similarity matrix (replace with cosine_users_df or cosine_items_df as needed)
cosine_df.to_pickle("outputs/cosine_df.pkl")

# Save the SVD (matrix factorization) predicted matrix
reconstructed_df.to_pickle("outputs/svd_predicted_matrix.pkl")
print("SVD (matrix factorization) predicted matrix saved successfully.")

print("All matrices saved successfully.")


SVD (matrix factorization) predicted matrix saved successfully.
All matrices saved successfully.
