In [4]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
# Loading the merged dataset from Task 1
# "Transformed_Transactions.csv" is generated by Task 1 and must be in the notebook's
# working directory. A relative path is used (instead of a user-specific absolute path)
# so the notebook runs unchanged on any machine.
transactions_full = pd.read_csv("Transformed_Transactions.csv")

# Displaying the first few rows of the dataset
print("Loaded Dataset:")
display(transactions_full.head())


Loaded Dataset:


Unnamed: 0,TransactionID,CustomerID,Region,Brand,ProductName,Category,Price,TransactionDate,Month,Hour
0,T00001,C0199,Europe,ComfortLiving,Bluetooth Speaker,Electronics,300.68,2024-08-25 12:38:23,8,12
1,T00112,C0146,Asia,ComfortLiving,Bluetooth Speaker,Electronics,300.68,2024-05-27 22:23:54,5,22
2,T00166,C0127,Europe,ComfortLiving,Bluetooth Speaker,Electronics,300.68,2024-04-25 07:38:55,4,7
3,T00272,C0087,South America,ComfortLiving,Bluetooth Speaker,Electronics,300.68,2024-03-26 22:55:37,3,22
4,T00363,C0070,Europe,ComfortLiving,Bluetooth Speaker,Electronics,300.68,2024-03-21 15:10:10,3,15


In [6]:
# Per-customer activity summary: summed Price and number of transactions
customer_profiles = (
    transactions_full
    .groupby("CustomerID")
    .agg(
        TotalSpend=("Price", "sum"),
        TransactionCount=("TransactionID", "count"),
    )
    .reset_index()
)

# Preference features: each row is a customer, each value the share of that
# customer's transactions falling in the given category / brand (row-normalized)
category_prefs = pd.crosstab(
    index=transactions_full["CustomerID"],
    columns=transactions_full["Category"],
    normalize="index",
)
brand_prefs = pd.crosstab(
    index=transactions_full["CustomerID"],
    columns=transactions_full["Brand"],
    normalize="index",
)

# Attach the category shares first, then the brand shares, keyed on CustomerID
customer_profiles = customer_profiles.merge(category_prefs, on="CustomerID", how="left")
customer_profiles = customer_profiles.merge(brand_prefs, on="CustomerID", how="left")

# Preview of the assembled profiles
print("Customer Profiles:")
display(customer_profiles.head())


Customer Profiles:


Unnamed: 0,CustomerID,TotalSpend,TransactionCount,Books,Clothing,Electronics,Home Decor,ActiveWear,BookWorld,ComfortLiving,HomeSense,SoundWave,TechPro
0,C0001,1391.67,5,0.2,0.0,0.6,0.2,0.2,0.0,0.0,0.2,0.4,0.2
1,C0002,835.68,4,0.0,0.5,0.0,0.5,0.0,0.5,0.25,0.0,0.0,0.25
2,C0003,782.83,4,0.0,0.25,0.25,0.5,1.0,0.0,0.0,0.0,0.0,0.0
3,C0004,1925.09,8,0.375,0.0,0.25,0.375,0.25,0.25,0.0,0.0,0.125,0.375
4,C0005,874.81,3,0.0,0.0,0.666667,0.333333,0.333333,0.0,0.333333,0.0,0.0,0.333333


## Standardizing Features

In [7]:
# Every column after the first (CustomerID) is a numeric feature to be scaled
feature_frame = customer_profiles.iloc[:, 1:]

# Fit the scaler on the feature columns and transform them in one step
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Wrap the scaled array in a DataFrame again, indexed by CustomerID, so rows stay identifiable
scaled_profiles = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=customer_profiles["CustomerID"],
)

print("Standardized Customer Profiles:")
display(scaled_profiles.head())


Standardized Customer Profiles:


Unnamed: 0_level_0,TotalSpend,TransactionCount,Books,Clothing,Electronics,Home Decor,ActiveWear,BookWorld,ComfortLiving,HomeSense,SoundWave,TechPro
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C0001,0.033326,-0.011458,-0.28002,-1.034718,1.594843,-0.267357,-0.135865,-0.786758,-0.708574,0.217081,0.950822,0.23929
C0002,-0.806919,-0.467494,-1.211167,1.240781,-1.111686,0.947002,-1.114073,2.340954,0.693979,-0.789066,-1.025163,0.50983
C0003,-0.886789,-0.467494,-1.211167,0.103032,0.016034,0.947002,3.776968,-0.786758,-0.708574,-0.789066,-1.025163,-0.842869
C0004,0.839461,1.35665,0.534735,-1.034718,0.016034,0.441019,0.108687,0.777098,-0.708574,-0.789066,-0.407668,1.186179
C0005,-0.747783,-0.92353,-1.211167,-1.034718,1.895568,0.272358,0.516274,-0.786758,1.161497,-0.789066,-1.025163,0.960729


## Using Cosine Similarity

In [8]:
# Pairwise cosine similarity between all standardized customer profiles
similarity_matrix = cosine_similarity(scaled_profiles)

# Label both axes with CustomerID so a pair's score can be looked up by ID
customer_ids = scaled_profiles.index
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print("Similarity Matrix:")
display(similarity_df.head())


Similarity Matrix:


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,-0.655,-0.067877,0.066749,0.367787,-0.031561,0.764425,0.116464,0.097756,-0.569246,...,0.526504,0.194549,0.12768,-0.178567,-0.242225,-0.425193,0.464581,-0.034653,0.110307,-0.041344
C0002,-0.655,1.0,-0.105578,0.000787,-0.005726,-0.417726,-0.258602,0.029372,-0.089395,0.478569,...,-0.86095,-0.040198,-0.293773,0.157996,0.299205,-0.057202,0.082989,0.146363,0.258441,-0.145457
C0003,-0.067877,-0.105578,1.0,-0.109678,0.32992,0.351197,0.317439,-0.287384,0.268314,0.065269,...,0.057355,0.016477,-0.449317,-0.308101,0.091443,0.792444,0.317844,-0.108604,0.277345,-0.021782
C0004,0.066749,0.000787,-0.109678,1.0,-0.05261,-0.14343,0.041823,0.605735,-0.766037,-0.294392,...,-0.010003,0.079292,0.25312,0.540868,-0.236786,-0.080026,-0.064504,-0.780963,-0.135891,-0.27664
C0005,0.367787,-0.005726,0.32992,-0.05261,1.0,-0.404569,0.730027,-0.2253,-0.071267,-0.301913,...,-0.152955,0.701666,-0.262201,0.275454,0.269941,0.144131,0.438867,0.359333,0.581979,-0.181042


## Extracting Top 3 Lookalikes

In [9]:
# Function to get top 3 lookalikes for each customer
def get_top_3_lookalikes(customer_id):
    """Return the three customers most similar to ``customer_id``.

    Reads the module-level ``similarity_df`` (rows and columns labelled by
    customer ID) and ranks every *other* customer by similarity score.

    Parameters
    ----------
    customer_id
        Row label in ``similarity_df`` identifying the customer of interest.

    Returns
    -------
    list of (customer_id, score) tuples
        Up to three entries, ordered from most to least similar. The customer
        itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a row label of ``similarity_df``.
    """
    scores = similarity_df.loc[customer_id]
    # Remove the customer's own entry explicitly rather than assuming it sorts
    # first: another customer can tie at 1.0, or the self-similarity can be
    # marginally below 1.0 due to floating-point rounding, in which case
    # skipping position 0 would drop a real match and keep the customer itself.
    others = scores.drop(labels=customer_id, errors="ignore")
    # A stable sort keeps the result deterministic when scores tie.
    similar_customers = others.sort_values(ascending=False, kind="mergesort").head(3)
    return list(zip(similar_customers.index, similar_customers.values))

# Creating a dictionary mapping each customer to its top 3 (lookalike_id, score) pairs
lookalike_dict = {
    customer: get_top_3_lookalikes(customer) for customer in similarity_df.index
}

# Converting the dictionary into a DataFrame.
# Scores are cast to built-in floats before stringifying: the repr of a NumPy
# scalar is "np.float64(0.86...)" on NumPy >= 2, which would otherwise leak
# into the "Lookalikes" text column (and the CSV written from it).
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_dict.keys()),
    "Lookalikes": [
        str([(lookalike_id, float(score)) for lookalike_id, score in matches])
        for matches in lookalike_dict.values()
    ],
})

# Displaying the lookalike DataFrame
print("Lookalike Recommendations:")
display(lookalike_df.head())


Lookalike Recommendations:


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0148', 0.8629284084221371), ('C0114', 0.84..."
1,C0002,"[('C0030', 0.7466866394242264), ('C0079', 0.72..."
2,C0003,"[('C0040', 0.8228670706943095), ('C0196', 0.79..."
3,C0004,"[('C0118', 0.7873773940557695), ('C0065', 0.77..."
4,C0005,"[('C0130', 0.7328481036258837), ('C0007', 0.73..."


In [10]:
# Write the recommendations to disk (without the row index) and confirm the file name.
# The name is held in one variable so the written file and the message cannot drift apart.
output_csv = "FirstName_LastName_Lookalike.csv"
lookalike_df.to_csv(output_csv, index=False)
print(f"Lookalike recommendations saved as '{output_csv}'")

Lookalike recommendations saved as 'FirstName_LastName_Lookalike.csv'
