In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [12]:
# Load the customer and product reference tables from the working directory.
customers = pd.read_csv(filepath_or_buffer='Customers.csv')
products = pd.read_csv(filepath_or_buffer='Products.csv')

In [4]:
# Load the transaction log; it is joined to products and customers in the next cell.
transactions = pd.read_csv(filepath_or_buffer='Transactions.csv')

In [13]:
# Build one wide table: each transaction row gains its product attributes
# (joined on ProductID) and then its customer profile (joined on CustomerID).
# Inner joins are pandas' default; rows without a match on either key are dropped.
# The overlapping price column comes out of the first join as Price_x (from
# transactions, the left side) and Price_y (from products, the right side).
data = (
    transactions
    .merge(products, how='inner', on='ProductID')
    .merge(customers, how='inner', on='CustomerID')
)

In [14]:
# Drop the transaction-side price column (Price_x); the product-side Price_y is
# the one used for the per-customer features further down.
# errors='ignore' keeps this cell idempotent: re-running it after Price_x has
# already been removed is a no-op instead of raising KeyError.
data = data.drop(columns=['Price_x'], errors='ignore')

In [15]:
# Inspect the merged transaction / product / customer table after Price_x was dropped.
data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15
...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,SoundWave Smartwatch,Electronics,459.86,Jacob Holt,South America,2022-01-22
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,SoundWave Smartwatch,Electronics,459.86,Mrs. Kimberly Wright,North America,2024-04-07
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,SoundWave Smartwatch,Electronics,459.86,Tyler Haynes,North America,2024-09-21
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,SoundWave Smartwatch,Electronics,459.86,Joshua Hamilton,Asia,2024-11-11


In [10]:
# Print the column dtypes and non-null counts of the merged frame.
# NOTE(review): the recorded output below lists 11 columns and no Price column,
# while the table displayed in the previous cell includes Price_y. The output
# looks stale (its execution count is lower than the merge cell's) — re-run
# the notebook top to bottom to refresh it.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   ProductName      1000 non-null   object 
 7   Category         1000 non-null   object 
 8   CustomerName     1000 non-null   object 
 9   Region           1000 non-null   object 
 10  SignupDate       1000 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 86.1+ KB


In [17]:
def _most_frequent(values):
    """Return the most frequent value of a Series (first entry of Series.mode())."""
    return values.mode().iloc[0]


# Collapse the merged table to one row per customer:
#   Price_y   -> mean product price across the customer's transactions
#   ProductID -> number of transaction rows for the customer
#   Category  -> the category the customer bought most often
customer_features = (
    data
    .groupby('CustomerID')
    .agg({
        'Price_y': 'mean',
        'ProductID': 'count',
        'Category': _most_frequent,
    })
    .reset_index()
)

In [21]:
# Give the aggregated columns descriptive names. Assigning the result (rather
# than mutating with inplace=True) keeps the step explicit and chainable;
# re-running the cell is harmless because rename ignores missing source names.
customer_features = customer_features.rename(
    columns={
        'Price_y': 'AvgPrice',
        'ProductID': 'PurchaseCount',
        'Category': 'FavCategory',
    }
)

In [22]:
# Preview the per-customer feature table (CustomerID, AvgPrice, PurchaseCount, FavCategory).
customer_features

Unnamed: 0,CustomerID,AvgPrice,PurchaseCount,FavCategory
0,C0001,278.334000,5,Electronics
1,C0002,208.920000,4,Clothing
2,C0003,195.707500,4,Home Decor
3,C0004,240.636250,8,Books
4,C0005,291.603333,3,Electronics
...,...,...,...,...
194,C0196,416.992500,4,Home Decor
195,C0197,227.056667,3,Electronics
196,C0198,239.705000,2,Clothing
197,C0199,250.610000,4,Electronics


In [23]:
# Standardize the numeric columns so that AvgPrice and PurchaseCount are on a
# comparable scale before they enter the similarity computation.
scaler = StandardScaler()
numeric_features = scaler.fit_transform(
    customer_features.loc[:, ['AvgPrice', 'PurchaseCount']]
)

In [25]:
# One-hot encode the favourite category: one indicator column per category value.
fav_category_encoded = pd.get_dummies(customer_features.loc[:, 'FavCategory'])

# Place the scaled numeric columns and the indicator columns side by side,
# giving one feature row per customer.
final_features = np.hstack(
    (numeric_features, fav_category_encoded)
)

In [26]:
# Pairwise cosine similarity between all customer feature rows; entry [i, j]
# compares the customer in row i with the customer in row j.
similarity_matrix = cosine_similarity(X=final_features)

In [28]:
# Get top 3 similar customers for each target customer.
# Result: {target CustomerID: [(lookalike CustomerID, similarity score), ...]}
customer_ids = customer_features['CustomerID']
top_lookalikes = {}
for idx, cust_id in enumerate(customer_ids.iloc[:20]):  # First 20 customers
    # Exclude the target itself by row position. Relying on "self sorts first"
    # breaks when another customer ties with (or, through floating-point
    # rounding, exceeds) the self-similarity: the target would then be reported
    # as its own lookalike and a genuine match would be dropped.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: -pair[1])
    # .iloc maps matrix positions back to IDs without depending on the index
    # labels; float() stores plain Python floats so the text written to the CSV
    # later does not depend on how the installed NumPy prints its scalars.
    top_3 = [
        (customer_ids.iloc[other_idx], float(score))
        for other_idx, score in candidates[:3]
    ]
    top_lookalikes[cust_id] = top_3

In [35]:
# Flatten the lookalike mapping into a two-column table: the target customer
# and the string form of its list of (lookalike id, score) pairs.
lookalike_df = pd.DataFrame(
    [
        dict(cust_id=target_id, lookalikes=str(matches))
        for target_id, matches in top_lookalikes.items()
    ]
)

In [36]:
# Preview the lookalike table before it is written to disk.
# NOTE(review): the recorded output shows 10 rows although the loop above targets
# the first 20 customers — possibly a stale or truncated output; confirm by re-running.
lookalike_df

Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0069', 0.998614632343058), ('C0148', 0.933..."
1,C0002,"[('C0029', 0.9999635291405795), ('C0176', 0.97..."
2,C0003,"[('C0025', 0.9970096716810369), ('C0178', 0.95..."
3,C0004,"[('C0161', 0.9999518639859284), ('C0017', 0.99..."
4,C0005,"[('C0120', 0.9979127157529742), ('C0140', 0.97..."
5,C0006,"[('C0125', 0.9889264199863582), ('C0085', 0.97..."
6,C0007,"[('C0115', 0.9807011082427939), ('C0050', 0.97..."
7,C0008,"[('C0065', 0.9677085242416361), ('C0113', 0.96..."
8,C0009,"[('C0061', 0.9977489797649394), ('C0092', 0.95..."
9,C0010,"[('C0134', 0.9732864835757826), ('C0111', 0.96..."


In [37]:
# Write the lookalike table to CSV without the DataFrame's row index.
lookalike_df.to_csv(path_or_buf='Alok_Kumar_Lookalike.csv', index=False)