In [17]:
# Import libraries
import pandas as pd

# Load datasets
customers = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Project\data\Customers.csv")
products = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Project\data\Products.csv")
transactions = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Project\data\Transactions.csv")

# Display the first few rows of each dataset
print("Customers Dataset:")
display(customers.head())

print("Products Dataset:")
display(products.head())

print("Transactions Dataset:")
display(transactions.head())

Customers Dataset:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


Products Dataset:


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


Transactions Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [18]:
# Merge customers with transactions based on CustomerID
customer_transactions = pd.merge(transactions, customers, on='CustomerID')

# Aggregate transaction data per customer
customer_behavior = customer_transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),        # Total spending per customer
    transaction_count=('TransactionID', 'count'),  # Number of transactions
    unique_products=('ProductID', 'nunique')   # Number of unique products purchased
).reset_index()

# Merge the behavior data with the customers' region information
customer_behavior = pd.merge(customer_behavior, customers[['CustomerID', 'Region']], on='CustomerID')

# Display the aggregated customer behavior data
display(customer_behavior.head())


Unnamed: 0,CustomerID,total_spent,transaction_count,unique_products,Region
0,C0001,3354.52,5,5,South America
1,C0002,1862.74,4,4,Asia
2,C0003,2725.38,4,4,South America
3,C0004,5354.88,8,8,South America
4,C0005,2034.24,3,3,Asia


In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Extract features for similarity calculation
features = customer_behavior[['total_spent', 'transaction_count', 'unique_products']]

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(scaled_features)

# Display the similarity matrix for the first 20 customers
similarity_df = pd.DataFrame(similarity_matrix[:20, :20], index=customer_behavior['CustomerID'][:20], columns=customer_behavior['CustomerID'][:20])
display(similarity_df)


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,C0011,C0012,C0013,C0014,C0015,C0016,C0017,C0018,C0019,C0020
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
C0001,1.0,0.44115,0.152203,-0.030667,0.115536,-0.684486,-0.040062,0.253231,0.342191,0.469227,-0.508664,-0.382608,-0.295809,0.139157,0.134605,-0.605349,0.086526,-0.721185,0.817154,0.061158
C0002,0.44115,1.0,0.93017,-0.88894,0.920482,0.040945,0.832297,-0.683623,0.992784,0.999182,-0.860726,-0.970613,-0.987201,0.934215,0.931281,-0.855334,-0.817822,-0.830148,0.174338,0.896983
C0003,0.152203,0.93017,1.0,-0.992468,0.999259,0.404906,0.977338,-0.902205,0.966632,0.914561,-0.647877,-0.971106,-0.970954,0.998982,0.999243,-0.619522,-0.969703,-0.568763,-0.197789,0.99578
C0004,-0.030667,-0.88894,-0.992468,1.0,-0.996377,-0.487096,-0.99206,0.941292,-0.937436,-0.869969,0.600775,0.93456,0.947827,-0.993609,-0.994323,0.559558,0.990476,0.492956,0.294531,-0.999325
C0005,0.115536,0.920482,0.999259,-0.996377,1.0,0.427019,0.982706,-0.914305,0.960359,0.903969,-0.638871,-0.961359,-0.966687,0.999185,0.999527,-0.606242,-0.977108,-0.55008,-0.224273,0.998346
C0006,-0.684486,0.040945,0.404906,-0.487096,0.427019,1.0,0.58699,-0.752794,0.158148,0.000501,0.380461,-0.225544,-0.183817,0.392061,0.40008,0.444203,-0.602262,0.51962,-0.972495,0.476078
C0007,-0.040062,0.832297,0.977338,-0.99206,0.982706,0.58699,1.0,-0.973011,0.89213,0.809235,-0.495683,-0.905778,-0.903324,0.974897,0.976819,-0.452846,-0.998425,-0.386337,-0.400423,0.991365
C0008,0.253231,-0.683623,-0.902205,0.941292,-0.914305,-0.752794,-0.973011,1.0,-0.765879,-0.653725,0.310141,0.786174,0.784562,-0.898975,-0.902523,0.252516,0.978794,0.170558,0.599368,-0.935825
C0009,0.342191,0.992784,0.966632,-0.937436,0.960359,0.158148,0.89213,-0.765879,1.0,0.987216,-0.812473,-0.981893,-0.998587,0.970126,0.968066,-0.797356,-0.880843,-0.760457,0.055641,0.943287
C0010,0.469227,0.999182,0.914561,-0.869969,0.903969,0.000501,0.809235,-0.653725,0.987216,1.0,-0.876848,-0.962298,-0.980589,0.919129,0.915868,-0.874032,-0.79413,-0.851878,0.213849,0.878465


In [20]:
import os

# Check if the 'outputs' directory exists, if not, create it
if not os.path.exists('outputs'):
    os.makedirs('outputs')


In [21]:
# Get the top 3 lookalike customers for the first 20 customers
lookalikes = {}

for i, customer_id in enumerate(customer_behavior['CustomerID'][:20]):
    # Get similarity scores for the current customer
    sim_scores = similarity_matrix[i]
    
    # Get the top 3 similar customers (excluding the customer itself)
    similar_customer_indices = sim_scores.argsort()[-4:-1]  # Get top 3 excluding the customer itself
    similar_customers = customer_behavior['CustomerID'].iloc[similar_customer_indices]
    scores = sim_scores[similar_customer_indices]
    
    # Map customer to the top 3 similar customers with their similarity scores
    lookalikes[customer_id] = list(zip(similar_customers, scores))

# Create a DataFrame for lookalikes and save it to CSV
lookalike_df = pd.DataFrame([(key, *value) for key, values in lookalikes.items() for value in values], columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save the lookalike model result to a CSV file
lookalike_df.to_csv('outputs/Anvita_Magarde_Lookalike.csv', index=False)

# Display the lookalike DataFrame
display(lookalike_df.head())


Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0056,0.930427
1,C0001,C0152,0.986905
2,C0001,C0137,0.996332
3,C0002,C0010,0.999182
4,C0002,C0199,0.999347
