In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Read the three source tables from CSV files in the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Preview the first five rows of every table, one after another
# (a newline separator gives the same text as three separate print calls).
print(
    customers.head(),
    products.head(),
    transactions.head(),
    sep='\n',
)


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [4]:
# Join every transaction with its customer record and its product record.
# Both joins use the default inner join on the shared key column.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID')

# Per-customer features, one row per CustomerID:
#   TotalSpend       - sum of TotalValue over the customer's transactions
#   UniqueProducts   - number of distinct ProductID values
#   TransactionCount - number of non-null TransactionDate values
customer_summary = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        UniqueProducts=('ProductID', 'nunique'),
        TransactionCount=('TransactionDate', 'count'),
    )
    .reset_index()
)

# Preview the per-customer feature table
print(customer_summary.head())


  CustomerID  TotalSpend  UniqueProducts  TransactionCount
0      C0001     3354.52               5                 5
1      C0002     1862.74               4                 4
2      C0003     2725.38               4                 4
3      C0004     5354.88               8                 8
4      C0005     2034.24               3                 3


In [5]:
# Rescale the three numeric customer features with MinMaxScaler so that
# they are on a comparable scale before measuring similarity.
feature_columns = ['TotalSpend', 'UniqueProducts', 'TransactionCount']
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(customer_summary[feature_columns])

# Wrap the scaled array in a DataFrame and attach the customer IDs as the
# last column (rows line up with customer_summary by index).
normalized_data = pd.DataFrame(scaled_features, columns=feature_columns).assign(
    CustomerID=customer_summary['CustomerID']
)

# Preview the scaled features
print(normalized_data.head())


   TotalSpend  UniqueProducts  TransactionCount CustomerID
0    0.308942        0.444444               0.4      C0001
1    0.168095        0.333333               0.3      C0002
2    0.249541        0.333333               0.3      C0003
3    0.497806        0.777778               0.7      C0004
4    0.184287        0.222222               0.2      C0005


In [6]:
# Pairwise cosine similarity between the scaled feature rows of all customers.
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = customer_summary['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Example: the five highest scores in the first customer's row
# (the customer's own entry is part of that row).
first_customer_scores = similarity_df.iloc[0]
print(first_customer_scores.sort_values(ascending=False).head(5))


CustomerID
C0001    1.000000
C0173    1.000000
C0137    0.999996
C0152    0.999992
C0027    0.999990
Name: C0001, dtype: float64


In [7]:
# Find the top 3 most similar customers for each of the first 20 rows of
# customer_summary (C0001 to C0020, provided each of them appears in the
# summary -- TODO confirm for the dataset in use).
lookalikes = {}
customer_ids = customer_summary['CustomerID']

for i, customer in enumerate(customer_ids[:20]):  # First 20 customers
    # Exclude the customer's own entry by position instead of assuming it
    # sorts first. Other customers can tie with the self-score (the preview
    # above shows a second customer at 1.000000), and because sorted() is
    # stable a tied customer from an earlier row would sort ahead of self.
    # Dropping "element 0" would then keep the customer as their own
    # lookalike and discard a genuine match.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # Highest score first; ties keep ascending row order (stable sort).
    top_matches = sorted(candidates, key=lambda pair: -pair[1])[:3]
    # Map row positions back to customer IDs positionally (.iloc), matching
    # how similarity_matrix rows are ordered, and round scores to 3 decimals.
    lookalikes[customer] = [
        (customer_ids.iloc[idx], round(score, 3))
        for idx, score in top_matches
    ]

# Display the lookalike recommendations for one customer
print(lookalikes['C0001'])  # Replace 'C0001' with any customer ID in the range


[('C0173', 1.0), ('C0137', 1.0), ('C0152', 1.0)]


In [9]:
# Prepare data for CSV export: one row per target customer, holding the
# customer ID followed by (lookalike ID, score) pairs.
output_path = 'Ayush_Shukla_Lookalike.csv'
output_columns = [
    'CustomerID',
    'Lookalike_1', 'Score_1',
    'Lookalike_2', 'Score_2',
    'Lookalike_3', 'Score_3'
]

lookalike_data = []

for customer, similar_customers in lookalikes.items():
    row = [customer]
    for similar_customer, score in similar_customers:
        row += [similar_customer, score]
    # A customer with fewer than three lookalikes would produce a short row
    # and make the DataFrame constructor fail; pad with empty cells instead.
    row += [None] * (len(output_columns) - len(row))
    lookalike_data.append(row)

# Save the table without the DataFrame index
lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)
lookalike_df.to_csv(output_path, index=False)

# Report the file that was actually written (the message previously named
# a different file than the one passed to to_csv).
print(f"Lookalike recommendations saved to {output_path}")


Lookalike recommendations saved to Lookalike.csv
