In [9]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
customers =pd.read_csv("/kaggle/input/zeotap/Customers.csv")
products=pd.read_csv("/kaggle/input/zeotap/Products.csv")
transactions=pd.read_csv("/kaggle/input/zeotap/Transactions.csv")

In [3]:
# Preview the datasets
print("Customers Dataset:")
customers.head()

Customers Dataset:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
print("Products Dataset:")
products.head()

Products Dataset:


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
print("Transactions Dataset:")
transactions.head()

Transactions Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
# Merge transactions with products, avoiding duplicate columns
transactions = transactions.merge(products, on="ProductID", how="left", suffixes=('', '_product'))

# Merge transactions with customers, avoiding duplicate columns
data = transactions.merge(customers, on="CustomerID", how="left", suffixes=('', '_customer'))

# Preview the merged data
print("Merged Dataset:")
data.head()

Merged Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category,Price_product,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [7]:
# Handle missing values (if any)
data.fillna(0, inplace=True)

# Feature Engineering: Aggregate Transaction Data
customer_features = data.groupby("CustomerID").agg({
    "TotalValue": "sum",  # Total spending
    "TransactionID": "count",  # Number of transactions
    "Quantity": "sum",  # Total quantity purchased
    "Price": "mean",  # Average price of purchased products
    "Category": lambda x: x.mode()[0] if not x.mode().empty else "Unknown",  # Most purchased category
}).reset_index()

customer_features.rename(columns={
    "TotalValue": "TotalSpending",
    "TransactionID": "TransactionCount",
    "Quantity": "TotalQuantity",
    "Price": "AveragePrice",
    "Category": "TopCategory"
}, inplace=True)

# Merge with Customer Demographic Details
customer_profile = customers.merge(customer_features, on="CustomerID", how="left")

# Encode Categorical Variables
label_encoder = LabelEncoder()
customer_profile["RegionEncoded"] = label_encoder.fit_transform(customer_profile["Region"])
customer_profile["TopCategoryEncoded"] = label_encoder.fit_transform(customer_profile["TopCategory"])

# Drop redundant columns
customer_profile = customer_profile.drop(columns=["Region", "TopCategory", "CustomerName"])

# Normalize Numerical Features
scaler = MinMaxScaler()
numerical_columns = ["TotalSpending", "TransactionCount", "TotalQuantity", "AveragePrice"]
customer_profile[numerical_columns] = scaler.fit_transform(customer_profile[numerical_columns])

# Final Prepared Data
print("\nFinal Customer Profile Data:")
customer_profile.head()


Final Customer Profile Data:


Unnamed: 0,CustomerID,SignupDate,TotalSpending,TransactionCount,TotalQuantity,AveragePrice,RegionEncoded,TopCategoryEncoded
0,C0001,2022-07-10,0.308942,0.4,0.354839,0.519414,3,2
1,C0002,2022-02-13,0.168095,0.3,0.290323,0.367384,0,1
2,C0003,2024-03-07,0.249541,0.3,0.419355,0.338446,3,3
3,C0004,2022-10-09,0.497806,0.7,0.709677,0.436848,3,0
4,C0005,2022-08-15,0.184287,0.2,0.193548,0.548476,0,2


In [8]:
# Save the prepared data for further processing
customer_profile.to_csv("PreparedCustomerData.csv", index=False)

In [10]:
# Load the prepared customer profile data
customer_profile = pd.read_csv("PreparedCustomerData.csv")

# Feature Engineering: Create a feature matrix
# Select relevant features for similarity calculation
feature_columns = [
    "RegionEncoded",
    "TotalSpending",
    "TransactionCount",
    "TotalQuantity",
    "AveragePrice",
    "TopCategoryEncoded",
]

feature_matrix = customer_profile[feature_columns]

In [12]:
feature_matrix

Unnamed: 0,RegionEncoded,TotalSpending,TransactionCount,TotalQuantity,AveragePrice,TopCategoryEncoded
0,3,0.308942,0.4,0.354839,0.519414,2
1,0,0.168095,0.3,0.290323,0.367384,1
2,3,0.249541,0.3,0.419355,0.338446,3
3,3,0.497806,0.7,0.709677,0.436848,0
4,0,0.184287,0.2,0.193548,0.548476,2
...,...,...,...,...,...,...
195,1,0.462684,0.3,0.354839,0.823103,3
196,1,0.174318,0.2,0.258065,0.407106,2
197,1,0.080203,0.1,0.064516,0.434809,1
198,1,0.179098,0.3,0.258065,0.458693,2


In [13]:
feature_matrix.isnull().sum()

RegionEncoded         0
TotalSpending         1
TransactionCount      1
TotalQuantity         1
AveragePrice          1
TopCategoryEncoded    0
dtype: int64

In [16]:
# Handle missing values (if any)
feature_matrix.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_matrix.fillna(0, inplace=True)


In [17]:
# Calculate Cosine Similarity
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile["CustomerID"], columns=customer_profile["CustomerID"])

In [19]:
# Function to get top N similar customers
def get_top_similar_customers(customer_id, n=3):
    # Sort similarity scores for the customer in descending order
    similar_customers = similarity_df.loc[customer_id].sort_values(ascending=False)
    # Exclude the customer themselves (self-similarity)
    similar_customers = similar_customers[similar_customers.index != customer_id]
    # Get the top N similar customers
    top_similar = similar_customers.head(n)
    return [(cust_id, round(score, 4)) for cust_id, score in top_similar.items()]

In [20]:
# Generate recommendations for the first 20 customers (C0001 to C0020)
lookalike_data = {}
customer_ids = customer_profile["CustomerID"].tolist()

for customer_id in customer_ids[:20]:  # First 20 customers
    top_similars = get_top_similar_customers(customer_id, n=3)
    lookalike_data[customer_id] = top_similars

# Format the Lookalike.csv output
lookalike_output = []
for cust_id, similar_customers in lookalike_data.items():
    similar_customers_str = "; ".join([f"{sim_cust_id}:{score}" for sim_cust_id, score in similar_customers])
    lookalike_output.append({"cust_id": cust_id, "similar_customers": similar_customers_str})

lookalike_df = pd.DataFrame(lookalike_output)

In [24]:
lookalike_output

[{'cust_id': 'C0001',
  'similar_customers': 'C0181:0.9993; C0190:0.9992; C0048:0.9989'},
 {'cust_id': 'C0002',
  'similar_customers': 'C0088:0.9962; C0106:0.9915; C0162:0.9913'},
 {'cust_id': 'C0003',
  'similar_customers': 'C0031:0.9992; C0052:0.9992; C0076:0.9988'},
 {'cust_id': 'C0004',
  'similar_customers': 'C0087:0.9981; C0169:0.9971; C0165:0.9965'},
 {'cust_id': 'C0005',
  'similar_customers': 'C0186:0.9996; C0140:0.9984; C0007:0.998'},
 {'cust_id': 'C0006',
  'similar_customers': 'C0036:0.999; C0011:0.9989; C0171:0.9987'},
 {'cust_id': 'C0007',
  'similar_customers': 'C0115:0.9991; C0146:0.9988; C0186:0.9988'},
 {'cust_id': 'C0008',
  'similar_customers': 'C0065:0.9949; C0059:0.9937; C0160:0.9897'},
 {'cust_id': 'C0009',
  'similar_customers': 'C0061:0.9951; C0198:0.9947; C0050:0.9903'},
 {'cust_id': 'C0010',
  'similar_customers': 'C0068:0.9963; C0104:0.9942; C0111:0.9939'},
 {'cust_id': 'C0011',
  'similar_customers': 'C0137:0.9997; C0191:0.9995; C0126:0.9992'},
 {'cust_id':

In [23]:
# Preview the Lookalike recommendations
print("\nLookalike Recommendations:")
lookalike_df


Lookalike Recommendations:


Unnamed: 0,cust_id,similar_customers
0,C0001,C0181:0.9993; C0190:0.9992; C0048:0.9989
1,C0002,C0088:0.9962; C0106:0.9915; C0162:0.9913
2,C0003,C0031:0.9992; C0052:0.9992; C0076:0.9988
3,C0004,C0087:0.9981; C0169:0.9971; C0165:0.9965
4,C0005,C0186:0.9996; C0140:0.9984; C0007:0.998
5,C0006,C0036:0.999; C0011:0.9989; C0171:0.9987
6,C0007,C0115:0.9991; C0146:0.9988; C0186:0.9988
7,C0008,C0065:0.9949; C0059:0.9937; C0160:0.9897
8,C0009,C0061:0.9951; C0198:0.9947; C0050:0.9903
9,C0010,C0068:0.9963; C0104:0.9942; C0111:0.9939


In [22]:
# Save the output to Lookalike.csv
lookalike_df.to_csv("Lookalike.csv", index=False)