In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three raw input tables (customers, products, transactions)
# from CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Step 2: Prepare the data.
# Start the customer profile from the identifier and descriptive columns
# only (CustomerID, Region, SignupDate); numeric features are added later.
profile_columns = ['CustomerID', 'Region', 'SignupDate']
customer_profile = customers[profile_columns]

In [4]:
# Attach each product's Category to its transactions (left join on ProductID,
# so every transaction row is kept even if its product is unknown).
# validate='many_to_one' makes pandas raise a MergeError if `products` ever
# contains a duplicated ProductID; without the check such a duplicate would
# silently duplicate transaction rows and inflate the per-customer sums.
customer_transactions = transactions.merge(
    products[['ProductID', 'Category']],
    on='ProductID',
    how='left',
    validate='many_to_one',
)

In [5]:
# Collapse the transaction log to one row per customer:
#   TotalValue -> sum of TotalValue
#   Quantity   -> sum of Quantity
#   Price      -> mean of Price
# reset_index() turns the CustomerID group key back into a regular column.
per_customer = customer_transactions.groupby('CustomerID')
customer_data = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    Price=('Price', 'mean'),
).reset_index()

In [6]:
# Add the per-customer aggregates to the customer profile (left join, so
# customers without any aggregated transactions are kept with NaN values).
#
# The cell reassigns `customer_profile`, so it must be safe to re-run:
# any aggregate columns left over from a previous execution are dropped
# first. Without this, a second run would merge onto the already-merged
# frame and produce suffixed duplicates (TotalValue_x / TotalValue_y, ...).
aggregate_cols = customer_data.columns.difference(['CustomerID'])
customer_profile = (
    customer_profile
    .drop(columns=aggregate_cols, errors='ignore')
    .merge(customer_data, on='CustomerID', how='left')
)

In [7]:
# Step 3: Handle missing values.
# Keep only the customers for which all three aggregates are present
# (rows with a NaN in TotalValue, Quantity or Price are removed).
aggregate_columns = ['TotalValue', 'Quantity', 'Price']
has_all_aggregates = customer_profile[aggregate_columns].notna().all(axis=1)
customer_profile = customer_profile[has_all_aggregates]

In [8]:
# Feature matrix X for the similarity calculation: every profile column
# except the identifier and the two descriptive (non-feature) columns.
non_feature_columns = ['CustomerID', 'Region', 'SignupDate']
X = customer_profile.drop(columns=non_feature_columns)

In [9]:
# Step 4: Normalize the data
# Standardize every feature column (StandardScaler defaults: remove the mean,
# scale to unit variance) so the features are on a comparable scale before
# the similarity computation.
# StandardScaler is imported with the other libraries in the first cell
# rather than here, so all dependencies are visible in one place.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Step 5: Pairwise cosine similarity between all rows of the scaled
# feature matrix (each row compared against every row, itself included).
similarity_matrix = cosine_similarity(X=X_scaled)

In [11]:
# Step 6: Wrap the similarity matrix in a DataFrame labelled by CustomerID
# on both axes, so scores can be looked up by customer id.
customer_ids = customer_profile['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [13]:
# Step 7: Generate Lookalike recommendations for customers C0001 to C0020
#
# The targets are selected explicitly by id instead of taking the first 20
# rows of the profile: row position only matches C0001..C0020 if the input
# file is sorted and none of those customers was removed earlier.
N_LOOKALIKES = 3  # number of (id, score) pairs stored per customer
target_ids = ['C{:04d}'.format(number) for number in range(1, 21)]

lookalikes = {}
skipped_ids = []  # targets with no similarity row (not in the matrix)
for customer_id in target_ids:
    if customer_id not in similarity_df.columns:
        skipped_ids.append(customer_id)
        continue

    # Similarity of this customer to everyone else (the customer itself is
    # excluded, since its self-similarity would always rank first).
    scores = similarity_df[customer_id]
    scores = scores[scores.index != customer_id]

    # Highest-scoring customers first, keep the top N_LOOKALIKES.
    top_lookalikes = scores.sort_values(ascending=False).head(N_LOOKALIKES)

    # Flatten to [id1, score1, id2, score2, id3, score3].
    row = []
    for other_id, score in zip(top_lookalikes.index, top_lookalikes.to_numpy()):
        row.extend([other_id, score])

    # Pad when fewer than N_LOOKALIKES other customers exist so every row
    # has the same length (positional indexing would raise IndexError here).
    row.extend([None, np.nan] * (N_LOOKALIKES - len(top_lookalikes)))

    lookalikes[customer_id] = row

In [14]:
# Step 8: Turn the lookalikes dictionary into a table: one row per
# customer (dictionary key -> index), one column per stored value.
lookalike_columns = [
    'Lookalike 1', 'Similarity 1',
    'Lookalike 2', 'Similarity 2',
    'Lookalike 3', 'Similarity 3',
]
lookalikes_df = pd.DataFrame.from_dict(
    lookalikes,
    orient='index',
    columns=lookalike_columns,
)

In [15]:
# Step 9: Save the lookalikes to a CSV file
# The index holds the customer ids but has no name, so by default its CSV
# column header is empty and the file reads back with an 'Unnamed: 0'
# column. index_label gives that column an explicit 'CustomerID' header.
lookalikes_df.to_csv('Lookalike.csv', index_label='CustomerID')

In [16]:
# Display the first 20 rows of the lookalike table as a sanity check.
lookalikes_df.iloc[:20]

Unnamed: 0,Lookalike 1,Similarity 1,Lookalike 2,Similarity 2,Lookalike 3,Similarity 3
C0001,C0103,0.997573,C0092,0.996879,C0135,0.992736
C0002,C0029,0.999854,C0077,0.996104,C0157,0.995478
C0003,C0111,0.998487,C0190,0.996656,C0038,0.990133
C0004,C0165,0.99839,C0162,0.998087,C0075,0.996932
C0005,C0167,0.999972,C0020,0.999714,C0128,0.998762
C0006,C0168,0.997612,C0196,0.995025,C0187,0.994752
C0007,C0125,0.999849,C0089,0.998344,C0085,0.996034
C0008,C0084,0.996087,C0113,0.995817,C0017,0.993173
C0009,C0130,0.999965,C0128,0.998596,C0192,0.998591
C0010,C0176,0.999451,C0055,0.993841,C0174,0.992744
