In [1]:
# Task 2: Lookalike Model for eCommerce Transactions Dataset

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three source tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Build one denormalised table: each transaction row is joined to its customer
# record (on CustomerID) and then to its product record (on ProductID).
# Both joins use the default inner join, so unmatched rows are dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [4]:
# Feature engineering: one row per customer, one column per product, each cell
# holding the total Quantity that customer bought of that product
# (0 where the customer/product pair never occurs).
customer_product_matrix = pd.pivot_table(
    merged_data,
    values='Quantity',
    index='CustomerID',
    columns='ProductID',
    aggfunc='sum',
    fill_value=0,
)

In [5]:
# Standardise every product column; the fitted scaler stays bound to `scaler`
# so it remains available after this cell.
scaler = StandardScaler().fit(customer_product_matrix)
customer_product_matrix_scaled = scaler.transform(customer_product_matrix)

In [6]:
# Pairwise cosine similarity between the standardised customer rows, wrapped in
# a DataFrame whose rows and columns are both labelled by CustomerID.
similarity_matrix = cosine_similarity(customer_product_matrix_scaled)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)

In [7]:
# Function to get the most similar customers (top 3 by default)
def get_top_3_similar(customers_df: pd.DataFrame, customer_id, top_n: int = 3) -> pd.Series:
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customers_df : pd.DataFrame
        Similarity matrix. The column labelled ``customer_id`` is read; its
        index labels are the candidate customers and its values their scores.
    customer_id : hashable
        Label of the customer whose lookalikes are wanted.
    top_n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    pd.Series
        Up to ``top_n`` scores in descending order, indexed by the similar
        customers' labels. The customer itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``customers_df``.
    """
    scores = customers_df[customer_id]
    # Exclude the customer by label instead of assuming their own entry sorts
    # first: with ties or floating-point round-off another customer can rank
    # at or above the self-similarity, and a positional [1:4] slice would then
    # return the customer as their own lookalike.
    candidates = scores.drop(labels=customer_id, errors='ignore')
    # A stable sort keeps the ordering of tied scores deterministic.
    ranked = candidates.sort_values(ascending=False, kind='mergesort')
    return ranked.head(top_n)

In [8]:
# Collect lookalikes for the first 20 rows of the customers table:
# CustomerID -> list of (similar CustomerID, similarity score) pairs.
# NOTE: an ID that is not a column of similarity_df raises KeyError here.
lookalike_data = dict()
for customer_id in customers['CustomerID'].head(20):
    similar_customers = get_top_3_similar(similarity_df, customer_id)
    neighbour_ids = similar_customers.index
    neighbour_scores = similar_customers.values
    lookalike_data[customer_id] = [
        pair for pair in zip(neighbour_ids, neighbour_scores)
    ]

In [9]:
# Turn the lookalike mapping into a table (one row per customer, one column per
# ranked lookalike) and write it to Lookalike.csv with the row labels stored
# under a "CustomerID" header.
lookalike_df = pd.DataFrame.from_dict(
    lookalike_data,
    orient='index',
    columns=[
        'SimilarCustomer1',
        'SimilarCustomer2',
        'SimilarCustomer3',
    ],
)
lookalike_df.to_csv("Lookalike.csv", index_label="CustomerID")

In [10]:
# Show a heading followed by the first rows of the lookalike table.
print("\nSample Lookalike Data:", lookalike_df.head(), sep="\n")


Sample Lookalike Data:
                   SimilarCustomer1              SimilarCustomer2  \
C0001  (C0194, 0.40492753118932323)   (C0104, 0.3740015051203954)   
C0002  (C0030, 0.40461685378594076)  (C0091, 0.38377803020909534)   
C0003   (C0181, 0.4775717980039305)  (C0134, 0.47101615387800955)   
C0004   (C0070, 0.3519014889798192)   (C0175, 0.3160978979266089)   
C0005  (C0096, 0.48745613929263704)  (C0023, 0.47025182492905515)   

                   SimilarCustomer3  
C0001  (C0020, 0.36660865634533374)  
C0002  (C0071, 0.32015798105808685)  
C0003   (C0144, 0.4237999071645031)  
C0004  (C0132, 0.27959855424498353)  
C0005   (C0055, 0.3820996241448556)  
