In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample customer data
customers = pd.read_csv("E:\\Zeotap Assignment\\Customers.csv")

# Sample transaction data
transactions = pd.read_csv("E:\\Zeotap Assignment\\Transactions - Transactions.csv")


# Sample product data
products = pd.read_csv("E:\\Zeotap Assignment\\Products.csv")


In [2]:
# Merge transaction data with product data to get product category
transactions = pd.merge(transactions, products[['ProductID', 'Category', 'ProductName']], on='ProductID', how='left')

# Group transactions by CustomerID and aggregate based on product category
customer_features = transactions.groupby('CustomerID').agg(
    #total_spent=('TotalValue', 'sum'),
    #total_items=('Quantity', 'sum'),
    books=('Category', lambda x: (x == 'Books').sum()),
    electronics=('Category', lambda x: (x == 'Electronics').sum()),
    home_decor=('Category', lambda x: (x == 'Home Decor').sum()),
    clothing=('Category', lambda x: (x == 'Clothing').sum()),
    products_bought=('ProductName', lambda x: ' '.join(x))
).reset_index()

# Merge the customer demographic data (including Region) with the transaction data features
customer_features = pd.merge(customer_features, customers[['CustomerID', 'Region']], on='CustomerID', how='left')

In [3]:
transactions.head(20)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Category,ProductName
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Electronics,ComfortLiving Bluetooth Speaker
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Electronics,ComfortLiving Bluetooth Speaker
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Electronics,ComfortLiving Bluetooth Speaker
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Electronics,ComfortLiving Bluetooth Speaker
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Electronics,ComfortLiving Bluetooth Speaker
5,T00442,C0188,P067,2024-12-26 14:40:03,1,300.68,300.68,Electronics,ComfortLiving Bluetooth Speaker
6,T00490,C0195,P067,2024-11-24 11:49:48,3,902.04,300.68,Electronics,ComfortLiving Bluetooth Speaker
7,T00536,C0008,P067,2024-09-22 06:13:59,1,300.68,300.68,Electronics,ComfortLiving Bluetooth Speaker
8,T00564,C0157,P067,2024-12-07 17:57:40,3,902.04,300.68,Electronics,ComfortLiving Bluetooth Speaker
9,T00631,C0130,P067,2024-05-14 23:14:59,2,601.36,300.68,Electronics,ComfortLiving Bluetooth Speaker


In [4]:
customer_features

Unnamed: 0,CustomerID,books,electronics,home_decor,clothing,products_bought,Region
0,C0001,1,3,1,0,SoundWave Cookbook HomeSense Wall Art SoundWav...,South America
1,C0002,0,0,2,2,BookWorld Cookware Set BookWorld Rug ComfortLi...,Asia
2,C0003,0,1,2,1,ActiveWear Cookware Set ActiveWear Rug ActiveW...,South America
3,C0004,3,2,3,0,TechPro Textbook TechPro Rug TechPro Vase Acti...,South America
4,C0005,0,2,1,0,ActiveWear Cookware Set TechPro Smartwatch Com...,Asia
...,...,...,...,...,...,...,...
194,C0196,1,0,2,1,ComfortLiving Mystery Book ActiveWear Jacket A...,Europe
195,C0197,0,2,1,0,ActiveWear Wall Art BookWorld Smartwatch Sound...,Europe
196,C0198,0,1,0,1,ComfortLiving Laptop HomeSense Running Shoes,Europe
197,C0199,0,2,2,0,ComfortLiving Bluetooth Speaker HomeSense Wall...,Europe


In [5]:
# One-hot encoding using pd.get_dummies
#customer_features = pd.get_dummies(customer_features, columns=['Region'], prefix=['Reg'])

In [6]:
#customer_features

In [5]:
# Create a new column where all the features are joined into a single string
customer_features['features_string'] = customer_features.apply(lambda row: ' '.join(row.drop('CustomerID').astype(str)), axis=1)

In [6]:
customer_features['features_string'][0]

'1 3 1 0 SoundWave Cookbook HomeSense Wall Art SoundWave Headphones ActiveWear Smartwatch TechPro Headphones South America'

In [7]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the sentences (features string)
features_matrix = vectorizer.fit_transform(customer_features['features_string'])

# Calculate cosine similarity between all customers
similarity_matrix = cosine_similarity(features_matrix)

# Convert the similarity matrix into a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Display the similarity between customers
#print(similarity_df)

In [13]:

# Load the customer data CSV
customer_data = pd.read_csv('E:\\Zeotap Assignment\\Customers.csv')  # Adjust the path if necessary

# Fetch the first 20 CustomerIDs (C0001 to C0020)
customer_ids_to_process = customer_data['CustomerID'].head(20)

# Create an empty list to store the results
lookalike_results = []

# Loop through each customer ID and fetch their top 3 similar customers
for customer_id in customer_ids_to_process:
    # Get the similarities for the current customer
    customer_similarities = similarity_df[customer_id]

    # Sort customers by similarity in descending order and exclude the self-comparison
    similar_customers = customer_similarities.drop(customer_id).sort_values(ascending=False)

    # Get the top 3 most similar customers
    top_similar_customers = similar_customers.head(3)

    # Create a list of tuples (similar_customer_id, similarity_score)
    lookalike_list = [(similar_customer_id, score) for similar_customer_id, score in top_similar_customers.items()]

    # Add the current customer and their lookalikes to the results list
    lookalike_results.append([customer_id, lookalike_list])

# Convert the results into a DataFrame
lookalike_df = pd.DataFrame(lookalike_results, columns=['CustomerID', 'Lookalikes'])

# Save the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Top similar customers for the first 20 IDs have been saved to 'Lookalike.csv'.")


Top similar customers for the first 20 IDs have been saved to 'Lookalike.csv'.


In [12]:
customer_id = 'C0001'


# Get the similarities for the given customer
customer_similarities = similarity_df[customer_id]

# Sort customers by similarity in descending order and exclude the self-comparison
similar_customers = customer_similarities.drop(customer_id).sort_values(ascending=False)

# Get the top N most similar customers
top_similar_customers = similar_customers.head(10)

top_similar_customers

CustomerID
C0129    0.719034
C0197    0.703285
C0026    0.685701
C0100    0.640022
C0068    0.638495
C0050    0.626471
C0104    0.617585
C0025    0.609847
C0179    0.608005
C0059    0.602276
Name: C0001, dtype: float64