In [1]:
#Step 1: Load and Preprocess Data
#We need both customer and transaction data to create meaningful customer profiles.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables (customers, products, transactions) from the Kaggle input directory.
customers = pd.read_csv('/kaggle/input/zeotap-pdataset/Data.csv/Customers.csv')
products = pd.read_csv('/kaggle/input/zeotap-pdataset/Data.csv/Products.csv')
transactions = pd.read_csv('/kaggle/input/zeotap-pdataset/Data.csv/Transactions.csv')

# Attach each product's Category to its transactions. The left join keeps every
# transaction row; a ProductID that is absent from `products` yields a NaN Category.
transactions = pd.merge(
    transactions,
    products.loc[:, ['ProductID', 'Category']],
    on='ProductID',
    how='left',
)

# We will focus on the profile and transaction history for each customer


In [2]:
#Step 2: Feature Engineering for Customer Profiles


# Aggregate the transaction history of each customer into three summary features.
customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',        # Total spending per customer
    'Quantity': 'sum',          # Total number of products bought
    'ProductID': 'nunique',     # Number of unique products bought
}).reset_index()

# Spend per (customer, category), pivoted so each category becomes its own column;
# customer/category pairs with no purchases are filled with 0.
category_sales = transactions.groupby(['CustomerID', 'Category'])['TotalValue'].sum().unstack(fill_value=0)

# Left-join both aggregates onto the customer table (index-to-index on CustomerID)
# so that customers without any transaction are kept in the profile.
customer_profile = (
    customers.set_index('CustomerID')
    .join(customer_transactions.set_index('CustomerID'))
    .join(category_sales, how='left')
    .reset_index()
)

numerical_features = ['TotalValue', 'Quantity', 'ProductID']

# Customers with no transactions come out of the left joins with NaN in every
# activity column. Record them as zero activity *before* scaling: StandardScaler
# skips NaN when fitting and keeps it when transforming, so filling with 0 only
# afterwards would give these customers the z-score of an average customer
# instead of the z-score of "bought nothing".
activity_columns = numerical_features + list(category_sales.columns)
customer_profile[activity_columns] = customer_profile[activity_columns].fillna(0)

# Standardise the aggregate features to zero mean and unit variance.
# NOTE(review): the per-category spend columns are left in raw currency units,
# as in the original; they will outweigh the standardised features in any
# distance/similarity computed over both groups — confirm this is intended.
scaler = StandardScaler()
customer_profile[numerical_features] = scaler.fit_transform(customer_profile[numerical_features])


In [4]:
#Step 3: Build Lookalike Model using Cosine Similarity
#With the customer profiles built, we can measure how similar customers are to each other. Cosine similarity is a common choice for comparing feature vectors like these.

# Replace every remaining missing value in the profile table with 0
customer_profile_filled = customer_profile.fillna(0)

# Feature vector per customer: the three aggregate features followed by one
# column per product category (taken from the category pivot).
profile_columns = ['TotalValue', 'Quantity', 'ProductID'] + list(category_sales.columns)
customer_vectors = customer_profile_filled.loc[:, profile_columns].to_numpy()

# Pairwise cosine similarity between all customers; row/column i corresponds to
# row i of customer_profile_filled.
similarity_matrix = cosine_similarity(customer_vectors)

# Get the top 3 lookalikes for each customer
def get_top_3_lookalikes(similarity_matrix, customer_index, customer_ids=None):
    """Return the three customers most similar to the one at ``customer_index``.

    Args:
        similarity_matrix: Square, row-indexable matrix where entry ``[i][j]`` is
            the similarity between the customers at positions ``i`` and ``j``.
        customer_index: Row position of the customer to find lookalikes for.
        customer_ids: Optional sequence mapping row position to customer ID. When
            omitted, the ``CustomerID`` column of the notebook-level
            ``customer_profile_filled`` table is used.

    Returns:
        A list of up to three ``(customer_id, similarity_score)`` tuples, ordered
        from most to least similar. The customer itself is never included. Fewer
        than three tuples are returned when fewer than three other customers exist.
    """
    if customer_ids is None:
        customer_ids = customer_profile_filled['CustomerID'].tolist()

    similarity_scores = similarity_matrix[customer_index]
    n_customers = len(similarity_scores)
    # Resolve negative indices to the actual row position so the self-exclusion
    # below compares like with like.
    self_position = range(n_customers)[customer_index]

    # Exclude the customer explicitly instead of assuming it ranks first against
    # itself: an all-zero feature vector has self-similarity 0, and ties or float
    # rounding can place another customer ahead of it.
    other_positions = [i for i in range(n_customers) if i != self_position]

    # sorted() is stable, so equally similar customers keep their row order and
    # the result is deterministic.
    top_positions = sorted(
        other_positions,
        key=lambda i: similarity_scores[i],
        reverse=True,
    )[:3]

    return [(customer_ids[i], similarity_scores[i]) for i in top_positions]

# Number of leading customers (by row position in the profile table) to report on.
# NOTE(review): the original comment says these rows are C0001 to C0020, which
# presumably relies on Customers.csv being ordered by CustomerID — confirm.
N_TARGET_CUSTOMERS = 20

# Look the IDs up once instead of materialising a row per lookup.
profile_customer_ids = customer_profile_filled['CustomerID'].tolist()

# Clamp to the table length so a profile table with fewer rows than the target
# count does not raise IndexError.
n_targets = min(N_TARGET_CUSTOMERS, len(profile_customer_ids))

# Map each target customer's ID to its list of (lookalike_id, score) tuples.
lookalike_results = {
    profile_customer_ids[customer_index]: get_top_3_lookalikes(similarity_matrix, customer_index)
    for customer_index in range(n_targets)
}

# Flatten to one record per (customer, lookalike) pair.
lookalike_data = [
    {'CustomerID': cust_id, 'LookalikeID': lookalike, 'SimilarityScore': score}
    for cust_id, lookalikes in lookalike_results.items()
    for lookalike, score in lookalikes
]

# Explicit columns keep the CSV header stable even when there are no records.
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

# Save results to a CSV
lookalike_df.to_csv('Abhishek_Brahmbhatt_Lookalike.csv', index=False)
