In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load Data
# Read the customer table and the transaction log, in that order, from the
# working directory.
customers, transactions = map(pd.read_csv, ("Customers.csv", "Transactions.csv"))

# Step 2: Feature Engineering
def _count_purchase_days(dates):
    """Return the number of distinct calendar days among one group's dates."""
    return len(pd.to_datetime(dates).dt.date.unique())


# Collapse the transaction log to one row of features per CustomerID:
# summed TotalValue, number of TransactionID entries, and distinct purchase days.
customer_profiles = (
    transactions
    .groupby("CustomerID")
    .agg(
        TotalRevenue=("TotalValue", "sum"),
        TotalTransactions=("TransactionID", "count"),
        PurchaseFrequency=("TransactionDate", _count_purchase_days),
    )
    .reset_index()
)

# Attach the transaction features to the customer table. The inner join keeps
# only CustomerIDs present in both frames.
merged_data = customers.merge(customer_profiles, on="CustomerID", how="inner")

# Row order of merged_data defines the row order of every array built below,
# so keep the matching CustomerIDs alongside.
customer_ids = merged_data["CustomerID"].values

# Categorical part: Region expanded into dense one-hot indicator columns
# (PreferredCategory is not available).
categorical_features = ["Region"]
encoder = OneHotEncoder(sparse_output=False)
region_frame = merged_data[categorical_features]
encoded_categorical = encoder.fit_transform(region_frame)

# Numerical part: the aggregated transaction features, rescaled with a
# default-configured MinMaxScaler.
numerical_features = ["TotalRevenue", "TotalTransactions", "PurchaseFrequency"]
scaler = MinMaxScaler()
numeric_frame = merged_data[numerical_features]
normalized_numerical = scaler.fit_transform(numeric_frame)

# Final matrix: one row per customer, one-hot columns first, then the scaled
# numerical columns.
feature_matrix = np.concatenate([encoded_categorical, normalized_numerical], axis=1)

# Step 3: Compute Similarity
# Pairwise cosine similarity between all rows of the feature matrix; entry
# [i, j] is the similarity between customer i and customer j.
similarity_matrix = cosine_similarity(feature_matrix)

# Step 4: Generate Recommendations
# For each customer, find the top 3 most similar *other* customers.
lookalike_results = []
for idx, customer_id in enumerate(customer_ids):
    # Similarity of the current customer to every customer (itself included)
    similarity_scores = similarity_matrix[idx]

    # Rank all customers from most to least similar. The stable sort keeps
    # tied scores in index order, so the output is deterministic.
    ranked_indices = np.argsort(-similarity_scores, kind="stable")

    # Drop the customer's own row by index rather than assuming it is ranked
    # first: when another customer's score ties with (or, through rounding,
    # exceeds) the self-similarity, the customer is not guaranteed to be at
    # position 0, and slicing [1:4] would then list the customer as its own
    # lookalike.
    similar_indices = ranked_indices[ranked_indices != idx][:3]

    # Store the results as (LookalikeID, SimilarityScore) pairs.
    row = {"CustomerID": customer_id}
    for rank in range(3):
        if rank < len(similar_indices):
            match_idx = similar_indices[rank]
            # Convert to a plain Python float so the pair is written to the
            # CSV as a bare number instead of a NumPy scalar repr.
            row[f"Lookalike{rank + 1}"] = (
                customer_ids[match_idx],
                float(similarity_scores[match_idx]),
            )
        else:
            # Fewer than three other customers exist: pad the slot instead of
            # raising IndexError, keeping every row the same shape.
            row[f"Lookalike{rank + 1}"] = (None, float("nan"))
    lookalike_results.append(row)

# Step 5: Save Lookalike Results to CSV
# One row per customer with columns CustomerID, Lookalike1, Lookalike2, Lookalike3.
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to Lookalike.csv")

# Documentation:
# This script builds a lookalike model that recommends the top 3 most similar customers for each customer in the dataset.
# Customers are compared by cosine similarity over their region (one-hot encoded) and their aggregated transaction history.
# Results are saved to 'Lookalike.csv' with one row per customer and the columns CustomerID, Lookalike1, Lookalike2, Lookalike3,
# where each Lookalike column holds a (LookalikeID, SimilarityScore) pair.
# Step 6: Display the First 20 Similar Profiles
# Print each of the first 20 customers followed by its three lookalikes, with
# the similarity score rounded to two decimals, and a separator line.
for result in lookalike_results[:20]:
    print(f"CustomerID: {result['CustomerID']}")
    for rank in (1, 2, 3):
        lookalike_id, score = result[f"Lookalike{rank}"]
        print(f"  Lookalike {rank}: {lookalike_id} (Similarity: {score:.2f})")
    print("-" * 50)



Lookalike recommendations saved to Lookalike.csv
CustomerID: C0001
  Lookalike 1: C0137 (Similarity: 1.00)
  Lookalike 2: C0152 (Similarity: 1.00)
  Lookalike 3: C0107 (Similarity: 1.00)
--------------------------------------------------
CustomerID: C0002
  Lookalike 1: C0142 (Similarity: 1.00)
  Lookalike 2: C0177 (Similarity: 1.00)
  Lookalike 3: C0027 (Similarity: 1.00)
--------------------------------------------------
CustomerID: C0003
  Lookalike 1: C0133 (Similarity: 1.00)
  Lookalike 2: C0052 (Similarity: 1.00)
  Lookalike 3: C0192 (Similarity: 1.00)
--------------------------------------------------
CustomerID: C0004
  Lookalike 1: C0113 (Similarity: 1.00)
  Lookalike 2: C0102 (Similarity: 1.00)
  Lookalike 3: C0104 (Similarity: 1.00)
--------------------------------------------------
CustomerID: C0005
  Lookalike 1: C0159 (Similarity: 1.00)
  Lookalike 2: C0186 (Similarity: 1.00)
  Lookalike 3: C0007 (Similarity: 1.00)
--------------------------------------------------
Custom