In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the three raw CSV exports (customers, product catalogue, transaction log).
# NOTE: the paths are absolute and machine-specific; adjust them before running elsewhere.
customers = pd.read_csv(r"C:\Users\VIVEK KUMAR SINGH\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\VIVEK KUMAR SINGH\Downloads\Products.csv")
transactions = pd.read_csv(r'C:\Users\VIVEK KUMAR SINGH\Downloads\Transactions.csv')

# Enrich every transaction with its product attributes (joined on ProductID),
# then with the purchasing customer's attributes (joined on CustomerID).
# Both joins use pandas' default inner join, so transactions without a matching
# product or customer row are dropped.
merged_data = pd.merge(
    pd.merge(transactions, products, on='ProductID'),
    customers,
    on='CustomerID',
)

# Feature engineering: collapse the transaction log to one row per customer.
#   total_transactions     - number of transactions recorded for the customer
#   total_spend            - sum of TotalValue over those transactions
#   most_frequent_category - modal product category (on ties, the first value
#                            returned by Series.mode() is used)
customer_data = merged_data.groupby('CustomerID').agg(
    total_transactions=('TransactionID', 'count'),
    total_spend=('TotalValue', 'sum'),
    most_frequent_category=('Category', lambda x: x.mode()[0])
).reset_index()

# Attach the customer's region (demographic information)
customer_data = customer_data.merge(customers[['CustomerID', 'Region']], on='CustomerID')

# --- Similarity features -----------------------------------------------------
# Cosine similarity compares the *direction* of the feature vectors, so the raw
# columns cannot be used directly:
#   * total_spend is orders of magnitude larger than the other features and
#     would dominate every vector, pushing all similarities towards 1;
#   * integer category codes would impose an arbitrary ordering/distance on
#     Region and Category.
# Numeric columns are therefore z-scored and categorical columns are one-hot
# encoded. This is done BEFORE 'Region' is overwritten with codes below.
numeric_features = customer_data[['total_spend', 'total_transactions']].astype(float)
# Population std; a constant column has std 0, which is replaced by 1 so that
# the division leaves it at 0 instead of producing NaN.
numeric_spread = numeric_features.std(ddof=0).replace(0, 1)
numeric_features = (numeric_features - numeric_features.mean()) / numeric_spread

categorical_features = pd.get_dummies(
    customer_data[['Region', 'most_frequent_category']].astype(str),
    columns=['Region', 'most_frequent_category'],
    dtype=float,
)

# Keep the label-encoded columns that the original table exposed
customer_data['Region'] = customer_data['Region'].astype('category').cat.codes
customer_data['MostFrequentCategory'] = customer_data['most_frequent_category'].astype('category').cat.codes

# Customer profiles: standardised spend and transaction count followed by the
# one-hot region and category indicators (one row per customer, in the same
# row order as customer_data)
profiles = np.hstack([numeric_features.to_numpy(), categorical_features.to_numpy()])

# Store each customer's profile vector alongside the other columns
customer_data['Profile'] = profiles.tolist()

# Pairwise cosine similarity between all customer profiles:
# entry [i, j] is the similarity between the i-th and j-th rows of `profiles`.
similarity_matrix = cosine_similarity(profiles)

# Overwrite the diagonal (each customer compared with itself) with 0 so that
# self-similarity is not picked up as a match when ranking lookalikes.
similarity_matrix[np.diag_indices_from(similarity_matrix)] = 0

# How many customers to produce recommendations for, and how many lookalikes each.
# NOTE: the targets are the FIRST rows of customer_data; these correspond to
# C0001..C0020 only if each of those customers has at least one transaction.
NUM_TARGET_CUSTOMERS = 20
TOP_K = 3

# Maps CustomerID -> list of (lookalike CustomerID, similarity score),
# ordered from most to least similar
lookalikes = {}

# Row i of similarity_matrix belongs to row i of customer_data
customer_ids = customer_data['CustomerID'].to_numpy()

# Only the target rows are visited instead of scanning every customer
for i, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Similarity of this customer to every customer
    similarity_scores = similarity_matrix[i]

    # Rank all customers from most to least similar
    ranked_indices = np.argsort(similarity_scores)[::-1]

    # Drop the customer's own index explicitly: relying on a diagonal value of 0
    # alone would let a customer be reported as their own lookalike whenever
    # fewer than TOP_K other customers score above 0.
    similar_customer_indices = ranked_indices[ranked_indices != i][:TOP_K]

    # Look up the matching customer IDs and their similarity scores
    similar_customers = customer_ids[similar_customer_indices]
    scores = similarity_scores[similar_customer_indices]

    # Store the results in the dictionary
    lookalikes[customer_id] = list(zip(similar_customers, scores))

# Column layout of the exported file: the customer followed by three
# (lookalike ID, similarity score) pairs.
lookalike_columns = ['CustomerID', 'Lookalike_1_ID', 'Lookalike_1_Score',
                     'Lookalike_2_ID', 'Lookalike_2_Score',
                     'Lookalike_3_ID', 'Lookalike_3_Score']

# One flat row per customer: [CustomerID, id1, score1, id2, score2, id3, score3]
lookalike_data = []
for customer_id, recommendations in lookalikes.items():
    # Flatten the (id, score) pairs in ranking order
    flattened = [value for pair in recommendations for value in pair]

    # Pad with None when fewer than three lookalikes are available so that
    # every row has at least one value per output column
    missing = max(0, len(lookalike_columns) - 1 - len(flattened))
    lookalike_data.append([customer_id, *flattened, *([None] * missing)])

# Tabulate the rows
lookalike_df = pd.DataFrame(lookalike_data, columns=lookalike_columns)

# Write the table to disk without the DataFrame index
lookalike_df.to_csv(r"C:\Users\VIVEK KUMAR SINGH\Downloads\Ayushi_Singh_Lookalike.csv", index=False)

print("Lookalikes CSV has been successfully created!")

Lookalikes CSV has been successfully created!
