In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse


In [6]:
# Read the three raw tables from CSV files in the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Join transactions -> products -> customers in one chain. Both joins are
# inner joins, so a transaction row survives only when its ProductID and
# CustomerID both have a match, and customers without any transaction do not
# appear in the result at all.
merged_data = (
    transactions
    .merge(products, on='ProductID', how='inner')
    .merge(customers, on='CustomerID', how='inner')
)

# Column listing. 'Price_x' / 'Price_y' carry pandas' default merge suffixes:
# a 'Price' column exists on both sides of the first merge
# (x = transactions, y = products).
print(merged_data.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')


In [7]:
# Build one profile row per customer from the merged transaction table.
# 'TotalValue' and 'Quantity' are summed, 'Price_x' is averaged, and the
# 'ProductID' / 'Category' values are collected into Python lists (one entry
# per merged transaction row, duplicates kept).
profile_aggregations = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price_x': 'mean',
    'ProductID': list,
    'Category': list,
}

customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(profile_aggregations)
    .reset_index()  # CustomerID becomes a regular column again
    .fillna(0)      # any NaN left in the aggregated frame is replaced by 0
)


In [8]:
# Encode the per-customer 'ProductID' and 'Category' lists as count vectors.
# Every profile cell is already a list of tokens, so the tokenizer is the
# identity function and lowercasing is disabled to keep the values verbatim.
# token_pattern=None states explicitly that the default regex is not used;
# without it scikit-learn warns on fit that 'token_pattern' is ignored because
# a custom tokenizer was supplied.
product_vectorizer = CountVectorizer(
    tokenizer=lambda x: x, lowercase=False, token_pattern=None
)
category_vectorizer = CountVectorizer(
    tokenizer=lambda x: x, lowercase=False, token_pattern=None
)

# Sparse count matrices: one row per customer_profiles row, one column per
# distinct product ID / category seen during fitting.
product_encoded = product_vectorizer.fit_transform(customer_profiles['ProductID'])
category_encoded = category_vectorizer.fit_transform(customer_profiles['Category'])

# Standardise the numeric profile columns (StandardScaler: zero mean and unit
# variance per column) before they are placed next to the count features.
numerical_features = customer_profiles[['TotalValue', 'Quantity', 'Price_x']]
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(numerical_features)

# Final feature matrix: scaled numerics, then product counts, then category
# counts. CSR is requested explicitly because hstack otherwise returns a COO
# matrix, which cannot be indexed by row.
combined_features = scipy.sparse.hstack(
    [numerical_scaled, product_encoded, category_encoded], format='csr'
)




In [9]:
# Computing cosine similarity between all customers.
# Called with a single argument, cosine_similarity compares every row of
# combined_features with every row of the same matrix, so entry [i, j] is the
# similarity between the customers in rows i and j of customer_profiles (the
# feature matrix was built from that frame, row for row).
similarity_matrix = cosine_similarity(combined_features)

# Function to get top N most similar customers for a given customer
def get_top_lookalikes(customer_id, similarity_matrix, customer_profiles, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_profiles['CustomerID']``.
    similarity_matrix
        Square array-like of pairwise similarity scores. Row/column ``i`` must
        correspond to the ``i``-th row (by position) of ``customer_profiles``.
    customer_profiles : pandas.DataFrame
        Frame with a ``'CustomerID'`` column, in the same row order as
        ``similarity_matrix``. Its index labels are irrelevant.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of (customer id, score) tuples
        Sorted by descending score; the queried customer is never included.
        Fewer than ``top_n`` entries are returned when there are not enough
        other customers.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``customer_profiles``.
    """
    customer_ids = customer_profiles['CustomerID'].to_numpy()

    # Resolve the customer to a *positional* row: the similarity matrix is
    # addressed by position, so the frame's index labels must not be used.
    positions = np.flatnonzero((customer_profiles['CustomerID'] == customer_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_profiles")
    customer_index = int(positions[0])

    # Similarity of this customer to every customer (self included)
    similarity_scores = np.asarray(similarity_matrix[customer_index])

    # Rank by descending score. The stable sort keeps tied customers in row
    # order, making the result deterministic.
    ranked_indices = np.argsort(-similarity_scores, kind='stable')

    # Drop the customer itself by index instead of assuming it ranks first:
    # another customer can tie with (or, through floating-point error, exceed)
    # the self-similarity score.
    ranked_indices = ranked_indices[ranked_indices != customer_index]
    similar_indices = ranked_indices[:max(top_n, 0)]

    # Map indices to Customer IDs and scores
    return [(customer_ids[idx], similarity_scores[idx]) for idx in similar_indices]


In [10]:
# Generating lookalike data for the first 20 customers.
# customer_profiles only contains customers that survived the inner merges
# (i.e. have at least one transaction), while the loop walks the raw customers
# table, so membership is checked before querying the similarity matrix.
profiled_ids = set(customer_profiles['CustomerID'])

lookalike_data = {}
for customer_id in customers['CustomerID'].head(20):  # First 20 customers (by position)
    if customer_id not in profiled_ids:
        # No profile row -> no similarity row; record "no lookalikes" rather
        # than aborting the whole export.
        lookalike_data[customer_id] = []
        continue

    lookalikes = get_top_lookalikes(customer_id, similarity_matrix, customer_profiles)
    # Cast scores to built-in floats so the CSV text is the plain number on
    # every NumPy version (NumPy >= 2 renders scalars as 'np.float64(...)').
    lookalike_data[customer_id] = [
        (other_id, float(score)) for other_id, score in lookalikes
    ]

# Converting lookalike data into a structured format for saving:
# one row per customer, with the list of (customer id, score) pairs in 'lookalikes'.
lookalike_df = pd.DataFrame(
    [
        {'cust_id': cust_id, 'lookalikes': lookalike}
        for cust_id, lookalike in lookalike_data.items()
    ],
    columns=['cust_id', 'lookalikes'],
)

lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike.csv created successfully!")


Lookalike.csv created successfully!
