Import the libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


Load datasets

In [5]:
# Read the three source tables from CSV files in the current working directory,
# in the order: transactions, products, customers.
transactions_df, products_df, customers_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Transactions.csv', 'Products.csv', 'Customers.csv')
)


Merge datasets

In [6]:
# Attach the matching product row, then the matching customer row, to every
# transaction. Both joins are inner joins (the pandas default, spelled out
# here), so a transaction without a matching product or customer is dropped.
transactions_merged = (
    transactions_df
    .merge(products_df, on='ProductID', how='inner')
    .merge(customers_df, on='CustomerID', how='inner')
)

Aggregate transaction history by customer

In [7]:
# Collapse the merged transactions to one row per customer:
#   ProductName / Category -> all purchased values joined by single spaces
#   TotalValue             -> summed over the customer's transactions
customer_profiles = (
    transactions_merged
    .groupby('CustomerID')
    .agg(
        ProductName=('ProductName', ' '.join),
        Category=('Category', ' '.join),
        TotalValue=('TotalValue', 'sum'),
    )
    .reset_index()
)

Combine textual and numerical data to create customer profiles

In [8]:
# Build one text "document" per customer: product names, categories and the
# total spend rendered as a string, separated by single spaces.
# NOTE(review): the stringified spend is tokenized like any other word by the
# vectorizer that consumes this column, so it presumably acts as a near-unique
# token rather than as a numeric magnitude -- confirm this is intended.
product_text = customer_profiles['ProductName']
category_text = customer_profiles['Category']
spend_text = customer_profiles['TotalValue'].astype(str)

customer_profiles['CombinedFeatures'] = product_text + ' ' + category_text + ' ' + spend_text


Use TF-IDF Vectorizer to encode textual features

In [9]:
# Learn a TF-IDF vocabulary from the per-customer documents and encode each
# customer as one row of the resulting matrix (default vectorizer settings).
profile_documents = customer_profiles['CombinedFeatures']
vectorizer = TfidfVectorizer()
customer_tfidf = vectorizer.fit_transform(profile_documents)

Compute similarity matrix

In [10]:
# Pairwise cosine similarity between all customer rows; entry [i, j] compares
# customer i with customer j, using the row order of customer_profiles.
similarity_matrix = cosine_similarity(X=customer_tfidf)

Generate lookalike recommendations


In [13]:
# Tunable constants for this cell.
TOP_N = 3                 # lookalikes kept per customer
N_TARGET_CUSTOMERS = 20   # leading rows of Customers.csv to report on

# Hoisted out of the loop: a plain list lookup per neighbour replaces building a
# full row Series with `.iloc[i]` for every (customer, neighbour) pair.
profile_customer_ids = customer_profiles['CustomerID'].tolist()

# Maps each customer ID to a list of up to TOP_N (lookalike ID, similarity) pairs,
# ordered from most to least similar.
lookalikes = {}

for idx, customer_id in enumerate(profile_customer_ids):
    scores = similarity_matrix[idx]

    # Rank every customer by descending similarity. The stable sort keeps tied
    # scores in their original row order, matching sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the customer itself, then keep the TOP_N closest neighbours.
    nearest = ranked[ranked != idx][:TOP_N]
    lookalikes[customer_id] = [(profile_customer_ids[j], scores[j]) for j in nearest]

# Keep only the customers in the first N_TARGET_CUSTOMERS rows of Customers.csv
# (C0001 to C0020, assuming the file is ordered by ID). The ID set is built once
# instead of being re-sliced for every dictionary entry. Customers without any
# transaction have no profile and therefore no entry here.
target_customer_ids = set(customers_df['CustomerID'].iloc[:N_TARGET_CUSTOMERS])

lookalikes_filtered = {
    cust_id: similars
    for cust_id, similars in lookalikes.items()
    if cust_id in target_customer_ids
}


Convert lookalikes to DataFrame and save as CSV

In [15]:
# One output row per target customer: its ID plus a list of
# (lookalike ID, similarity rounded to 3 decimals) pairs.
# Scores are cast to built-in floats before rounding: the list is written to the
# CSV through its repr, and NumPy >= 2.0 renders NumPy scalars as
# `np.float64(0.123)`, which would otherwise leak into the file.
lookalike_data = [
    {
        'cust_id': cust_id,
        'lookalikes': [(similar_id, round(float(score), 3)) for similar_id, score in similars],
    }
    for cust_id, similars in lookalikes_filtered.items()
]

# Explicit columns keep the CSV header intact even when no customer passed the filter.
lookalike_df = pd.DataFrame(lookalike_data, columns=['cust_id', 'lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)