In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Read the three raw tables from CSV files under /content.
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Enrich every transaction row with its customer's columns, then with its
# product's columns. Both joins are left joins, so a transaction is kept even
# when its CustomerID or ProductID has no match in the lookup table.
transactions_customers = pd.merge(
    transactions, customers, on="CustomerID", how="left"
)
transactions_full = pd.merge(
    transactions_customers, products, on="ProductID", how="left"
)

# Collapse the enriched transaction log into one row per customer:
#   - ProductID / Category: every value seen for the customer, as a list
#   - TotalValue / Quantity: summed across the customer's transactions
#   - Region: a single value per customer, taken with the 'first' aggregation
profile_aggregations = {
    'ProductID': list,
    'Category': list,
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Region': 'first',
}
customer_profiles = (
    transactions_full
    .groupby("CustomerID")
    .agg(profile_aggregations)
    .reset_index()
)

# De-duplicate the per-customer lists into sets (categories first, then
# products, which fixes the order in which the new columns are appended).
for source_col, target_col in (
    ('Category', 'UniqueCategories'),
    ('ProductID', 'UniqueProducts'),
):
    customer_profiles[target_col] = customer_profiles[source_col].apply(set)

# Incorporate numeric features
def _min_max_scale(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    When the Series has no spread (every value equal, a single row, or no rows
    at all) the usual formula divides by zero and yields NaN, which the later
    cosine-similarity step cannot accept. In that case every entry is set to
    0.0 instead, so the feature simply carries no signal.
    """
    lowest = values.min()
    spread = values.max() - lowest
    if pd.isna(spread) or spread == 0:
        return pd.Series(0.0, index=values.index)
    return (values - lowest) / spread


customer_profiles['SpendingScaled'] = _min_max_scale(customer_profiles['TotalValue'])
customer_profiles['QuantityScaled'] = _min_max_scale(customer_profiles['Quantity'])

# Prepare features for similarity calculation
def _profile_tokens(row):
    """Build the space-separated token string for one customer profile row.

    The string lists the customer's unique categories followed by their unique
    product IDs. Null entries are dropped: the upstream left joins can leave a
    missing Category, and joining a float NaN would raise TypeError. Values are
    cast to ``str`` so non-string IDs do not break the join, and each group is
    sorted so the text is identical from run to run (set iteration order is
    not).
    """
    categories = sorted(str(v) for v in row['UniqueCategories'] if pd.notna(v))
    product_ids = sorted(str(v) for v in row['UniqueProducts'] if pd.notna(v))
    return ' '.join(categories + product_ids)


customer_profiles['Features'] = customer_profiles.apply(_profile_tokens, axis=1)

# Pull the two scaled numeric columns out as a plain (customers x 2) array.
numeric_features = customer_profiles[['SpendingScaled', 'QuantityScaled']].to_numpy()

# Encode each customer's token string as a TF-IDF row vector.
vectorizer = TfidfVectorizer()
feature_matrix = vectorizer.fit_transform(customer_profiles['Features'])

# Densify the TF-IDF matrix and append the numeric columns on the right, so
# every customer is described by one combined feature row.
combined_features = np.concatenate(
    (feature_matrix.toarray(), numeric_features),
    axis=1,
)

# Pairwise cosine similarity between all customer rows; entry [i, j] compares
# customer i with customer j.
similarity_matrix = cosine_similarity(combined_features)

# Generate the top 3 lookalikes for the first 20 customers (or for all of them
# when fewer than 20 profiles exist).
NUM_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalike_results = {}

# similarity_matrix is indexed by row position, so resolve IDs by position too.
profile_ids = customer_profiles['CustomerID'].tolist()

# Clamp the loop so a dataset with fewer profiles than the target count does
# not raise on a missing row.
for idx in range(min(NUM_TARGET_CUSTOMERS, len(profile_ids))):
    # Exclude the customer themselves by position. Slicing off the first
    # sorted entry is not safe: another customer with an identical profile
    # ties at the top score (and rounding error can even push the self-score
    # slightly below it), which would leave the customer in their own list.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable descending sort: equal scores keep ascending row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Convert to a builtin float before rounding; a NumPy scalar would be
    # rendered as "np.float64(...)" when the list is later stringified.
    lookalike_results[profile_ids[idx]] = [
        (profile_ids[other_idx], round(float(score), 2))
        for other_idx, score in candidates[:TOP_K]
    ]

# Coverage: the number of distinct customers that appear in at least one
# lookalike list, as a fraction of all profiled customers.
recommended_ids = set()
for rec_list in lookalike_results.values():
    recommended_ids.update(rec[0] for rec in rec_list)
coverage = len(recommended_ids) / len(customer_profiles)

# Report the evaluation metric.
print("Coverage:", coverage)

# Write one row per target customer: their ID and the string form of their
# list of (lookalike ID, score) tuples. The DataFrame index is not written.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [str(matches) for matches in lookalike_results.values()],
})
lookalike_df.to_csv('Lookalike.csv', index=False)


Coverage: 0.25125628140703515
