In [19]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

LOAD DATASET

In [22]:
# Read the three raw tables from CSV files in the working directory.
# The tuple order fixes both the read order and the unpacking order.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [23]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins rely on pandas' default join type (inner), so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

FEATURE ENGINEERING

In [13]:
def _most_frequent(values):
    """Return the first of the modes reported by ``values.mode()``."""
    return values.mode()[0]


# Collapse the transaction-level table to one row per customer:
#   TotalValue / Quantity -> summed over all of the customer's transactions
#   Category              -> the customer's most frequent product category
#   Region                -> first value seen (assumed constant per customer)
per_customer = merged_data.groupby('CustomerID')
customer_features = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    Category=('Category', _most_frequent),
    Region=('Region', 'first'),
).reset_index()

In [14]:
# One-hot encode the two categorical customer attributes and densify the
# encoder's output so it can be placed in a DataFrame later.
categorical_frame = customer_features[['Category', 'Region']]
encoder = OneHotEncoder()
encoded_sparse = encoder.fit_transform(categorical_frame)
encoded_features = encoded_sparse.toarray()

In [15]:
# Standardise the numeric spend/volume columns with StandardScaler so they are
# on a comparable scale before being combined with the one-hot columns.
numeric_frame = customer_features[['TotalValue', 'Quantity']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_frame)

In [16]:
# Assemble the per-customer feature matrix: the scaled numeric columns followed
# by the one-hot columns. Both parts carry a default RangeIndex, so rows line up
# positionally with `customer_features`.
numeric_part = pd.DataFrame(scaled_features, columns=['TotalValue', 'Quantity'])

# Label the one-hot columns with the encoder's own feature names instead of
# pandas' default integers 0..k-1. A frame that mixes str and int column labels
# is rejected by scikit-learn estimators that validate feature names, and the
# integer labels say nothing about which category/region a column stands for.
categorical_part = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(),
)

feature_matrix = pd.concat([numeric_part, categorical_part], axis=1)

In [17]:
# Pairwise cosine similarity between all customer feature vectors;
# entry [i, j] compares row i and row j of `feature_matrix`.
similarity_matrix = cosine_similarity(X=feature_matrix)

Get top 3 lookalikes for the first 20 customers

In [18]:
# Build {CustomerID: [(lookalike CustomerID, similarity score), ...]} for the
# leading rows of `customer_features`, keeping the most similar *other*
# customers for each one.
NUM_TARGET_CUSTOMERS = 20  # how many leading customers get a lookalike list
TOP_N = 3                  # lookalikes kept per customer

lookalike_map = {}
# Clamp the range so a dataset with fewer than NUM_TARGET_CUSTOMERS customers
# does not raise IndexError.
for i in range(min(NUM_TARGET_CUSTOMERS, len(customer_features))):
    customer_id = customer_features.iloc[i]['CustomerID']
    similarity_scores = similarity_matrix[i]
    # Rank all customers from most to least similar, then remove row i itself
    # by index. Merely slicing off the last argsort entry is unsafe: when
    # another customer ties with the self-similarity (identical feature vector,
    # or floating-point noise on the diagonal), self is not guaranteed to sort
    # last and would be reported as its own lookalike.
    ranked_indices = similarity_scores.argsort()[::-1]
    top_3_indices = ranked_indices[ranked_indices != i][:TOP_N]
    top_3 = [
        (customer_features.iloc[idx]['CustomerID'], similarity_scores[idx])
        for idx in top_3_indices
    ]
    lookalike_map[customer_id] = top_3

In [24]:
# Persist the lookalike map as a headerless two-column CSV:
#   column 1 -> the customer ID
#   column 2 -> the list of (lookalike ID, score) pairs, as its Python repr
# Two fixes over writing the f-string by hand:
#   * The list repr contains commas, so it has to be quoted to remain a single
#     CSV field; DataFrame.to_csv applies that quoting automatically.
#   * Scores are converted to built-in floats so they are written as plain
#     numbers; NumPy >= 2 would otherwise render them as `np.float64(...)`.
lookalike_rows = [
    (cust_id, str([(other_id, float(score)) for other_id, score in lookalikes]))
    for cust_id, lookalikes in lookalike_map.items()
]
pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes']).to_csv(
    'Lookalike.csv', header=False, index=False
)

In [25]:
# Tell the reader that the export cell ran to completion.
print('Lookalike.csv file has been created successfully!')

Lookalike.csv file has been created successfully!
