In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Read the customer master table and the transaction log from the working
# directory; both frames are consumed by the cells below.
customers_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./Customers.csv", "./Transactions.csv")
)

In [6]:
# Collapse the transaction log to one row per customer: summed TotalValue,
# number of transactions, and number of distinct products.
transaction_aggregation = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        UniqueProducts=('ProductID', 'nunique'),
    )
    .reset_index()
)

# Left join so every row of customers_df is kept, including customers that
# never appear in the transaction log.
customer_profiles = customers_df.merge(
    transaction_aggregation,
    on='CustomerID',
    how='left',
)

# Customers without transactions come out of the join as NaN in the three
# aggregate columns; treat them as zero activity.
customer_profiles = customer_profiles.fillna(
    {'TotalSpending': 0, 'TransactionCount': 0, 'UniqueProducts': 0}
)

In [10]:
# Columns that feed the similarity computation in the next cell.
numeric_features = ['TotalSpending', 'TransactionCount', 'UniqueProducts']

# Standardise the numeric features, overwriting the raw values in
# customer_profiles with their scaled counterparts.
scaler = StandardScaler()
scaler.fit(customer_profiles[numeric_features])
customer_profiles[numeric_features] = scaler.transform(customer_profiles[numeric_features])

In [11]:
# Query set: the first 20 rows whose CustomerID starts with 'C00'
# (depends on the row order of customer_profiles).
# NOTE(review): lookalike candidates are drawn only from these same 20 rows,
# not from the full customer base — confirm this is the intended pool.
subset_profiles = customer_profiles[customer_profiles['CustomerID'].str.startswith('C00')][:20]

# Pairwise cosine similarity on the scaled numeric features; entry [i, j]
# compares row i with row j of subset_profiles.
similarities = cosine_similarity(subset_profiles[numeric_features])

# Positional list of IDs so matrix indices map back to customers without a
# per-row .iloc lookup inside the loop.
subset_ids = subset_profiles['CustomerID'].tolist()

lookalikes = {}
for idx, customer_id in enumerate(subset_ids):
    # Exclude the customer's own entry explicitly. Dropping "the first sorted
    # element" is not safe: the sort is stable, so an earlier row with an
    # identical feature vector (or a score that rounds marginally higher)
    # sorts ahead of the self-match and the customer would end up in their
    # own recommendation list.
    candidate_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarities[idx])
        if other_idx != idx
    ]
    # Keep the three most similar remaining customers, best first.
    top_lookalikes = sorted(candidate_scores, key=lambda pair: pair[1], reverse=True)[:3]
    lookalikes[customer_id] = [(subset_ids[other_idx], score) for other_idx, score in top_lookalikes]

# Flatten to one row per (customer, recommended customer, score) triple.
lookalike_data = [
    (customer_id, recommended_id, score)
    for customer_id, recommendations in lookalikes.items()
    for recommended_id, score in recommendations
]
lookalike_df = pd.DataFrame(lookalike_data, columns=["CustomerID", "RecommendedCustomerID", "SimilarityScore"])

In [12]:
# Write the lookalike table to disk without the DataFrame index column.
lookalike_csv_path = './Lookalike.csv'
lookalike_df.to_csv(path_or_buf=lookalike_csv_path, index=False)

# Echo the destination path as the cell's output.
lookalike_csv_path

'./Lookalike.csv'