In [8]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [9]:
# Read the three raw tables from the local data directory.
customers = pd.read_csv('./data/Customers.csv')
transactions = pd.read_csv('./data/Transactions.csv')
products = pd.read_csv('./data/Products.csv')

# Enrich every transaction with the attributes of its customer and product.
# Both joins use pandas' default inner join, so transactions without a
# matching customer or product row are dropped.
customer_transactions = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [10]:
# Per-customer spending profile. Only customers that appear in
# `customer_transactions` (i.e. have at least one transaction) get a row.
customer_summary = customer_transactions.groupby(['CustomerID']).agg(
    total_spent=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'nunique'),
    avg_transaction_value=('TotalValue', 'mean')
).reset_index()

# Attach each customer's region. `customers` itself is deliberately NOT
# modified, so the original region labels stay available and re-running this
# cell always starts from the same input.
customer_data = pd.merge(customer_summary, customers[['CustomerID', 'Region']], on='CustomerID')

# Region is a nominal category. Encoding it as a single integer column would
# impose an arbitrary order/distance between regions once scaled, which
# distorts cosine similarity. One-hot encoding gives every region its own
# indicator column instead, so two customers either share a region or do not.
numeric_features = customer_data[['total_spent', 'total_transactions', 'avg_transaction_value']]
region_features = pd.get_dummies(customer_data['Region'], prefix='Region', dtype=float)
feature_frame = pd.concat([numeric_features, region_features], axis=1)

# Standardise every column so that no single feature dominates the
# similarity purely because of its unit or magnitude.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Square matrix: entry [i, j] is the cosine similarity between the customers
# in rows i and j of `customer_data`.
similarity_matrix = cosine_similarity(scaled_features)

In [11]:
# Build the top-N lookalike list for customers C0001 .. C0020.
NUM_TARGET_CUSTOMERS = 20  # how many customer IDs (starting at C0001) to process
TOP_N = 3                  # how many lookalikes to keep per customer

# Rows of `similarity_matrix` are positional, so map each CustomerID to its
# row position explicitly rather than relying on the DataFrame index labels.
customer_ids = customer_data['CustomerID'].tolist()
row_position = {}
for position, cid in enumerate(customer_ids):
    row_position.setdefault(cid, position)  # keep the first occurrence

lookalikes = {}
for customer_number in range(1, NUM_TARGET_CUSTOMERS + 1):
    customer_id = f'C{customer_number:04d}'
    customer_index = row_position.get(customer_id)
    if customer_index is None:
        # No row in `customer_data` means no feature vector to compare against,
        # so there is nothing meaningful to report for this ID.
        print(f'Skipping {customer_id}: not present in customer_data')
        continue

    # Drop the customer's own entry by position. Slicing off "the first
    # result" after sorting is not safe: another customer with a tied (or,
    # through rounding, marginally higher) score can sort ahead of the
    # customer's own row.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_matrix[customer_index])
        if position != customer_index
    ]
    # Stable sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    lookalikes[customer_id] = [
        (customer_ids[position], score) for position, score in candidates[:TOP_N]
    ]

In [12]:
# Flatten {customer: [(id, score), (id, score), (id, score)]} into one wide
# row per customer and persist it as a CSV file.
output_columns = ['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']

lookalike_rows = []
for source_id, matches in lookalikes.items():
    # Exactly the first three matches are written; each is an (id, score) pair.
    first, second, third = matches[0], matches[1], matches[2]
    lookalike_rows.append(
        (source_id, first[0], first[1], second[0], second[1], third[0], third[1])
    )

lookalike_df = pd.DataFrame(lookalike_rows, columns=output_columns)

lookalike_df.to_csv('Lookalike.csv', index=False)