In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime

In [23]:
# Load the three raw input tables (customers, transactions, products) from CSV
# files in the current working directory, in that order.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Transactions.csv", "Products.csv")
)

In [24]:
# Enrich every transaction with the attributes of its customer and its product.
# Both joins are left joins, so each transaction row is kept even when the
# lookup table has no matching CustomerID / ProductID.
customer_transactions = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

In [25]:
# Collapse the transaction-level table to one row per customer:
#   total_spent               - sum of TotalValue
#   transaction_count         - number of TransactionID entries
#   average_transaction_value - mean of TotalValue
# reset_index() turns the CustomerID group key back into an ordinary column.
customer_summary = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        average_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    )
    .reset_index()
)

In [26]:
# Attach each customer's Region and SignupDate to the per-customer summary.
# NOTE: this reassigns customer_summary, so re-running the cell on its own
# output would merge the same columns a second time.
customer_summary = customer_summary.merge(
    customers.loc[:, ['CustomerID', 'Region', 'SignupDate']],
    on='CustomerID',
    how='left',
)

In [27]:
# Parse SignupDate into datetimes, then derive the account age in whole days.
# NOTE: the age is measured against the wall clock at execution time, so the
# raw day counts differ from run to run.
signup_dates = pd.to_datetime(customer_summary['SignupDate'])
customer_summary['SignupDate'] = signup_dates
customer_summary['days_since_signup'] = (datetime.now() - signup_dates).dt.days

In [28]:
# Numeric per-customer features used for similarity; this list is reused by
# later cells.
features = ['total_spent', 'transaction_count', 'average_transaction_value', 'days_since_signup']

# Standardise the feature columns in place. `scaler` keeps the statistics it
# learned from the raw values.
scaler = StandardScaler()
scaler.fit(customer_summary[features])
customer_summary[features] = scaler.transform(customer_summary[features])

In [29]:
# The feature columns were already standardised by the previous cell.
# This cell used to fit a second StandardScaler on those standardised columns.
# That left the values essentially unchanged but replaced `scaler` with one
# fitted on already-scaled data, discarding the statistics learned from the raw
# values (needed e.g. for inverse_transform).
# The redundant re-fit is therefore replaced by a sanity check that the
# columns are centred; all-NaN columns are treated as trivially centred.
assert np.allclose(customer_summary[features].mean().fillna(0.0), 0.0, atol=1e-6), (
    "customer_summary feature columns are expected to be standardised already"
)

In [30]:
# Every unordered pair of customers (row i with each later row j > i), stored
# as a (row_i, row_j) tuple of pandas Series taken from customer_summary.
customer_pairs = [
    (customer_summary.iloc[first], customer_summary.iloc[second])
    for first in range(len(customer_summary))
    for second in range(first + 1, len(customer_summary))
]

In [31]:
# One record per customer pair: both customers' IDs and standardised features,
# plus the cosine similarity of their feature vectors.
pairs_data = []
for customer1, customer2 in customer_pairs:
    vector_1 = customer1[features].values
    vector_2 = customer2[features].values
    # cosine_similarity returns a 1x1 matrix for one vector against one vector.
    similarity_score = cosine_similarity([vector_1], [vector_2])[0][0]

    pair_record = {
        'CustomerID_1': customer1['CustomerID'],
        'CustomerID_2': customer2['CustomerID'],
        'TotalSpent_1': customer1['total_spent'],
        'TotalSpent_2': customer2['total_spent'],
        'TransactionCount_1': customer1['transaction_count'],
        'TransactionCount_2': customer2['transaction_count'],
        'AvgTransactionValue_1': customer1['average_transaction_value'],
        'AvgTransactionValue_2': customer2['average_transaction_value'],
        'DaysSinceSignup_1': customer1['days_since_signup'],
        'DaysSinceSignup_2': customer2['days_since_signup'],
        'SimilarityScore': similarity_score,
    }
    pairs_data.append(pair_record)

In [32]:
# Tabulate the pair records: one row per customer pair, one column per key.
pairs_df = pd.DataFrame(data=pairs_data)

In [33]:
# Model inputs: the two customers' features, interleaved per feature
# (<Feature>_1, <Feature>_2). Anything fed to the model later must use this
# same column order.
X = pairs_df.loc[:, [
    'TotalSpent_1', 'TotalSpent_2',
    'TransactionCount_1', 'TransactionCount_2',
    'AvgTransactionValue_1', 'AvgTransactionValue_2',
    'DaysSinceSignup_1', 'DaysSinceSignup_2',
]]
# Regression target: the cosine similarity computed for each pair.
y = pairs_df.loc[:, 'SimilarityScore']

In [34]:
# Hold out 20% of the customer pairs for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Random forest with 100 trees and a fixed seed, trained to reproduce the
# pairwise similarity score from the two customers' features.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [36]:
# Predicted similarity scores for the held-out pairs.
y_pred = model.predict(X=X_test)

In [37]:
# Restrict the lookalike search to the first 20 customers.
# NOTE: this overwrites `customer_summary`, so both the customers being scored
# and their candidate pool are limited to these 20 rows from here on.
customer_summary = customer_summary.head(20)

# Maps each standardised summary column to the prefix of the matching pair-level
# training columns: <prefix>_1 is the query customer, <prefix>_2 the candidate.
pair_column_prefixes = {
    'total_spent': 'TotalSpent',
    'transaction_count': 'TransactionCount',
    'average_transaction_value': 'AvgTransactionValue',
    'days_since_signup': 'DaysSinceSignup',
}

# CustomerID -> list of up to 3 (other CustomerID, predicted similarity) tuples,
# ordered from most to least similar.
lookalike_recommendations = {}

for customer_id in customer_summary['CustomerID']:
    is_query = customer_summary['CustomerID'] == customer_id
    query_row = customer_summary.loc[is_query].iloc[0]
    candidates = customer_summary.loc[~is_query]

    # With no other customer to compare against there is nothing to predict.
    if candidates.empty:
        lookalike_recommendations[customer_id] = []
        continue

    # Build one model-input row per candidate. The model was fitted on named,
    # per-feature interleaved columns (TotalSpent_1, TotalSpent_2, ...), so the
    # rows must use the same names and order. Concatenating the two customers'
    # feature vectors back to back would put most values in the wrong columns.
    pair_features = pd.DataFrame(index=candidates.index)
    for summary_column, prefix in pair_column_prefixes.items():
        pair_features[f'{prefix}_1'] = query_row[summary_column]
        pair_features[f'{prefix}_2'] = candidates[summary_column]
    # Guard: force exactly the training column order.
    pair_features = pair_features[list(X.columns)]

    # One batched prediction per query customer instead of one call per pair.
    predicted_similarities = model.predict(pair_features)

    # sorted() is stable, so ties keep the candidates' original order.
    similarities = sorted(
        zip(candidates['CustomerID'], predicted_similarities),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Store the top 3 recommendations.
    lookalike_recommendations[customer_id] = similarities[:3]



In [38]:
# For a regressor, `score` reports the R^2 coefficient of determination on the
# held-out pairs; it is not a classification accuracy, even though the printed
# label says "Accuracy".
model_score = model.score(X=X_test, y=y_test)
print(f"Model Accuracy: {model_score:.2f}")

Model Accuracy: 0.97


In [39]:

# Nested view: one entry per customer holding its (at most 3) lookalikes as a
# list of {'LookalikeCustomerID', 'SimilarityScore'} dicts.
lookalike_list = [
    {
        'CustomerID': customer_id,
        'Lookalikes': [
            {'LookalikeCustomerID': lookalike_id, 'SimilarityScore': score}
            for lookalike_id, score in recommendations[:3]
        ],
    }
    for customer_id, recommendations in lookalike_recommendations.items()
]

# Explicit columns keep the frame well-formed even with no recommendations.
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'Lookalikes'])

# Flat view for CSV export: one row per customer with
# Lookalike<k>_ID / SimilarityScore<k> columns for k = 1..3.
expanded_rows = []
for entry in lookalike_list:
    row = {'CustomerID': entry['CustomerID']}
    for rank, lookalike in enumerate(entry['Lookalikes'], start=1):
        row[f'Lookalike{rank}_ID'] = lookalike['LookalikeCustomerID']
        row[f'SimilarityScore{rank}'] = lookalike['SimilarityScore']
    expanded_rows.append(row)

expanded_columns = ['CustomerID'] + [
    column
    for rank in range(1, 4)
    for column in (f'Lookalike{rank}_ID', f'SimilarityScore{rank}')
]

# Fixing the column list up front means customers with fewer than 3 lookalikes
# get NaN in the unused slots, and the export no longer fails when no customer
# has a k-th lookalike.
lookalike_df_expanded = pd.DataFrame(expanded_rows, columns=expanded_columns)

# Save to CSV
lookalike_df_expanded.to_csv('Lookalike.csv', index=False)