In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the datasets
customers_df = pd.read_csv("Customers.csv")
transactions_df = pd.read_csv("Transactions.csv")
products_df = pd.read_csv("Products.csv")

## Basic Preprocessing

In [3]:
# Merge transaction data with customer data
customer_transactions = transactions_df.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'count'),
    unique_products=('ProductID', 'nunique')
).reset_index()

# Merge the customer data and the transaction data
merged_df = pd.merge(customers_df, customer_transactions, on="CustomerID", how="left")

In [4]:
merged_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,total_spent,num_transactions,unique_products
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,5.0,5.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,4.0,4.0
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,4.0,4.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,8.0,8.0
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,3.0,3.0


In [5]:
merged_df.shape

(200, 7)

In [6]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        200 non-null    object 
 1   CustomerName      200 non-null    object 
 2   Region            200 non-null    object 
 3   SignupDate        200 non-null    object 
 4   total_spent       199 non-null    float64
 5   num_transactions  199 non-null    float64
 6   unique_products   199 non-null    float64
dtypes: float64(3), object(4)
memory usage: 11.1+ KB


In [12]:
merged_df.dropna(inplace=True)

In [14]:
merged_df['num_transactions'] = merged_df['num_transactions'].astype(int)
merged_df['unique_products'] = merged_df['unique_products'].astype(int)

## Feature Engineering

In [16]:
merged_df['SignupYear'] = pd.to_datetime(merged_df['SignupDate']).dt.year
merged_df['Region'] = merged_df['Region'].map({'North America': 0, 'Europe': 1, 'Asia': 2, 'South America': 3})

## Scaling

In [17]:
# Normalize transaction-based features (total_spent, num_transactions, etc.)
scaler = StandardScaler()
transaction_features = merged_df[['total_spent', 'num_transactions', 'unique_products']]
scaled_features = scaler.fit_transform(transaction_features)

In [18]:
# Combine profile features with transaction features
profile_features = merged_df[['Region', 'SignupYear']].values
combined_features = np.hstack([profile_features, scaled_features])

## cosine similarity

In [19]:
# Calculate Cosine Similarity between all customers
cosine_sim = cosine_similarity(combined_features)

In [20]:
# Create a DataFrame to store the similarity scores
similarity_df = pd.DataFrame(cosine_sim, columns=merged_df['CustomerID'], index=merged_df['CustomerID'])

In [21]:
# Function to recommend top 3 lookalike customers for each customer
def get_top_lookalikes(customer_id, top_n=3):
    # Get similarity scores for the given customer and sort by highest score (exclude self-similarity)
    similarity_scores = similarity_df[customer_id].drop(customer_id).sort_values(ascending=False).head(top_n)
    recommendations = similarity_scores.index.tolist()
    scores = similarity_scores.values.tolist()
    return recommendations, scores

In [22]:
# Recommend lookalikes for the first 20 customers (C0001 to C0020)
lookalike_map = {}
for customer_id in [f"C{str(i).zfill(4)}" for i in range(1, 21)]:
    recommendations, scores = get_top_lookalikes(customer_id)
    lookalike_map[customer_id] = [{"cust_id": rec, "score": score} for rec, score in zip(recommendations, scores)]

# Save the lookalike recommendations to a CSV
lookalike_df = pd.DataFrame([
    {'cust_id': cust_id, 'lookalikes': str(lookalikes)} for cust_id, lookalikes in lookalike_map.items()
])

# Output the result for the first 20 customers
print(lookalike_df.head(20))

   cust_id                                         lookalikes
0    C0001  [{'cust_id': 'C0137', 'score': 0.9999999999814...
1    C0002  [{'cust_id': 'C0142', 'score': 0.9999999947840...
2    C0003  [{'cust_id': 'C0133', 'score': 0.9999999990803...
3    C0004  [{'cust_id': 'C0113', 'score': 0.9999999617130...
4    C0005  [{'cust_id': 'C0159', 'score': 0.9999999999466...
5    C0006  [{'cust_id': 'C0158', 'score': 0.9999999904655...
6    C0007  [{'cust_id': 'C0159', 'score': 0.9999999905780...
7    C0008  [{'cust_id': 'C0109', 'score': 0.9999999201651...
8    C0009  [{'cust_id': 'C0197', 'score': 0.9999999610525...
9    C0010  [{'cust_id': 'C0199', 'score': 0.9999999974931...
10   C0011  [{'cust_id': 'C0107', 'score': 0.9999999995873...
11   C0012  [{'cust_id': 'C0155', 'score': 0.9999999716964...
12   C0013  [{'cust_id': 'C0087', 'score': 0.9999999866904...
13   C0014  [{'cust_id': 'C0060', 'score': 0.9999999979015...
14   C0015  [{'cust_id': 'C0144', 'score': 0.9999999991961...
15   C00

In [23]:
lookalike_df.to_csv('Lookalike.csv', index=False)