In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load the data from CSV files
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [3]:
# Encoding the 'Region' feature using LabelEncoder
le = LabelEncoder()
customers['Region_encoded'] = le.fit_transform(customers['Region'])

In [4]:
# Feature 1: Days since signup
customers['Days_since_signup'] = (pd.to_datetime('today') - pd.to_datetime(customers['SignupDate'])).dt.days

# Merge transactions with products to get the Price and Category information
merged_data = pd.merge(transactions, products[['ProductID', 'Price', 'Category']], on='ProductID', how='left')

# Aggregate transaction data for each customer
customer_transactions = merged_data.groupby('CustomerID').agg(
    total_purchases=('TransactionID', 'count'),
    total_spending=('TotalValue', 'sum'),
    avg_purchase_amount=('TotalValue', 'mean'),
    most_frequent_category=('Category', lambda x: x.mode()[0])
).reset_index()

# Merge the customer profile features with transaction features
customer_profiles = customers[['CustomerID', 'Region_encoded', 'Days_since_signup']].merge(customer_transactions, on='CustomerID')

# Create a feature vector for each customer by combining profile and transaction features
customer_profiles['profile_vector'] = customer_profiles[['Region_encoded', 'Days_since_signup', 'total_purchases', 'total_spending', 'avg_purchase_amount']].apply(lambda x: x.values, axis=1)

# Normalize the feature vectors for similarity calculation
normalized_profiles = np.array([np.array(vec) / np.linalg.norm(vec) if np.linalg.norm(vec) != 0 else vec for vec in customer_profiles['profile_vector']])

In [5]:
# Calculate cosine similarity between each customer and all other customers
similarity_matrix = cosine_similarity(normalized_profiles)

# Create a DataFrame to store the similarity scores for each customer
lookalike_data = {}

In [6]:
# Loop over the first 20 customers and get top 3 lookalikes
for idx, customer_id in enumerate(customer_profiles['CustomerID'][:20]):
    similarities = similarity_matrix[idx]
    top_3_indices = similarities.argsort()[-4:-1][::-1]  # Exclude the customer itself (index 0)
    top_3_similar_customers = [(customer_profiles['CustomerID'].iloc[i], similarities[i]) for i in top_3_indices]
    lookalike_data[customer_id] = top_3_similar_customers

In [7]:
# Convert the data into a DataFrame for the 'Lookalike.csv' file
lookalike_df = []
for cust_id, recommendations in lookalike_data.items():
    for rec in recommendations:
        lookalike_df.append([cust_id, rec[0], rec[1]])

lookalike_df = pd.DataFrame(lookalike_df, columns=['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score'])

In [8]:
# Save the results to 'Lookalike.csv'
lookalike_df.to_csv('Lookalike.csv', index=False)

# Output a preview of the lookalike recommendations
print(lookalike_df.head())

  CustomerID Lookalike_CustomerID  Similarity_Score
0      C0001                C0174          0.999992
1      C0001                C0106          0.999918
2      C0001                C0088          0.999914
3      C0002                C0029          0.999973
4      C0002                C0025          0.999968


In [9]:
import pandas as pd
lookalike_df = pd.read_csv('Lookalike.csv')
print(lookalike_df.head(3))

  CustomerID Lookalike_CustomerID  Similarity_Score
0      C0001                C0174          0.999992
1      C0001                C0106          0.999918
2      C0001                C0088          0.999914
