In [15]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [29]:
# Step 1: Read the three source tables from CSV files (relative paths)
customers_path = "Customers (1).csv"
products_path = "Products (1).csv"
transactions_path = "Transactions.csv"

customers_df = pd.read_csv(customers_path)
products_df = pd.read_csv(products_path)
transactions_df = pd.read_csv(transactions_path)

In [30]:
# Inspect which columns the customers table provides
customer_columns = customers_df.columns
print(customer_columns)

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')


In [31]:
# Inspect which columns the transactions table provides
transaction_columns = transactions_df.columns
print(transaction_columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')


In [33]:
# Step 2: Merge datasets
# Attach customer attributes, then product attributes, to every transaction
# row. Both joins are left joins from the transaction side, so each lookup
# table is expected to hold one row per key. validate="many_to_one" makes
# pandas raise MergeError if a CustomerID/ProductID is duplicated on the
# right-hand side, instead of silently duplicating transaction rows and
# inflating the per-customer spend and count aggregates computed later.
# "Price" is present on both sides of the product join, so the result carries
# Price_x (from the transactions side) and Price_y (from the products side).
merged_df = (
    transactions_df
    .merge(customers_df, on="CustomerID", how="left", validate="many_to_one")
    .merge(products_df, on="ProductID", how="left", validate="many_to_one")
)

In [34]:
# Preview the top rows of the merged table
merged_preview = merged_df.head()
print("Merged Dataset:")
display(merged_preview)

Merged Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [35]:
# Step 3: Feature Engineering
# Collapse the merged transactions into one profile row per customer.
# Each entry maps an output column to (source column, aggregation).
profile_aggregations = {
    "total_spend": ("TotalValue", "sum"),
    "transaction_count": ("TransactionID", "count"),
    "avg_transaction_value": ("TotalValue", "mean"),
    # {category: number of that customer's rows in the category}
    "product_categories": (
        "Category",
        lambda categories: categories.value_counts().to_dict(),
    ),
}

customer_profiles = (
    merged_df
    .groupby("CustomerID")
    .agg(**profile_aggregations)
    .reset_index()
)

In [36]:
# Preview the first per-customer profile rows
profiles_preview = customer_profiles.head()
print("\nCustomer Profiles:")
display(profiles_preview)


Customer Profiles:


Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,product_categories
0,C0001,3354.52,5,670.904,"{'Electronics': 3, 'Books': 1, 'Home Decor': 1}"
1,C0002,1862.74,4,465.685,"{'Home Decor': 2, 'Clothing': 2}"
2,C0003,2725.38,4,681.345,"{'Home Decor': 2, 'Clothing': 1, 'Electronics'..."
3,C0004,5354.88,8,669.36,"{'Books': 3, 'Home Decor': 3, 'Electronics': 2}"
4,C0005,2034.24,3,678.08,"{'Electronics': 2, 'Home Decor': 1}"


In [37]:
# Step 4: Rescale the numeric profile columns with min-max scaling
numeric_columns = ["total_spend", "transaction_count", "avg_transaction_value"]
scaler = MinMaxScaler()
scaled_numeric = scaler.fit_transform(customer_profiles[numeric_columns])
# Written back into the same columns, so customer_profiles holds the scaled
# values (not the raw totals) from this cell onward
customer_profiles[numeric_columns] = scaled_numeric

In [38]:
# Step 5: Turn each customer's {category: count} dict into a dense numeric row
category_count_dicts = customer_profiles["product_categories"]
dict_vectorizer = DictVectorizer(sparse=False)
product_categories_vectorized = dict_vectorizer.fit_transform(category_count_dicts)

In [39]:
# Place the numeric profile columns and the category-count columns side by
# side, giving one feature row per customer.
# NOTE(review): the numeric columns were min-max scaled earlier in this
# notebook, while the category columns are raw counts; the counts may therefore
# dominate the similarity computed next. Confirm this weighting is intended.
profile_numeric_columns = ["total_spend", "transaction_count", "avg_transaction_value"]
numeric_features = customer_profiles[profile_numeric_columns].values
combined_features = np.hstack((numeric_features, product_categories_vectorized))

In [40]:
# Step 6: Pairwise cosine similarity between every pair of customer feature
# rows; entry [i, j] compares customer row i with customer row j
similarity_matrix = cosine_similarity(X=combined_features)

In [41]:
# Step 7: Generate Lookalike Recommendations
# For each of the first N_TARGET_CUSTOMERS rows of customer_profiles, keep the
# TOP_K_LOOKALIKES other customers with the highest similarity score.
N_TARGET_CUSTOMERS = 20  # how many leading customers receive recommendations
TOP_K_LOOKALIKES = 3     # how many lookalikes are kept per customer

lookalike_map = {}
customer_ids = customer_profiles["CustomerID"].values

for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    similarity_scores = similarity_matrix[idx]
    # Cast each score to a built-in float: the list is later stringified for
    # the CSV, and NumPy >= 2.0 renders its scalars as "np.float64(...)",
    # which would pollute the stored text. The customer itself is skipped.
    similar_customers = [
        (customer_ids[i], float(similarity_scores[i]))
        for i in range(len(customer_ids)) if i != idx
    ]
    # sorted() is stable, so tied scores keep their original row order
    similar_customers = sorted(
        similar_customers, key=lambda pair: pair[1], reverse=True
    )[:TOP_K_LOOKALIKES]
    lookalike_map[customer_id] = similar_customers


In [42]:
# Flatten the lookalike mapping into one row per customer; each customer's
# list of (customer id, score) tuples is stored as its string representation
lookalike_rows = []
for cust_id, lookalikes in lookalike_map.items():
    lookalike_rows.append({"cust_id": cust_id, "lookalikes": str(lookalikes)})

lookalike_df = pd.DataFrame(lookalike_rows)

In [43]:
# Write the lookalike table to CSV, leaving out the DataFrame row index
output_path = "FirstName_LastName_Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

In [44]:
# Show the complete lookalike table under a heading
results_heading = "\nLookalike Recommendations for First 20 Customers:"
print(results_heading)
display(lookalike_df)


Lookalike Recommendations for First 20 Customers:


Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0146', 0.9846766203196576), ('C0035', 0.98..."
1,C0002,"[('C0133', 0.9969748280054627), ('C0134', 0.98..."
2,C0003,"[('C0166', 0.9996168521701857), ('C0031', 0.99..."
3,C0004,"[('C0047', 0.9771435420876702), ('C0172', 0.96..."
4,C0005,"[('C0197', 0.9999185648817964), ('C0007', 0.99..."
5,C0006,"[('C0135', 0.9959553423113117), ('C0187', 0.95..."
6,C0007,"[('C0005', 0.9979085027158004), ('C0197', 0.99..."
7,C0008,"[('C0181', 0.9795565955138308), ('C0162', 0.97..."
8,C0009,"[('C0092', 0.9847598909469107), ('C0049', 0.96..."
9,C0010,"[('C0077', 0.9898101503056507), ('C0083', 0.98..."
