In [7]:
import os

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [8]:
# Directory holding the three input CSVs. Set the LOOKALIKE_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original location is used, so existing runs are unaffected.
DATA_DIR = os.environ.get("LOOKALIKE_DATA_DIR", r'C:\Users\saihg\Downloads')

customers_path = os.path.join(DATA_DIR, 'Customers.csv')
products_path = os.path.join(DATA_DIR, 'Products.csv')
transactions_path = os.path.join(DATA_DIR, 'Transactions.csv')

# Load each table with pandas' default parsing; date columns stay as strings
# here and are converted later where they are needed.
customers_df = pd.read_csv(customers_path)
products_df = pd.read_csv(products_path)
transactions_df = pd.read_csv(transactions_path)

# Preview the first rows of all three tables in one cell output.
customers_df.head(), products_df.head(), transactions_df.head()


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [9]:
# Attach product details to every transaction. The products table is used as
# a lookup, so it must be unique on ProductID: validate="many_to_one" makes
# pandas raise a MergeError if it is not, instead of silently duplicating
# transaction rows (which would inflate every per-customer sum and count
# computed from the merged frame).
transactions_products = transactions_df.merge(
    products_df,
    on="ProductID",
    validate="many_to_one",
)

# Attach customer details the same way; customers must be unique on
# CustomerID for the same reason.
merged_data = transactions_products.merge(
    customers_df,
    on="CustomerID",
    validate="many_to_one",
)

# Inspect the merged dataset. Both inputs of the first merge carry a Price
# column, so pandas' default suffixes expose them as Price_x (transactions)
# and Price_y (products).
merged_data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,HomeSense Wall Art,Home Decor,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,ActiveWear Rug,Home Decor,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,BookWorld Bluetooth Speaker,Electronics,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04


In [10]:
# Aggregating data by CustomerID to compute customer profiles
customer_features = merged_data.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    num_transactions=("TransactionID", "count"),
    num_unique_products=("ProductID", "nunique"),
    product_categories=("Category", lambda x: x.nunique()),
    region=("Region", "first"),  # Region is the same per customer
    signup_date=("SignupDate", "first"),  # Signup date is also unique per customer
).reset_index()

customer_features["signup_date"] = pd.to_datetime(customer_features["signup_date"])
customer_features["days_since_signup"] = (
    pd.Timestamp.now() - customer_features["signup_date"]
).dt.days

customer_features = pd.get_dummies(customer_features, columns=["region"], drop_first=True)

customer_features = customer_features.drop(columns=["signup_date"])

customer_features.head()


Unnamed: 0,CustomerID,total_spent,num_transactions,num_unique_products,product_categories,days_since_signup,region_Europe,region_North America,region_South America
0,C0001,3354.52,5,5,3,932,False,False,True
1,C0002,1862.74,4,4,2,1079,False,False,False
2,C0003,2725.38,4,4,3,326,False,False,True
3,C0004,5354.88,8,8,3,841,False,False,True
4,C0005,2034.24,3,3,2,896,False,False,False


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Number of lookalikes reported per customer.
TOP_N = 3

features = customer_features.drop(columns=["CustomerID"])

# Standardise every feature to zero mean / unit variance before measuring
# similarity. On the raw values, columns with large magnitudes (total_spent,
# days_since_signup) dominate the cosine and the counts and region flags
# contribute almost nothing. The cast to float also turns the boolean region
# flags into 0.0 / 1.0.
scaled_features = StandardScaler().fit_transform(features.astype(float))

similarity_matrix = cosine_similarity(scaled_features)

# Map similarity scores for the first 20 customers (C0001 - C0020)
customer_ids = customer_features["CustomerID"]
first_20_ids = customer_ids[:20]

# Positional view of the IDs: rows of similarity_matrix are positions, so the
# lookup must not depend on the Series index labels.
id_by_position = customer_ids.to_numpy()

lookalike_map = {}

for idx, cust_id in enumerate(first_20_ids):
    scores = similarity_matrix[idx]

    # Rank all customers by descending similarity; the stable sort makes ties
    # resolve deterministically by position.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the customer itself by index rather than assuming it sorts first:
    # an identical profile or float rounding can put another row ahead of it.
    similar_indices = [i for i in ranked if i != idx][:TOP_N]

    # Plain Python str/float so the stringified list written to the CSV reads
    # "('C0001', 0.98)" rather than "np.float64(0.98)" under NumPy 2.
    lookalike_map[cust_id] = [
        (str(id_by_position[i]), float(scores[i])) for i in similar_indices
    ]

lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": [str(value) for value in lookalike_map.values()],
})

# Write next to the input data. Set LOOKALIKE_DATA_DIR to redirect the output;
# when it is unset the original location is used.
output_file = os.path.join(
    os.environ.get("LOOKALIKE_DATA_DIR", r"C:\Users\saihg\Downloads"),
    "Lookalike.csv",
)
lookalike_df.to_csv(output_file, index=False)

output_file


'C:\\Users\\saihg\\Downloads\\Lookalike.csv'