In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Location of the customers CSV (a Google Drive direct-download link).
Customers_file_url = "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"

# Read the remote CSV straight into a DataFrame.
customers = pd.read_csv(filepath_or_buffer=Customers_file_url)

# Show the first five rows as a quick sanity check of the load.
customers_preview = customers.head(n=5)
print(customers_preview)

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15


In [4]:
# Google Drive direct-download links: file1_url is read into `products` here;
# file2_url is only defined in this cell and is consumed by a later cell.
file1_url = "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
file2_url = "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

# Load the product catalogue and preview its first five rows.
products = pd.read_csv(filepath_or_buffer=file1_url)
products_preview = products.head(n=5)
print(products_preview)

  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


In [5]:
# Load the transactions table from the URL defined in the previous cell
# and preview its first five rows.
transactions = pd.read_csv(filepath_or_buffer=file2_url)
transactions_preview = transactions.head(n=5)
print(transactions_preview)

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  


In [6]:
# Merge Datasets
# Enrich every transaction with its customer attributes and then its product
# attributes. Left joins keep every transaction row even if a lookup is missing.
#
# validate="many_to_one" makes pandas raise a MergeError if CustomerID /
# ProductID is duplicated in the lookup table. Without it a duplicated key
# would silently duplicate transaction rows and inflate every per-customer
# sum and mean computed from merged_data.
#
# Note: both `transactions` and `products` carry a `Price` column (see the
# previews above), so after the second merge they appear as Price_x / Price_y.
merged_data = pd.merge(
    transactions, customers, on="CustomerID", how="left", validate="many_to_one"
)
merged_data = pd.merge(
    merged_data, products, on="ProductID", how="left", validate="many_to_one"
)

Feature Engineering

In [7]:
# Total Spending Per Customer
# One row per CustomerID; TotalSpending is the sum of TotalValue over all of
# that customer's transactions.
customer_spending = merged_data.groupby("CustomerID", as_index=False).agg(
    TotalSpending=("TotalValue", "sum")
)

In [8]:
# Average Transaction Value
# One row per CustomerID; AvgTransactionValue is the mean TotalValue across
# that customer's transactions.
avg_transaction_value = merged_data.groupby("CustomerID", as_index=False).agg(
    AvgTransactionValue=("TotalValue", "mean")
)

In [9]:
# Total Products Purchased
# One row per CustomerID; TotalProducts is the summed Quantity over all of
# that customer's transactions.
total_products = merged_data.groupby("CustomerID", as_index=False).agg(
    TotalProducts=("Quantity", "sum")
)

In [10]:
# Merge Features into Customer Profile
# Inner-join the three per-customer aggregates on CustomerID so that each
# customer ends up with a single row holding all numeric features.
customer_features = customer_spending.merge(
    avg_transaction_value, on="CustomerID"
).merge(
    total_products, on="CustomerID"
)


In [11]:
# Adding Region as a One-Hot Encoded Feature
# Expand Region into one indicator column per distinct value (Region_<value>).
region_encoded = pd.get_dummies(customers[["CustomerID", "Region"]], columns=["Region"])

# This cell rebuilds `customer_features` from itself, so re-running it used to
# merge the indicators a second time and leave Region_*_x / Region_*_y
# duplicates, which the scaling cell then feeds into the similarity. Dropping
# any indicator columns left from a previous run makes the cell idempotent;
# on the first run nothing matches and the drop is a no-op.
region_columns = [col for col in region_encoded.columns if col != "CustomerID"]
customer_features = pd.merge(
    customer_features.drop(columns=region_columns, errors="ignore"),
    region_encoded,
    on="CustomerID",
    how="left",
)

In [14]:
# Scale Features
# Every column after the first (CustomerID) is treated as a feature; this
# includes the Region indicator columns merged in by the previous cell.
feature_matrix = customer_features.iloc[:, 1:]

# Fit the scaler on the feature matrix, then transform that same matrix.
scaler = StandardScaler().fit(feature_matrix)
scaled_features = scaler.transform(feature_matrix)


**Calculate Similarity**

In [16]:
# Cosine Similarity
# Pairwise cosine similarity between all rows of the scaled feature matrix;
# entry [i, j] compares customer i with customer j.
similarity_matrix = cosine_similarity(X=scaled_features)


In [17]:
# Create Lookalike Recommendations
# customer_ids follows the row order of customer_features, so position k in
# this list corresponds to row/column k of the similarity matrix.
customer_ids = customer_features.loc[:, "CustomerID"].to_list()

# Filled by the next cell: CustomerID -> list of (similar CustomerID, score).
lookalike_map = dict()


In [18]:
for row_idx, current_id in enumerate(customer_ids):
    # Row of similarity scores between the current customer and everyone.
    row_scores = similarity_matrix[row_idx]

    # Pair every *other* customer with their score (skip the self-match).
    candidates = []
    for col_idx, sim in enumerate(row_scores):
        if col_idx == row_idx:
            continue
        candidates.append((customer_ids[col_idx], sim))

    # Highest similarity first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the three best matches for this customer.
    lookalike_map[current_id] = candidates[:3]



Generate Lookalike.csv

In [19]:
# ------------------------------
# Generate Lookalike.csv
# ------------------------------

# Flatten the map into one record per (customer, recommended lookalike) pair,
# then build the output DataFrame from those records in a single step.
lookalike_data = [
    {"CustomerID": customer_id, "SimilarCustomerID": similar_customer, "Score": score}
    for customer_id, recommendations in lookalike_map.items()
    for similar_customer, score in recommendations
]

lookalike_df = pd.DataFrame(lookalike_data)



In [20]:
# Save the First 20 Customers to Lookalike.csv
# "First 20" means the first 20 entries of customer_ids, i.e. the first 20
# rows of customer_features.
first_20_ids = customer_ids[:20]
in_first_20 = lookalike_df["CustomerID"].isin(first_20_ids)

lookalike_filtered = lookalike_df.loc[in_first_20]
lookalike_filtered.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv generated successfully!")

Lookalike.csv generated successfully!


**Output for Verification**

In [21]:
# ------------------------------
# Output for Verification
# ------------------------------

# Print the stored top-3 lookalikes for the first 20 entries of customer_ids.
# Each customer's report is assembled as a list of lines and emitted with one
# print call, followed by a blank separator line.
for customer_id in customer_ids[:20]:
    report_lines = [f"CustomerID: {customer_id}", "Top 3 Similar Customers:"]
    report_lines += [
        f"   - {similar_customer}: {score:.4f}"
        for similar_customer, score in lookalike_map[customer_id]
    ]
    print("\n".join(report_lines), end="\n\n")

CustomerID: C0001
Top 3 Similar Customers:
   - C0137: 0.9960
   - C0107: 0.9890
   - C0152: 0.9846

CustomerID: C0002
Top 3 Similar Customers:
   - C0088: 0.9963
   - C0142: 0.9846
   - C0043: 0.9682

CustomerID: C0003
Top 3 Similar Customers:
   - C0190: 0.9791
   - C0001: 0.9671
   - C0133: 0.9607

CustomerID: C0004
Top 3 Similar Customers:
   - C0113: 0.9877
   - C0102: 0.9687
   - C0012: 0.9674

CustomerID: C0005
Top 3 Similar Customers:
   - C0159: 0.9978
   - C0146: 0.9897
   - C0186: 0.9872

CustomerID: C0006
Top 3 Similar Customers:
   - C0168: 0.9715
   - C0187: 0.9690
   - C0171: 0.9594

CustomerID: C0007
Top 3 Similar Customers:
   - C0140: 0.9753
   - C0092: 0.9709
   - C0193: 0.9589

CustomerID: C0008
Top 3 Similar Customers:
   - C0034: 0.9456
   - C0024: 0.9142
   - C0194: 0.9107

CustomerID: C0009
Top 3 Similar Customers:
   - C0014: 0.9958
   - C0119: 0.9854
   - C0060: 0.9837

CustomerID: C0010
Top 3 Similar Customers:
   - C0019: 0.9869
   - C0172: 0.9810
   - C0199