**Importing libraries**

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

**Loading the datasets**

In [23]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge Datasets and Validate Data

In [24]:
# Merge Transactions with Products
if "ProductID" not in transactions.columns or "ProductID" not in products.columns:
    raise KeyError("ProductID column is missing in one of the datasets.")

transactions = transactions.merge(products, on="ProductID", how="left")

if "Price" not in transactions.columns:
    transactions["Price"] = 0

In [25]:
# Merge Transactions with Customers
customer_transactions = transactions.groupby("CustomerID").agg({
    "Category": lambda x: " ".join(x),  # Combine product categories
    "TotalValue": "sum",                # Total spending
    "TransactionID": "count"           # Transaction count
}).reset_index()

In [27]:
#  Enrich Customer Data
customer_data = customers.merge(customer_transactions, on="CustomerID", how="left").fillna({
    "Category": "",
    "TotalValue": 0,
    "TransactionID": 0
})

# Feature Engineering

In [28]:
category_encoded = customer_data["Category"].str.get_dummies(sep=" ")
region_encoded = pd.get_dummies(customer_data["Region"], prefix="region")

In [30]:
# Combine features
features = pd.concat([
    customer_data[["TotalValue", "TransactionID"]],
    category_encoded,
    region_encoded
], axis=1)

In [31]:
# Normalize numerical features
scaler = StandardScaler()
features[["TotalValue", "TransactionID"]] = scaler.fit_transform(features[["TotalValue", "TransactionID"]])

**Cosine Similarity**

In [32]:
similarity_matrix = cosine_similarity(features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_data["CustomerID"], columns=customer_data["CustomerID"])

**Generate Lookalike Recommendations for First 20 Customers**

In [33]:
lookalike_results = {}
for customer_id in customers["CustomerID"][:20]:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]  # Top 3 excluding self
    lookalike_results[customer_id] = [(sim_cust, round(score, 4)) for sim_cust, score in similar_customers.items()]


**Save Lookalike Results to CSV**

In [34]:
lookalike_data = []
for cust_id, lookalikes in lookalike_results.items():
    lookalike_data.append({
        "CustomerID": cust_id,
        "Lookalikes": str(lookalikes)  # Map<cust_id, List<cust_id, score>>
    })

# Final Ouput

In [35]:
lookalike_df = pd.DataFrame(lookalike_data)
output_path = "Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

# Final Output
print(f"{output_path} has been generated successfully.")
print("Sample Recommendations:")
print(lookalike_df.head(10))


Lookalike.csv has been generated successfully.
Sample Recommendations:
  CustomerID                                         Lookalikes
0      C0001  [('C0152', 1.0), ('C0174', 0.9938), ('C0085', ...
1      C0002  [('C0159', 0.9799), ('C0134', 0.9595), ('C0043...
2      C0003  [('C0031', 0.9871), ('C0129', 0.981), ('C0158'...
3      C0004  [('C0012', 0.9889), ('C0102', 0.9403), ('C0113...
4      C0005  [('C0007', 0.9922), ('C0140', 0.987), ('C0177'...
5      C0006  [('C0187', 0.9771), ('C0048', 0.8925), ('C0076...
6      C0007  [('C0005', 0.9922), ('C0140', 0.9786), ('C0177...
7      C0008  [('C0109', 0.9831), ('C0098', 0.9538), ('C0194...
8      C0009  [('C0198', 0.9866), ('C0132', 0.785), ('C0074'...
9      C0010  [('C0132', 0.9774), ('C0061', 0.9699), ('C0074...
