In [2]:
# Import required libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Merge datasets
data = transactions.merge(customers, on="CustomerID", how="left")
data = data.merge(products, on="ProductID", how="left")

# Feature Engineering
# Aggregate transaction data for each customer
customer_features = data.groupby("CustomerID").agg(
    total_spending=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    avg_spending=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
    total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
    preferred_category=pd.NamedAgg(column="Category", aggfunc=lambda x: x.mode()[0] if not x.mode().empty else None)
).reset_index()

# Encode categorical features
customer_features = customer_features.merge(customers[["CustomerID", "Region"]], on="CustomerID", how="left")
customer_features = pd.get_dummies(customer_features, columns=["preferred_category", "Region"], drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
numerical_cols = ["total_spending", "avg_spending", "total_transactions"]
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Prepare data for similarity computation
# Set "CustomerID" as the index but do not attempt to drop it afterward
feature_matrix = customer_features.set_index("CustomerID")

# Compute the similarity matrix
similarity_matrix = cosine_similarity(feature_matrix)

# Create a DataFrame to hold the similarity matrix with CustomerID as both rows and columns
customer_ids = customer_features["CustomerID"].tolist()
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)


# Generate Lookalike recommendations for customers C0001 to C0020
lookalikes = {}
for customer_id in customer_ids[:20]:
    # Sort similarity scores in descending order, excluding the customer itself
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Save lookalikes to Lookalike.csv
lookalike_df = pd.DataFrame([
    {"cust_id": cust_id, "lookalikes": str(lookalike_list)}
    for cust_id, lookalike_list in lookalikes.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

# Display a sample of Lookalike.csv
print(lookalike_df.head(10))

# Save as an output file
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike recommendations saved as 'Lookalike.csv'")


  cust_id                                         lookalikes
0   C0001  [('C0190', 0.968215451295126), ('C0048', 0.941...
1   C0002  [('C0088', 0.9544024281819798), ('C0134', 0.92...
2   C0003  [('C0052', 0.9847977904024423), ('C0152', 0.92...
3   C0004  [('C0165', 0.9738306048704476), ('C0155', 0.95...
4   C0005  [('C0186', 0.9717374531257306), ('C0146', 0.94...
5   C0006  [('C0168', 0.978264001321209), ('C0171', 0.938...
6   C0007  [('C0140', 0.9798101409546568), ('C0115', 0.92...
7   C0008  [('C0139', 0.9135397162249627), ('C0194', 0.87...
8   C0009  [('C0010', 0.9760669630706748), ('C0198', 0.95...
9   C0010  [('C0009', 0.9760669630706748), ('C0111', 0.97...
Lookalike recommendations saved as 'Lookalike.csv'
