In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


# Load data

In [2]:
# Read the three source tables, in the order customers -> products ->
# transactions. The file names are relative, so they resolve against the
# kernel's current working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Merge datasets

In [3]:
# Enrich every transaction with its product's columns, then with the buying
# customer's columns. Both joins are left joins anchored on the transaction
# table, so every transaction row is kept even when its ProductID or
# CustomerID has no match in the lookup table.
transactions_products = transactions_df.merge(products_df, how="left", on="ProductID")
full_data = transactions_products.merge(customers_df, how="left", on="CustomerID")


# Feature engineering

In [4]:
# Per-customer spending features, all derived from one shared grouping.
transactions_by_customer = full_data.groupby("CustomerID")

# Sum of TotalValue over the customer's rows.
customer_total_spend = transactions_by_customer["TotalValue"].sum()
# Number of non-null TransactionID values for the customer.
customer_purchase_frequency = transactions_by_customer["TransactionID"].count()
# Total spend divided by the number of transactions.
customer_avg_spend = customer_total_spend.div(customer_purchase_frequency)


# One-hot encode product categories and total them per customer

In [5]:
# One indicator column per distinct Category value (named "Category_<value>"),
# with one row per row of full_data.
category_dummies = pd.get_dummies(full_data["Category"], prefix="Category")
indicator_columns = category_dummies.columns

# Place the indicators next to the original columns so they can be grouped
# by CustomerID.
full_data_with_dummies = pd.concat(objs=[full_data, category_dummies], axis="columns")

# Summing the indicators within each customer yields, per category, the
# number of that customer's rows falling in the category.
category_preferences = (
    full_data_with_dummies
    .groupby("CustomerID")[indicator_columns]
    .sum()
)

# Combine all features

In [6]:
# Assemble the three spend series into a frame (they share the CustomerID
# index produced by the groupby), then attach the per-category totals by
# joining on that same index.
spend_features = {
    "TotalSpend": customer_total_spend,
    "PurchaseFrequency": customer_purchase_frequency,
    "AvgSpend": customer_avg_spend,
}
spend_profile = pd.DataFrame(spend_features)
customer_profiles = spend_profile.join(category_preferences)

# Normalize features

In [7]:
# Standardize each feature column with StandardScaler. fit_transform returns
# a bare array, so the customer index and feature names are re-attached
# afterwards.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profiles)

customer_profiles_normalized = pd.DataFrame(
    data=scaled_values,
    index=customer_profiles.index,
    columns=customer_profiles.columns,
)

# Compute cosine similarity

In [8]:
# Pairwise cosine similarity between all customer feature rows. The result is
# wrapped in a DataFrame labelled by customer on both axes so that scores can
# be looked up by ID.
customer_ids = customer_profiles_normalized.index
similarity_matrix = cosine_similarity(customer_profiles_normalized)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Find top 3 lookalikes for the first 20 customers (C0001–C0020)

In [9]:
def get_top_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up. It must be present in
            the index of the similarity table.
        top_n: Number of lookalikes to return (default 3).
        similarity: Optional square DataFrame of pairwise scores whose index
            and columns are customer IDs. When omitted, the notebook-level
            ``similarity_df`` is used.

    Returns:
        A list of ``(other_customer_id, score)`` tuples ordered from the
        highest to the lowest score, with each ``score`` a built-in ``float``.

    Raises:
        KeyError: If ``customer_id`` is not in the similarity table.
    """
    if similarity is None:
        similarity = similarity_df

    # The customer's row also holds its similarity to itself; drop that entry
    # so a customer is never reported as its own lookalike.
    scores = similarity.loc[customer_id].drop(customer_id)

    # Rank once and reuse the result for both the IDs and the scores.
    top_matches = scores.nlargest(top_n)

    # Convert to plain floats: inside a list, NumPy 2 scalars stringify as
    # "np.float64(0.97)", which would leak into any str()/CSV rendering of
    # the result.
    return [(other_id, float(score)) for other_id, score in top_matches.items()]

# Build the lookalike map for customers C0001 through C0020.
FIRST_CUSTOMER_NUMBER = 1
LAST_CUSTOMER_NUMBER = 20
target_customer_ids = [
    f"C{number:04d}"  # zero-padded to four digits, e.g. "C0007"
    for number in range(FIRST_CUSTOMER_NUMBER, LAST_CUSTOMER_NUMBER + 1)
]

# similarity_df only has rows for customers that appear in the transaction
# data, so looking up anyone else would raise KeyError and abort the cell.
# Such customers are reported and mapped to an empty list instead.
customers_without_profile = [
    customer_id
    for customer_id in target_customer_ids
    if customer_id not in similarity_df.index
]
if customers_without_profile:
    print(
        f"No similarity profile for {customers_without_profile}; "
        "using empty lookalike lists."
    )

lookalike_map = {
    customer_id: (
        get_top_lookalikes(customer_id)
        if customer_id in similarity_df.index
        else []
    )
    for customer_id in target_customer_ids
}

# Save lookalike map to CSV

In [10]:
# One output row per customer: the ID, plus the string form of that
# customer's list of (lookalike_id, score) tuples.
lookalike_rows = {
    "CustomerID": list(lookalike_map),
    "Lookalikes": [str(matches) for matches in lookalike_map.values()],
}
lookalike_df = pd.DataFrame(lookalike_rows)

# index=False keeps the positional row index out of the file.
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv generated successfully.")

Lookalike.csv generated successfully.
