# Import Libraries

In [27]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Read the CSV datasets and create the merged dataset

In [11]:
# Load the three raw tables; the paths are relative to the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach customer attributes, then product attributes, to every transaction.
# Both joins are left joins, so each transaction row is kept even when its
# CustomerID / ProductID has no match in the lookup table.
merged_df = transactions.merge(customers, how="left", on="CustomerID")
merged_df = merged_df.merge(products, how="left", on="ProductID")

# Prepare aggregated features per customer

In [13]:
def _most_frequent(values):
    """Return the most common non-null value in ``values`` (NaN if there is none)."""
    counts = values.value_counts()  # value_counts() drops NaN by default
    # idxmax() raises ValueError on an empty Series, which happens when every
    # value in the group is missing (e.g. a ProductID without a match in the
    # left join leaves Category as NaN for all of a customer's rows).
    return counts.idxmax() if not counts.empty else float("nan")


# One row per customer: summed TotalValue, summed Quantity, the customer's
# region, and the category that appears in the most of their transactions.
features_df = (
    merged_df.groupby("CustomerID")
    .agg(
        {
            "TotalValue": "sum",
            "Quantity": "sum",
            "Region": "first",  # first non-null Region within the group
            "Category": _most_frequent,
        }
    )
    .reset_index()
)

# One-hot encode Region and Category so they can take part in the distance
# computation; rows with a missing value get all-zero indicator columns.
features_df = pd.get_dummies(features_df, columns=["Region", "Category"])

In [17]:
# Preview the first five customer feature rows.
# NOTE: the saved output was captured after the scaling cell had already run
# (execution count 17 vs. 14), so it shows min-max-scaled values.
features_df.head(n=5)

Unnamed: 0,CustomerID,TotalValue,Quantity,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,0.308942,0.354839,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,C0002,0.168095,0.290323,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,C0003,0.249541,0.419355,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,C0004,0.497806,0.709677,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,C0005,0.184287,0.193548,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Scale numerical features to a common [0, 1] range so that no single feature dominates the distance metric

In [14]:
# Columns fed to the similarity model: the two numeric aggregates followed by
# every one-hot Region_* column and then every one-hot Category_* column.
num_cols = ["TotalValue", "Quantity"]
for prefix in ("Region_", "Category_"):
    num_cols.extend(name for name in features_df.columns if prefix in name)

# Rescale each selected column to the [0, 1] range, overwriting it in place.
scaler = MinMaxScaler()
features_df[num_cols] = scaler.fit_transform(features_df[num_cols])

# Model Fitting and Prediction

### Use an unsupervised Nearest Neighbors model to measure customer similarity (`n_neighbors=4` returns the customer itself plus its 3 most similar customers)

In [15]:
# Euclidean nearest-neighbour index over the scaled feature columns.
# Four neighbours are requested by default because a query for a customer
# that is part of the fitted data can return that customer as its own match.
knn = NearestNeighbors(metric="euclidean", n_neighbors=4)
knn.fit(features_df[num_cols])

### Get top 3 lookalike customers for each customer

In [16]:
def get_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers closest to ``customer_id``.

    Closeness is the distance reported by the fitted ``knn`` model over the
    ``num_cols`` columns of ``features_df`` (smaller distance = more similar).

    Parameters
    ----------
    customer_id
        Value to look up in ``features_df["CustomerID"]``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of tuple
        ``(CustomerID, distance)`` pairs sorted by ascending distance, never
        containing the queried row itself, with at most ``top_n`` entries.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    IndexError
        If ``customer_id`` has no row in ``features_df``.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Work with row *positions*: kneighbors() returns positions in the fitted
    # matrix, which only coincide with index labels for a fresh RangeIndex.
    positions = (features_df["CustomerID"] == customer_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        # Same exception type the bare ``.index[0]`` lookup used to raise,
        # but with a message that names the actual problem.
        raise IndexError(f"CustomerID {customer_id!r} not found in features_df")
    row_pos = positions[0]

    # Ask for one extra neighbour because the query row is itself part of the
    # fitted data; never ask for more rows than the model was fitted on.
    n_query = min(top_n + 1, knn.n_samples_fit_)
    distances, indices = knn.kneighbors(
        features_df.iloc[[row_pos]][num_cols], n_neighbors=n_query
    )

    customer_ids = features_df["CustomerID"]
    lookalike_customers = [
        (customer_ids.iloc[pos], dist)
        for dist, pos in zip(distances[0], indices[0])
        if pos != row_pos  # exclude the target itself
    ]
    lookalike_customers.sort(key=lambda pair: pair[1])  # nearest first
    return lookalike_customers[:top_n]

### Generate Lookalike.csv for first 20 customers

In [31]:
# Only customers with a row in features_df can be queried; features_df is
# built from the transactions, so a customer without any has no row there.
known_customer_ids = set(features_df["CustomerID"])
first_20_customers = customers["CustomerID"].head(20).tolist()

lookalike_records = []
for cust_id in first_20_customers:
    # Emit an empty lookalike list instead of crashing for customers that
    # have no feature row.
    neighbors = get_lookalikes(cust_id) if cust_id in known_customer_ids else []
    # Map distance d >= 0 to a similarity score 1 / (1 + d) in (0, 1].
    lookalike_tuples = [
        (neighbor_id, float(1 / (1 + dist))) for neighbor_id, dist in neighbors
    ]
    lookalike_records.append({"CustomerID": cust_id, "Lookalikes": lookalike_tuples})

# Create DataFrame and save to CSV
lookalike_df = pd.DataFrame(lookalike_records)
# "Lookalikes" is written as text, so to_csv's float_format never reaches the
# scores inside it; format them to six decimals explicitly while building the
# "[(ID, score), ...]" string.
lookalike_df["Lookalikes"] = lookalike_df["Lookalikes"].apply(
    lambda pairs: "["
    + ", ".join(f"({neighbor_id}, {score:.6f})" for neighbor_id, score in pairs)
    + "]"
)
lookalike_df.to_csv("Aritra_Bhaduri_Lookalike.csv", index=False, float_format="%.6f")