In [None]:
# Task 2: Lookalike Model for eCommerce Transactions Dataset

# Importing necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Load the datasets
# Each CSV is read (in this order) into its own DataFrame; the paths are
# relative, so the files must sit in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Step 2: Data Preprocessing
# Attach customer and product attributes to every transaction row
# (default inner joins on the shared key columns).
merged_data = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(merged_data, products, on="ProductID")


def _unique_products(product_ids):
    """Return the distinct product IDs of one customer, in first-seen order."""
    return list(product_ids.unique())


# Collapse the transactions to one row per customer: total units bought,
# total amount spent, and the list of distinct products purchased.
aggregation_spec = {
    "Quantity": "sum",
    "TotalValue": "sum",
    "ProductID": _unique_products,
}
per_customer_totals = merged_data.groupby("CustomerID").agg(aggregation_spec)
customer_transactions = per_customer_totals.reset_index()
print(customer_transactions.head())

# Left-join onto the full customer list so that customers without any
# transaction still get a row. Every missing value is then replaced by 0;
# note this applies to all columns, so such customers get 0 (not an empty
# list) in ProductID.
customer_transactions = pd.merge(
    customers, customer_transactions, on="CustomerID", how="left"
).fillna(0)

# Step 3: One-Hot Encoding the Region column
# Turn the categorical Region column into one indicator column per region.
region_column = customers[["Region"]]
encoder = OneHotEncoder()
region_sparse = encoder.fit_transform(region_column)
region_encoded = region_sparse.toarray()

# Wrap the dense indicator matrix in a DataFrame with readable column names
region_column_names = encoder.get_feature_names_out(["Region"])
region_encoded_df = pd.DataFrame(region_encoded, columns=region_column_names)

# Assemble the feature table column-wise: customer ID, region indicators and
# the aggregated purchase totals. pd.concat aligns the pieces on their row
# index; any value left missing after alignment is replaced by 0.
feature_parts = [
    customers["CustomerID"],
    region_encoded_df,
    customer_transactions[["Quantity", "TotalValue"]],
]
customer_features = pd.concat(feature_parts, axis=1).fillna(0)

# Step 4: Standardizing Numerical Features
# Quantity and TotalValue are rescaled with StandardScaler so that neither
# dominates the similarity computation because of its raw magnitude.
numerical_features = ["Quantity", "TotalValue"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values

# Step 5: Calculating Cosine Similarity
# Move CustomerID into the index so only numeric feature columns remain,
# then compute the pairwise cosine similarity between all customer rows.
customer_features = customer_features.set_index("CustomerID")
similarity_matrix = cosine_similarity(customer_features)

# Step 6: Generating Recommendations for the First 20 Customers
# For each of the first NUM_TARGET_CUSTOMERS customers, keep the
# TOP_N_LOOKALIKES other customers with the highest cosine similarity.
NUM_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

lookalike_recommendations = {}
# Row i of similarity_matrix corresponds to customer_ids[i]; both follow the
# row order of the customers DataFrame.
customer_ids = customers["CustomerID"].tolist()

for idx, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Drop the customer's own entry by index rather than assuming it sorts
    # first: a customer with an identical feature vector can tie with (or,
    # through rounding, outrank) the self-similarity, and an all-zero feature
    # row has a self-similarity of 0. Skipping "the first result" would then
    # recommend the customer to themselves and discard a genuine match.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # sorted() is stable, so equally similar customers keep their original
    # (ascending index) order.
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    top_matches = top_matches[:TOP_N_LOOKALIKES]

    # Map row positions back to customer IDs, keeping the similarity score
    lookalike_recommendations[customer_id] = [
        (customer_ids[other_idx], score) for other_idx, score in top_matches
    ]

# Step 7: Saving Recommendations to a CSV File
# Flatten {customer: [(lookalike_id, score), ...]} into one row per
# (customer, lookalike) pair so it can be stored as a flat table.
lookalike_data = [
    [cust_id, lookalike_id, score]
    for cust_id, recommendations in lookalike_recommendations.items()
    for lookalike_id, score in recommendations
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=["CustomerID", "LookalikeID", "SimilarityScore"],
)

# The path is kept in one variable so the status message below always names
# the file that was actually written (it previously reported a different
# file name than the one passed to to_csv).
output_path = "Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

# Output the recommendations
print(f"Lookalike recommendations saved to '{output_path}'.")
print(lookalike_df.head())



  CustomerID  Quantity  TotalValue  \
0      C0001        12     3354.52   
1      C0002        10     1862.74   
2      C0003        14     2725.38   
3      C0004        23     5354.88   
4      C0005         7     2034.24   

                                          ProductID  
0                    [P054, P022, P096, P083, P029]  
1                          [P095, P004, P019, P071]  
2                          [P025, P006, P035, P002]  
3  [P049, P053, P038, P025, P097, P024, P008, P077]  
4                                [P025, P039, P012]  
Lookalike recommendations saved to 'Abilash_K S_Lookalike.csv'.
  CustomerID LookalikeID  SimilarityScore
0      C0001       C0107         0.989475
1      C0001       C0137         0.987899
2      C0001       C0184         0.987695
3      C0002       C0088         0.996018
4      C0002       C0142         0.987820
