# Importing necessary libraries

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Loading and merging the datasets

In [6]:
# Locations of the three raw CSV inputs.
transactions_path, products_path, customers_path = (
    'Transactions (1).csv',
    'Products.csv',
    'Customers.csv',
)

# Read each file into its own frame, in the same order as the paths above.
transactions_df, products_df, customers_df = (
    pd.read_csv(csv_path)
    for csv_path in (transactions_path, products_path, customers_path)
)

# Join transactions to their product rows first, then attach the customer rows,
# producing one wide table for the downstream analysis. Both joins use the
# default inner join, and any column name shared by both sides of a join gets
# the default `_x` / `_y` suffix.
merged_df = pd.merge(
    pd.merge(transactions_df, products_df, on='ProductID'),
    customers_df,
    on='CustomerID',
)

# Applying Feature Engineering

In [7]:
# Collapse the merged table to one row per customer. Output columns keep the
# names of the source columns so later cells can refer to them unchanged.
customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg(
        # Sum of TotalValue over all of the customer's rows.
        TotalValue=('TotalValue', 'sum'),
        # Sum of Quantity over all of the customer's rows.
        Quantity=('Quantity', 'sum'),
        # Category that appears on the most rows for this customer
        # (value_counts sorts by frequency, so index[0] is the top one).
        Category=('Category', lambda cats: cats.value_counts().index[0]),
        # Mean of Price_x; presumably the transaction-side Price column that
        # was suffixed during the merge -- verify against the input schemas.
        Price_x=('Price_x', 'mean'),
    )
    .reset_index()
)

# Applying one-hot encoding, scaling the features, and building lookalike recommendations

In [8]:
# One-hot encode the dominant category so every feature column is numeric.
# NOTE: this rebinds `customer_features`, so the cell cannot be re-run on its
# own without first re-running the aggregation cell that creates 'Category'.
customer_features = pd.get_dummies(customer_features, columns=['Category'])

# Standardize the features (all columns except the ID) for similarity calculation
scaler = StandardScaler()
customer_data_scaled = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Pairwise cosine similarity: row i / column j compares customer i with customer j
similarity_matrix = cosine_similarity(customer_data_scaled)

# Create lookalike recommendations for the first 20 customers:
# maps CustomerID -> list of (other CustomerID, similarity rounded to 2 dp)
lookalikes = {}
customer_ids = customer_features['CustomerID'].values

for i, customer_id in enumerate(customer_ids[:20]):
    # Pair every *other* customer's row index with its similarity score.
    # The customer itself is dropped by index instead of by "skip the first
    # sorted entry": the sort is stable, so a lower-indexed customer tying at
    # the top score would take slot 0 and be discarded, leaving the query
    # customer listed as its own lookalike.
    similarity_scores = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # Highest similarity first
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Keep the 3 best matches. float() converts the NumPy scalar to a built-in
    # float so the text form of the list reads "0.98" rather than
    # "np.float64(0.98)" under NumPy 2.
    top_3 = [
        (customer_ids[idx], round(float(score), 2))
        for idx, score in sorted_scores[:3]
    ]
    lookalikes[customer_id] = top_3


# Saving into CSV file

In [9]:
# One output row per customer: the ID and the text form of its lookalike list.
lookalike_df = pd.DataFrame(
    [(cust_id, str(matches)) for cust_id, matches in lookalikes.items()],
    columns=['CustomerID', 'Lookalikes'],
)
# Write without the positional index so the file holds only the two columns.
lookalike_df.to_csv('Aranyak_Banerjee_Lookalike.csv', index=False)
