# Task 2: Lookalike Model

In [2]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Locations of the three input tables inside the Kaggle dataset mount.
products_csv = "/kaggle/input/assignment/data/Products.csv"
customers_csv = "/kaggle/input/assignment/data/Customers.csv"
transactions_csv = "/kaggle/input/assignment/data/Transactions.csv"

# Read each CSV into its own DataFrame (products, then customers, then transactions).
products, customers, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (products_csv, customers_csv, transactions_csv)
)

In [4]:
# Join every transaction with the profile of the customer who made it
# (matched on CustomerID; pandas' default join type is used).
customer_transactions = pd.merge(transactions, customers, on='CustomerID')

# Quick sanity check of the joined table: its dimensions and column names.
merged_shape = customer_transactions.shape
merged_columns = customer_transactions.columns
print(merged_shape, "\t", merged_columns)

(1000, 10) 	 Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price', 'CustomerName', 'Region',
       'SignupDate'],
      dtype='object')


* #### Combining the data of each customer (aggregating their transactions into per-customer totals and counts)

In [5]:
# Aggregate transaction data per customer.
# The aggregation reads from the raw `transactions` table instead of from
# `customer_transactions` itself, so re-running this cell is idempotent.
# (Aggregating the already-aggregated frame a second time would silently turn
# the unique-product and transaction counts into 1 for every customer.)
# Every aggregated column comes from `transactions`; customers missing from
# `customers` are removed by the inner merge below.
customer_transactions = (
    transactions
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',       # sum of TotalValue over the customer's transactions
        'Quantity': 'sum',         # sum of Quantity over the customer's transactions
        'ProductID': 'nunique',    # number of distinct products purchased
        'TransactionID': 'count',  # number of transactions
    })
    .reset_index()
)

# Merge the per-customer aggregates with the customer information on CustomerID.
# NOTE: after this merge the 'ProductID' and 'TransactionID' columns hold counts, not IDs.
customer_data = customers.merge(customer_transactions, on='CustomerID')

# One-hot encode the categorical feature (Region).
encoder = OneHotEncoder(sparse_output=False)
region_encoded = encoder.fit_transform(customer_data[['Region']])

# Standardise the numeric features (zero mean, unit variance per column).
# Without this, the column with the largest magnitude dominates every feature
# vector and the 0/1 region indicators have almost no effect on cosine similarity.
numeric_cols = ['TotalValue', 'Quantity', 'ProductID', 'TransactionID']
numeric_values = customer_data[numeric_cols].values.astype(float)
numeric_std = numeric_values.std(axis=0)
numeric_std[numeric_std == 0] = 1.0  # constant column: avoid division by zero
numeric_scaled = (numeric_values - numeric_values.mean(axis=0)) / numeric_std

# Combine the encoded region data with the scaled numeric features
# (one row per customer, in `customer_data` row order).
customer_features = np.hstack([region_encoded, numeric_scaled])

In [None]:
# Pairwise cosine similarity between all customer feature vectors.
similarity_matrix = cosine_similarity(customer_features)

# Label rows and columns with CustomerID so scores can be looked up by customer.
similarity_data = pd.DataFrame(similarity_matrix, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

lookalike_map = {}

# Recommendations are produced for the first 20 customers in `customer_data` row order.
for cust_id in customer_data['CustomerID'][:20]:

    similarity_scores = similarity_data[cust_id]
    # Remove the customer's own entry by label rather than assuming it sorts
    # first: with ties or floating-point round-off another customer can rank
    # ahead of self, and a positional `[1:]` would then keep self in the list
    # and discard a genuine match. Then take the 3 highest remaining scores.
    sorted_similarities = similarity_scores.drop(cust_id).nlargest(3)
    # Store plain Python floats so the CSV cell contains bare numbers rather
    # than NumPy scalar reprs.
    lookalike_map[cust_id] = [
        (other_id, float(score)) for other_id, score in sorted_similarities.items()
    ]

# One row per customer: the CustomerID and its list of (lookalike_id, score) pairs.
lookalike_data = pd.DataFrame(list(lookalike_map.items()), columns=['CustomerID', 'Lookalikes'])

lookalike_data.to_csv('Lookalike.csv', index=False)