# **TASK 2 : Lookalike Model**

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

In [2]:
# Step 1: Load the data
# Locations of the three input CSV files (relative paths).
customers_path, products_path, transactions_path = (
    "Customers.csv",
    "Products.csv",
    "Transactions.csv",
)

# Read each file into its own DataFrame, in the same order as the paths above.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (customers_path, products_path, transactions_path)
)

In [3]:
# Step 2: Preprocess data
# Turn the categorical Region column into integer codes (overwrites the column
# on the `customers` frame itself).
label_encoder = LabelEncoder()
customers['Region'] = label_encoder.fit_transform(customers['Region'])

# Per-customer summary of the transaction history: output column -> (source column, reducer).
aggregations = {
    'total_spent': ('TotalValue', 'sum'),
    'total_quantity': ('Quantity', 'sum'),
    'avg_price': ('Price', 'mean'),
    'transaction_count': ('TransactionID', 'count'),
}
customer_transactions = (
    transactions
    .groupby('CustomerID')
    .agg(**aggregations)
    .reset_index()
)

# Left join keeps every customer; those without any transaction get NaN in the
# aggregated columns, which is then replaced by 0.
customer_data = customers.merge(customer_transactions, on='CustomerID', how='left')
customer_data = customer_data.fillna(0)

In [4]:
# Step 3: Standardize numerical features
# Columns produced by the transaction aggregation; each is rescaled by
# StandardScaler and written back over the original values.
numerical_features = [
    'total_spent',
    'total_quantity',
    'avg_price',
    'transaction_count',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_data[numerical_features])
customer_data[numerical_features] = scaled_values

In [5]:
# Step 4: Calculate similarity using cosine similarity
# Region is a nominal category, so it is one-hot encoded here instead of being
# fed to the similarity as a single integer code: an integer code would imply an
# ordering between regions and, being unscaled, would distort every customer's
# vector by an arbitrary amount. get_dummies works whether the column currently
# holds integer codes or the raw region names.
feature_columns = ['Region', 'total_spent', 'total_quantity', 'avg_price', 'transaction_count']
customer_features = pd.get_dummies(
    customer_data[feature_columns],
    columns=['Region'],
    dtype=float,
)

# Row i / column j holds the cosine similarity between the customers in
# rows i and j of customer_data (same row order).
similarity_matrix = cosine_similarity(customer_features)

In [6]:
# Step 5: Find top 3 similar customers for the first 20 customers
N_TARGET_CUSTOMERS = 20  # how many customers (from the top of customer_data) get recommendations
TOP_K = 3                # how many lookalikes to keep per customer

# Work purely with row positions: similarity_matrix is a plain array whose rows
# follow customer_data's row order, so positions stay correct even if the
# DataFrame index is not 0..n-1.
customer_ids = customer_data['CustomerID'].to_numpy()

lookalike_results = {}
first_20_customers = customer_data['CustomerID'].iloc[:N_TARGET_CUSTOMERS]
for idx, customer_id in enumerate(first_20_customers):
    # Similarity of this customer to every customer (including itself)
    scores = similarity_matrix[idx]
    # Positions ordered by descending score; the stable sort keeps ties in
    # their original row order.
    ranked_positions = np.argsort(-scores, kind='stable')
    # Drop the customer itself, then keep the best TOP_K matches
    top_positions = [pos for pos in ranked_positions if pos != idx][:TOP_K]
    lookalike_results[customer_id] = [
        (customer_ids[pos], scores[pos]) for pos in top_positions
    ]

In [7]:
# Step 6: Save results to CSV
# One row per target customer: its ID plus the list of (lookalike_id, score) pairs.
output_path = "PYDIMUDDALA_YAGNITHA_Lookalike.csv"

lookalike_list = [
    {
        "cust_id": customer_id,
        # Cast scores to built-in floats so the stringified list in the CSV
        # contains plain numbers rather than NumPy scalar reprs such as
        # "np.float64(0.97)" (the repr used by NumPy >= 2).
        "similarity_score": [
            (similar_id, float(score)) for similar_id, score in similar_customers
        ],
    }
    for customer_id, similar_customers in lookalike_results.items()
]

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv(output_path, index=False)

# Report the file that was actually written.
print(f"Lookalike model results saved to {output_path}")

Lookalike model results saved to Lookalike.csv
