In [27]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables (transactions, customers, products) from the
# TASK2 directory, in that order.
transactions_df, customers_df, products_df = map(
    pd.read_csv,
    (
        'TASK2/Transactions.csv',
        'TASK2/Customers.csv',
        'TASK2/Products.csv',
    ),
)


In [28]:
# Build one transaction-level table enriched with product and customer columns.
# Both joins are left joins from the transactions table, so every transaction is
# kept even if its ProductID / CustomerID has no match; in that case the joined
# columns (e.g. Category, Region) are NaN for that row.
merged_df = transactions_df.merge(products_df, on="ProductID", how="left")
merged_df = merged_df.merge(customers_df, on="CustomerID", how="left")


def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if there is none.

    ``Series.mode()`` drops NaN by default and returns an empty Series when the
    group has no non-null values; indexing ``[0]`` on that empty result raises.
    Falling back to NaN keeps the aggregation from crashing on such groups.
    When several values tie, the first entry returned by ``mode()`` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


# Calculate customer-level features (aggregate transaction history)
customer_features = merged_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',          # Total spending
    'Quantity': 'sum',            # Total products purchased
    'Category': _most_frequent,   # Most frequent product category
    'Region': _most_frequent      # Most common region
}).reset_index()

In [29]:
# One-hot encode the categorical aggregates; drop_first=True omits one
# reference level per encoded feature.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Category', 'Region'],
    drop_first=True,
)

# Standardize the numeric aggregates so they are on a comparable scale before
# the similarity computation.
numeric_columns = ['TotalValue', 'Quantity']
scaler = StandardScaler()
customer_features[numeric_columns] = scaler.fit_transform(
    customer_features[numeric_columns]
)

# Pairwise cosine similarity over every column except the first one, then
# label both axes of the resulting matrix with the customer IDs.
# NOTE(review): iloc[:, 1:] relies on CustomerID being the first column.
customer_ids = customer_features['CustomerID']
similarity_matrix = cosine_similarity(customer_features.iloc[:, 1:])
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [30]:
# Function to get the top-n most similar customers for a given customer
def get_top_similar(customers, customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customers : pandas.DataFrame
        Similarity table; the row labelled ``customer_id`` is read, and its
        column labels are the candidate customer IDs.
    customer_id : hashable
        Row label of the customer to find lookalikes for.
    n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of tuple
        ``(other_customer_id, score)`` pairs in descending score order, with
        each score rounded to 4 decimals. ``customer_id`` itself is never
        included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a row label of ``customers``.
    """
    scores = customers.loc[customer_id]
    # Exclude the customer by label rather than by slicing off position 0:
    # when another customer has an identical score (e.g. similarity 1.0), the
    # customer itself is not guaranteed to sort first, and a positional slice
    # could return it as its own lookalike while dropping the real match.
    others = scores[scores.index != customer_id]
    # Stable sort keeps tied scores in their original column order, so the
    # result is deterministic.
    top = others.sort_values(ascending=False, kind="mergesort").head(max(n, 0))
    return [(idx, round(score, 4)) for idx, score in top.items()]

# Generate Lookalike map for the first 20 customers listed in the customers table.
# similarity_df is only looked up for IDs it actually contains: an ID that is
# present in customers_df but has no row in similarity_df would make the .loc
# lookup inside get_top_similar raise KeyError and abort the whole run, so such
# customers get an empty lookalike list instead.
lookalike_map = {}
for customer_id in customers_df['CustomerID'][:20]:
    if customer_id in similarity_df.index:
        lookalike_map[customer_id] = get_top_similar(similarity_df, customer_id)
    else:
        lookalike_map[customer_id] = []

# Convert lookalike map into a DataFrame for saving: one row per customer,
# with the list of (customer_id, score) pairs kept in a single column.
lookalike_df = pd.DataFrame([
    {'cust_id': cust_id, 'lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
])


In [31]:
# Write the lookalike table to disk, omitting the DataFrame index column.
lookalike_df.to_csv("Lookalike.csv", index=False)

# Echo the table (at most 20 rows) under a heading; sep="\n" puts the heading
# and the table on separate lines.
print(
    "Lookalike Map for First 20 Customers:",
    lookalike_df.head(20),
    sep="\n",
)


Lookalike Map for First 20 Customers:
   cust_id                                         lookalikes
0    C0001  [(C0184, 0.9935), (C0048, 0.982), (C0190, 0.96...
1    C0002  [(C0088, 0.9961), (C0092, 0.8988), (C0077, 0.8...
2    C0003  [(C0076, 0.9404), (C0052, 0.9075), (C0031, 0.8...
3    C0004  [(C0165, 0.984), (C0169, 0.981), (C0087, 0.9627)]
4    C0005  [(C0186, 0.9969), (C0140, 0.9912), (C0146, 0.9...
5    C0006  [(C0187, 0.979), (C0126, 0.9785), (C0011, 0.95...
6    C0007   [(C0146, 1.0), (C0005, 0.9847), (C0115, 0.9749)]
7    C0008  [(C0065, 0.8535), (C0136, 0.8478), (C0059, 0.8...
8    C0009   [(C0198, 1.0), (C0061, 0.9675), (C0062, 0.9583)]
9    C0010  [(C0062, 0.9396), (C0111, 0.9244), (C0103, 0.8...
10   C0011  [(C0006, 0.9571), (C0137, 0.9276), (C0126, 0.9...
11   C0012  [(C0163, 0.9927), (C0104, 0.9716), (C0113, 0.9...
12   C0013  [(C0099, 0.986), (C0108, 0.961), (C0155, 0.8791)]
13   C0014  [(C0060, 0.9994), (C0097, 0.9353), (C0128, 0.9...
14   C0015  [(C0131, 0.982), (C0