# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets

In [3]:
# Read the three raw tables from the working directory, in the order
# customers -> products -> transactions.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

### Merge datasets

In [4]:
# Attach the customer profile and the product attributes to every transaction.
# Both lookups are declared many-to-one (many transactions -> one customer /
# one product): if Customers.csv or Products.csv contains a duplicated key,
# pandas raises MergeError instead of silently duplicating transaction rows,
# which would inflate the per-customer spend and frequency computed later.
# how='inner' is the pandas default and is spelled out only for readability:
# transactions whose CustomerID / ProductID has no match are dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner', validate='many_to_one')
    .merge(products, on='ProductID', how='inner', validate='many_to_one')
)

# Feature Engineering

### Aggregate customer transaction data

In [10]:
# Collapse the transaction-level table to one row per customer:
#   TotalSpend        - sum of TotalValue over the customer's transactions
#   PurchaseFrequency - number of (non-null) TransactionID values
#   PreferredCategory - category that occurs in the most transactions
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        PurchaseFrequency=('TransactionID', 'count'),
        PreferredCategory=('Category', lambda s: s.value_counts().idxmax()),
    )
    .reset_index()
)

### Merge with customer profile

In [11]:
# Add the profile columns from `customers` to the per-customer aggregates.
# validate='one_to_one' makes pandas raise MergeError if a CustomerID occurs
# more than once on either side; a duplicated customer row would otherwise
# appear twice in the feature matrix and in the similarity ranking.
# NOTE: this cell overwrites its own input. Re-running it without first
# re-running the aggregation cell merges the profile columns a second time
# (pandas then suffixes the clashing columns with _x / _y).
customer_features = customer_features.merge(
    customers,
    on='CustomerID',
    how='inner',
    validate='one_to_one',
)

### Encode categorical variables

In [12]:
# Replace each label column with its integer category code (missing values
# become -1).
# NOTE(review): the codes impose an arbitrary numeric order on nominal labels,
# and that order feeds into the scaling and similarity steps - confirm this is
# intended, or consider one-hot encoding instead.
for label_column in ('PreferredCategory', 'Region'):
    as_category = customer_features[label_column].astype('category')
    customer_features[label_column] = as_category.cat.codes

# Normalize features

In [15]:
# Everything except the identifier / free-text / date columns is treated as a
# model feature.
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
numerical_features = customer_features.drop(columns=non_feature_columns)

# Rescale every feature column to the [0, 1] range.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(numerical_features)

# Wrap the scaled array in a DataFrame keyed by CustomerID so that rows can be
# traced back to customers in the following cells.
normalized_features_df = pd.DataFrame(
    data=normalized_features,
    index=customer_features['CustomerID'],
    columns=numerical_features.columns,
)

# Compute similarity matrix

In [16]:
# Pairwise cosine similarity between customer feature rows: entry [i, j]
# compares the i-th and j-th rows of normalized_features_df.
similarity_matrix = cosine_similarity(normalized_features_df.to_numpy())

# Generate Lookalike Recommendations

In [18]:
# Number of lookalikes kept per customer.
N_LOOKALIKES = 3

# Map each CustomerID to a list of (other CustomerID, similarity score) pairs,
# ordered from most to least similar.
lookalike_results = {}
customer_ids = normalized_features_df.index
for i, customer_id in enumerate(customer_ids):
    # Rank all customers by descending similarity. A stable sort keeps ties in
    # row order, so the ranking is reproducible across runs.
    ranked = np.argsort(-similarity_matrix[i], kind='stable')
    # Drop the customer's own row explicitly. Skipping "the first entry" is not
    # safe: with an exact tie, or an all-zero feature row (self-similarity 0),
    # the customer is not guaranteed to rank first against themselves.
    top_indices = ranked[ranked != i][:N_LOOKALIKES]
    # Store plain Python floats so the scores print and serialise the same way
    # regardless of the installed NumPy version.
    lookalike_results[customer_id] = [
        (customer_ids[j], float(similarity_matrix[i, j]))
        for j in top_indices
    ]

# Output Lookalike CSV

In [22]:
# Customer IDs covered by the export: C0001 .. C0020. Built once as a set so
# the membership test below does not rebuild a list for every customer.
target_customer_ids = {f'C{str(i).zfill(4)}' for i in range(1, 21)}

# Keep only the target customers, in the row order of customer_features.
lookalike_subset = {
    cust: lookalike_results[cust]
    for cust in customer_features['CustomerID']
    if cust in target_customer_ids
}

# One row per customer; the lookalike list is stored as its string form.
# Scores are cast to plain floats first: the repr of a NumPy scalar is
# "np.float64(...)" under NumPy >= 2, which would otherwise end up in the CSV.
# Passing `columns` keeps the header even if no target customer is present.
lookalike_df = pd.DataFrame(
    [
        {
            'CustomerID': cust,
            'Lookalikes': str([(other_id, float(score)) for other_id, score in lookalikes]),
        }
        for cust, lookalikes in lookalike_subset.items()
    ],
    columns=['CustomerID', 'Lookalikes'],
)
lookalike_csv_path = 'Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_path, index=False)

# Display Lookalike.csv for validation
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [('C0107', 0.9998220857003319), ('C0190', 0.99...
1       C0002  [('C0186', 0.9979310971407718), ('C0178', 0.99...
2       C0003  [('C0133', 0.9999493727058358), ('C0052', 0.99...
3       C0004  [('C0132', 0.9977259826289412), ('C0073', 0.99...
4       C0005  [('C0178', 0.9996714660422606), ('C0186', 0.99...
5       C0006  [('C0063', 0.999787939417756), ('C0187', 0.996...
6       C0007  [('C0115', 0.9978094704222316), ('C0005', 0.99...
7       C0008  [('C0127', 0.992103304373321), ('C0111', 0.986...
8       C0009  [('C0157', 0.9998892505829147), ('C0181', 0.99...
9       C0010  [('C0113', 0.9946633391063928), ('C0111', 0.99...
10      C0011  [('C0137', 0.9994905572343078), ('C0191', 0.99...
11      C0012  [('C0062', 0.9996385359929274), ('C0183', 0.99...
12      C0013  [('C0099', 0.9976887723054378), ('C0108', 0.99...
13      C0014  [('C0060', 0.9997760875144579), ('C0198', 0.97...
14      C0015  [('C0071',