In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Load datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Merge datasets: attach customer and product attributes to every transaction.
# Left joins keep every transaction even if its customer/product row is missing.
transactions = transactions.merge(customers, on='CustomerID', how='left')
transactions = transactions.merge(products, on='ProductID', how='left')

# Feature engineering: Aggregate customer-level data (one row per CustomerID)
customer_features = transactions.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    AvgTransactionValue=('TotalValue', 'mean'),
    TotalTransactions=('TransactionID', 'count'),
    AvgQuantity=('Quantity', 'mean'),
    Region=('Region', 'first')  # Keep region for categorical encoding
).reset_index()

# Encode Region using one-hot encoding and swap the raw column for the dummies
region_encoded = pd.get_dummies(customer_features['Region'], prefix='Region')
customer_features = pd.concat(
    [customer_features.drop(columns='Region'), region_encoded], axis=1
)

# Normalize numerical features to [0, 1] so no single feature dominates by scale
scaler = MinMaxScaler()
numerical_features = ['TotalSpending', 'AvgTransactionValue', 'TotalTransactions', 'AvgQuantity']
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

# Compute cosine similarity between every pair of customers
similarity_matrix = cosine_similarity(customer_features.drop(columns=['CustomerID']))

# Create Lookalike.csv
TOP_N = 3  # number of lookalikes reported per customer
customer_ids = customer_features['CustomerID'].to_numpy()
lookalikes = []
for i, customer_id in enumerate(customer_features['CustomerID']):
    # Get similarity scores for the current customer
    sim_scores = similarity_matrix[i]
    # Rank every customer by descending similarity. A stable sort makes tied
    # scores resolve deterministically (lower row position first).
    ranked = (-sim_scores).argsort(kind='stable')
    # Exclude the customer itself by position rather than assuming it sorts
    # last: a customer with identical features also scores 1.0, and an
    # all-zero feature row scores 0 against itself.
    top_indices = ranked[ranked != i][:TOP_N]
    for idx in top_indices:
        lookalikes.append({
            'CustomerID': customer_id,
            'LookalikeID': customer_ids[idx],
            'SimilarityScore': sim_scores[idx]
        })

# Save results (explicit columns keep the CSV header even if there are no rows)
lookalike_df = pd.DataFrame(lookalikes, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])
lookalike_df.to_csv('/content/Lookalike.csv', index=False)

print("Lookalike model completed. Results saved to 'Lookalike.csv'.")


Lookalike model completed. Results saved to 'Lookalike.csv'.


In [2]:
# Feature engineering: Enhanced aggregation with product preferences
# product_preference: one row per customer, one column per product Category,
# values are the number of transactions in that category.
product_preference = transactions.groupby(['CustomerID', 'Category']).size().unstack(fill_value=0)
customer_features = transactions.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    AvgTransactionValue=('TotalValue', 'mean'),
    TotalTransactions=('TransactionID', 'count'),
    AvgQuantity=('Quantity', 'mean'),
    Region=('Region', 'first')  # Keep region for encoding
).reset_index()

# Merge product preferences with customer features
customer_features = customer_features.merge(product_preference, on='CustomerID', how='left')
preference_columns = list(product_preference.columns)
if preference_columns:
    # groupby drops rows whose Category is missing, so a customer with no known
    # category gets NaN counts from the left merge; treat those as zero
    # purchases so the similarity computation does not fail on NaN.
    customer_features[preference_columns] = customer_features[preference_columns].fillna(0)

# Encode Region using one-hot encoding
region_encoded = pd.get_dummies(customer_features['Region'], prefix='Region')
customer_features = pd.concat(
    [customer_features.drop(columns='Region'), region_encoded], axis=1
)

# Relative importance of the spending-behaviour features
weights = {
    'TotalSpending': 0.4,
    'AvgTransactionValue': 0.3,
    'TotalTransactions': 0.2,
    'AvgQuantity': 0.1
}

# Normalize the behavioural and product-preference features to [0, 1].
# The columns are listed explicitly (instead of selected by dtype) so that a
# numeric CustomerID can never be rescaled by accident.
numerical_features = list(weights) + preference_columns
scaler = MinMaxScaler()
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

# Apply weights AFTER scaling. Min-max scaling is invariant to multiplying a
# column by a positive constant, so weighting before the scaler has no effect.
for feature, weight in weights.items():
    customer_features[feature] *= weight

# Compute cosine similarity
similarity_matrix = cosine_similarity(customer_features.drop(columns=['CustomerID']))

# Enhanced recommendation logic
TOP_N = 3  # number of lookalikes reported per customer
customer_ids = customer_features['CustomerID'].to_numpy()
lookalikes = []
for i, customer_id in enumerate(customer_features['CustomerID']):
    # Get similarity scores for the current customer
    sim_scores = similarity_matrix[i]
    # Rank every customer by descending similarity; the stable sort makes tied
    # scores resolve deterministically (lower row position first).
    ranked = (-sim_scores).argsort(kind='stable')
    # Exclude the customer itself by position rather than assuming it sorts
    # last (ties at 1.0 or an all-zero row would break that assumption).
    top_indices = ranked[ranked != i][:TOP_N]
    for idx in top_indices:
        lookalikes.append({
            'CustomerID': customer_id,
            'LookalikeID': customer_ids[idx],
            'SimilarityScore': sim_scores[idx]
        })

# Save the enhanced lookalike results (explicit columns keep the header even if empty)
lookalike_df = pd.DataFrame(lookalikes, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])
lookalike_df.to_csv('Enhanced_Lookalike.csv', index=False)

print("Enhanced Lookalike model completed. Results saved to 'Enhanced_Lookalike.csv'.")


Enhanced Lookalike model completed. Results saved to 'Enhanced_Lookalike.csv'.
