In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load the datasets
customers = pd.read_csv('/Users/anshporwal/Desktop/eCommerce Analysis/Customers.csv')
products = pd.read_csv('/Users/anshporwal/Desktop/eCommerce Analysis/Products.csv')
transactions = pd.read_csv('/Users/anshporwal/Desktop/eCommerce Analysis/Transactions.csv')

# Merge customers and transactions on 'CustomerID'
data = pd.merge(transactions, customers, on='CustomerID')

# Merge data with products on 'ProductID'
data = pd.merge(data, products, on='ProductID')

# Create a pivot table with CustomerID as index and ProductName as columns, using Quantity as values
pivot_data = data.pivot_table(index='CustomerID', columns='ProductName', values='Quantity', aggfunc='sum', fill_value=0)

# Normalize the pivot table using StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pivot_data)

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(scaled_data)

# Convert the cosine similarity matrix to a DataFrame for easy handling
cosine_sim_df = pd.DataFrame(cosine_sim, index=pivot_data.index, columns=pivot_data.index)

# Function to preprocess customer profile data
def preprocess_customer_profile(customers):
    # Encode categorical variables (e.g., 'Region')
    le = LabelEncoder()
    customers['Region'] = le.fit_transform(customers['Region'])
    
    # Handle missing values by filling NaN values with 0
    customers['Region'] = customers['Region'].fillna(0)
    
    return customers[['CustomerID', 'Region']]

# Combine profile and transaction data
profile_data = preprocess_customer_profile(customers)
profile_data.set_index('CustomerID', inplace=True)

# Merge profile data with transaction history
customer_data = pd.concat([pivot_data, profile_data], axis=1)

# Handle missing values in customer data
customer_data = customer_data.fillna(0)  # Replace NaN values with 0 for missing values

# Normalize the combined data (both transaction and profile)
combined_data = StandardScaler().fit_transform(customer_data)

# Compute the cosine similarity matrix again for the combined data
combined_cosine_sim = cosine_similarity(combined_data)

# Convert the new cosine similarity matrix to a DataFrame
combined_cosine_sim_df = pd.DataFrame(combined_cosine_sim, index=customer_data.index, columns=customer_data.index)

# Function to recommend similar customers based on profile and transaction history
def recommend_similar_customers(customer_id, top_n=3):
    # Get the similarity scores for the given customer from the combined similarity matrix
    similar_scores = combined_cosine_sim_df[customer_id]
    
    # Sort the similarity scores in descending order and exclude the customer itself (similarity=1)
    similar_scores = similar_scores.sort_values(ascending=False)
    similar_scores = similar_scores[similar_scores < 1]
    
    # Get the top_n similar customers and their similarity scores
    top_similar = similar_scores.head(top_n)
    
    return top_similar

# User Input: Get the CustomerID from the user
customer_id = input("Enter CustomerID (e.g., C0001): ").strip()

# Validate that the customer exists
if customer_id not in combined_cosine_sim_df.index:
    print("CustomerID not found. Please enter a valid CustomerID.")
else:
    # Get the top 3 similar customers for the input customer
    top_similar_customers = recommend_similar_customers(customer_id)
    
    # Display the recommendations
    print(f"Top 3 similar customers to {customer_id} are:")
    for idx, score in zip(top_similar_customers.index, top_similar_customers.values):
        print(f"CustomerID: {idx}, Similarity Score: {score}")
    
    # Save the recommendations to a CSV file
    recommendations_df = pd.DataFrame({
        'CustomerID': [customer_id] * len(top_similar_customers),
        'SimilarCustomerID': top_similar_customers.index,
        'SimilarityScore': top_similar_customers.values
    })
    
    # Save to CSV file
    recommendations_df.to_csv('Ansh_Porwal_Lookalike.csv', index=False)
    print("Recommendations saved to Ansh_Porwal_Lookalike.csv")


Top 3 similar customers to C0002 are:
CustomerID: C0002, Similarity Score: 0.9999999999999999
CustomerID: C0030, Similarity Score: 0.6586503065624391
CustomerID: C0173, Similarity Score: 0.45024031937925674
Recommendations saved to Ansh_Porwal_Lookalike.csv
