In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables from CSV files in the working directory.
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Preview the leading rows of every table to get a feel for its layout.
for heading, frame in (
    ("Customers DataFrame:", customers_df),
    ("\nProducts DataFrame:", products_df),
    ("\nTransactions DataFrame:", transactions_df),
):
    print(heading)
    print(frame.head())

# Report the per-column count of missing entries for every table.
for heading, frame in (
    ("\nMissing values in Customers:\n", customers_df),
    ("\nMissing values in Products:\n", products_df),
    ("\nMissing values in Transactions:\n", transactions_df),
):
    print(heading, frame.isna().sum())

# Data cleaning: drop exact duplicate rows, mutating each table in place so
# the three module-level names keep pointing at the cleaned frames.
for frame in (customers_df, products_df, transactions_df):
    frame.drop_duplicates(inplace=True)

# Confirm that no duplicate rows are left after the clean-up.
for heading, frame in (
    ("\nDuplicates in Customers:", customers_df),
    ("Duplicates in Products:", products_df),
    ("Duplicates in Transactions:", transactions_df),
):
    print(heading, frame.duplicated().sum())

# Parse the transaction timestamps into real datetime values.
parsed_dates = pd.to_datetime(transactions_df['TransactionDate'])
transactions_df['TransactionDate'] = parsed_dates

# Feature Engineering
# Summarise each customer's transaction history. Named aggregation produces
# the final column names directly, so no separate rename step is needed.
customer_features = (
    transactions_df.groupby('CustomerID')
    .agg(
        Total_Spending=('TotalValue', 'sum'),  # Total spending
        Transaction_Count=('TransactionDate', 'count'),  # Number of transactions
        Unique_Products=('ProductID', 'nunique'),  # Unique products purchased
    )
    .reset_index()
)

# Add the customer's Region from customers_df. A left merge keeps every
# customer that has transactions, even when no matching customers_df row
# exists (Region is then NaN for that customer).
customer_features = customer_features.merge(
    customers_df[['CustomerID', 'Region']], on='CustomerID', how='left'
)

# Check the resulting customer features DataFrame
print("\nCustomer Features DataFrame:")
print(customer_features.head())

# Handle missing values in an optional 'Age' column. Only 'Region' is merged
# above, so this branch runs only if an 'Age' column is added upstream.
if 'Age' in customer_features.columns:
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selected column: the in-place form operates on an intermediate
    # object, which warns in recent pandas and does not update the frame
    # when copy-on-write is enabled.
    customer_features['Age'] = customer_features['Age'].fillna(
        customer_features['Age'].median()
    )

# One-hot encode 'Region'. drop_first=True omits the first category so the
# remaining indicator columns are not redundant with one another.
if 'Region' in customer_features.columns:
    customer_features = pd.get_dummies(
        customer_features, columns=['Region'], drop_first=True
    )

# Final DataFrame for modeling
print("\nFinal Customer Features DataFrame:")
print(customer_features.head())

# Per-product summary built from the transaction history: total sales
# value, total quantity sold, and number of transactions.
product_aggregations = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'TransactionDate': 'count',
}
transactions_by_product = transactions_df.groupby('ProductID')
product_features = transactions_by_product.agg(product_aggregations)
# Turn the ProductID group key back into an ordinary column.
product_features = product_features.reset_index()

# Split the identifier column from the model inputs: CustomerID labels the
# rows but is not itself a feature.
customer_ids = customer_features['CustomerID']
features = customer_features.drop('CustomerID', axis=1)

# Standardize the feature columns before comparing customers.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Pairwise cosine similarity between every pair of customers.
similarity_matrix = cosine_similarity(features_scaled)

# Label both axes with customer IDs so scores can be looked up by ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Show the first rows of the labelled similarity matrix.
print("\nSimilarity Matrix:")
print(similarity_df.head())

# Function to get top N similar customers for a given customer
def get_top_n_similar_customers(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up.
        n: Maximum number of lookalike customers to return.
        similarity: Similarity DataFrame whose index and columns are both
            customer IDs. When omitted, the module-level ``similarity_df``
            is used.

    Returns:
        A Series of similarity scores indexed by customer ID, ordered from
        most to least similar, that never contains ``customer_id`` itself.
        If ``customer_id`` is not in the matrix index, a "not found" message
        string is returned instead (kept for backward compatibility, so
        callers that may pass unknown IDs must check the return type).
    """
    if similarity is None:
        similarity = similarity_df

    if customer_id not in similarity.index:
        return f"Customer ID {customer_id} not found."

    # Remove the customer's own entry BEFORE ranking. Ranking the top n + 1
    # first and filtering self out afterwards returns n + 1 rows whenever
    # other customers tie with (or, through floating-point noise, exceed)
    # the self-similarity and push the customer's own row out of the top
    # n + 1.
    scores = similarity[customer_id].drop(labels=customer_id)
    return scores.nlargest(n)

# Look up the top 3 lookalikes for each of the first 20 customers listed in
# customer_features.
lookalike_recommendations = {
    customer_id: get_top_n_similar_customers(customer_id, n=3)
    for customer_id in customer_features['CustomerID'][:20]
}

# Print one summary line per customer: lookalike IDs and their scores.
print("\nLookalike Recommendations:")
for customer_id, similar_ids in lookalike_recommendations.items():
    print(f"Customer ID: {customer_id} -> Similar Customers: {similar_ids.index.tolist()} with Scores: {similar_ids.values.tolist()}")

# Flatten the recommendations into one record per (customer, lookalike)
# pair, which is the row layout written to Lookalike.csv.
lookalike_results = [
    {
        'CustomerID': customer_id,
        'LookalikeID': similar_id,
        'SimilarityScore': similar_ids[similar_id],
    }
    for customer_id, similar_ids in lookalike_recommendations.items()
    for similar_id in similar_ids.index
]

# Tabulate the records and write them out without the row index.
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the first rows of what was written.
print("\nLookalike Recommendations DataFrame:")
print(lookalike_df.head())

Customers DataFrame:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products DataFrame:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions DataFrame:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00