In [26]:
import pandas as pd

# Load the datasets
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Check the first few rows of each dataset
print(customers_df.head())
print(transactions_df.head())
print(products_df.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  
  ProductID              ProductName     Category   Price
0      P001

In [27]:
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime

# Extract Tenure (number of days since signup)
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
customers_df['Tenure'] = (datetime.now() - customers_df['SignupDate']).dt.days

# One-hot encode the 'Region' column
encoder = OneHotEncoder(sparse_output=False)  # Corrected argument
region_encoded = encoder.fit_transform(customers_df[['Region']])
region_df = pd.DataFrame(region_encoded, columns=encoder.categories_[0])

# Combine customer features
customer_features = pd.concat([customers_df[['CustomerID', 'Tenure']], region_df], axis=1)

# Display processed customer data
print(customer_features.head())


  CustomerID  Tenure  Asia  Europe  North America  South America
0      C0001     932   0.0     0.0            0.0            1.0
1      C0002    1079   1.0     0.0            0.0            0.0
2      C0003     326   0.0     0.0            0.0            1.0
3      C0004     841   0.0     0.0            0.0            1.0
4      C0005     896   1.0     0.0            0.0            0.0


In [28]:
# Aggregate transaction data by CustomerID
transaction_summary = transactions_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'TransactionID': 'count'  # Total number of transactions
}).reset_index()

# Rename columns for clarity
transaction_summary.rename(columns={'TotalValue': 'TotalAmountSpent', 'TransactionID': 'TransactionCount'}, inplace=True)

# Merge transaction data with customer features
customer_data = pd.merge(customer_features, transaction_summary, on='CustomerID', how='left')

# Display the updated customer data
print(customer_data.head())


  CustomerID  Tenure  Asia  Europe  North America  South America  \
0      C0001     932   0.0     0.0            0.0            1.0   
1      C0002    1079   1.0     0.0            0.0            0.0   
2      C0003     326   0.0     0.0            0.0            1.0   
3      C0004     841   0.0     0.0            0.0            1.0   
4      C0005     896   1.0     0.0            0.0            0.0   

   TotalAmountSpent  TransactionCount  
0           3354.52               5.0  
1           1862.74               4.0  
2           2725.38               4.0  
3           5354.88               8.0  
4           2034.24               3.0  


In [29]:
# Merge the transactions with the product data to get product categories
product_category_summary = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID', how='left')

# Aggregate the product category purchases by customer
product_category_counts = product_category_summary.groupby(['CustomerID', 'Category']).agg({
    'Quantity': 'sum'  # Total quantity purchased for each category
}).reset_index()

# Pivot the table to create one column for each product category
product_category_pivot = product_category_counts.pivot_table(
    index='CustomerID', columns='Category', values='Quantity', aggfunc='sum', fill_value=0)

# Merge product category data with the customer profile
customer_data = pd.merge(customer_data, product_category_pivot, on='CustomerID', how='left').fillna(0)

# Display the updated customer data
print(customer_data.head())


  CustomerID  Tenure  Asia  Europe  North America  South America  \
0      C0001     932   0.0     0.0            0.0            1.0   
1      C0002    1079   1.0     0.0            0.0            0.0   
2      C0003     326   0.0     0.0            0.0            1.0   
3      C0004     841   0.0     0.0            0.0            1.0   
4      C0005     896   1.0     0.0            0.0            0.0   

   TotalAmountSpent  TransactionCount  Books  Clothing  Electronics  \
0           3354.52               5.0    2.0       0.0          7.0   
1           1862.74               4.0    0.0       4.0          0.0   
2           2725.38               4.0    0.0       4.0          4.0   
3           5354.88               8.0    8.0       0.0          6.0   
4           2034.24               3.0    0.0       0.0          4.0   

   Home Decor  
0         3.0  
1         6.0  
2         6.0  
3         9.0  
4         3.0  


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Extract feature vectors (excluding CustomerID)
customer_features = customer_data.drop(columns=['CustomerID'])

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(customer_features)

# Check the similarity of the first few customers
print(similarity_matrix[:5, :5])  # Similarity between the first 5 customers


[[1.         0.96790277 0.98847588 0.99336859 0.98966494]
 [0.96790277 1.         0.91871733 0.93259661 0.99393695]
 [0.98847588 0.91871733 1.         0.99932318 0.95655606]
 [0.99336859 0.93259661 0.99932318 1.         0.96661686]
 [0.98966494 0.99393695 0.95655606 0.96661686 1.        ]]


In [31]:
# Get the top 3 most similar customers for each customer
top_lookalikes = {}
for i in range(len(customer_data)):
    # Get similarity scores for customer i
    similarity_scores = similarity_matrix[i]
    
    # Exclude the customer itself by setting its similarity score to -1
    similarity_scores[i] = -1

    # Get the indices of the top 3 most similar customers
    top_indices = similarity_scores.argsort()[-3:][::-1]
    
    # Store the top 3 lookalikes and their similarity scores
    top_lookalikes[customer_data['CustomerID'][i]] = [
        (customer_data['CustomerID'][idx], similarity_scores[idx]) for idx in top_indices
    ]

# Display the top lookalikes for the first 5 customers
for customer, lookalikes in list(top_lookalikes.items())[:5]:
    print(f"Top 3 lookalikes for {customer}: {lookalikes}")


Top 3 lookalikes for C0001: [('C0174', 0.9999879436694386), ('C0135', 0.9999844934562775), ('C0139', 0.9999822895308264)]
Top 3 lookalikes for C0002: [('C0029', 0.9999664526490727), ('C0025', 0.9999661281475694), ('C0121', 0.9999567212133877)]
Top 3 lookalikes for C0003: [('C0148', 0.9999981615583643), ('C0021', 0.999996159357211), ('C0089', 0.9999948868486738)]
Top 3 lookalikes for C0004: [('C0167', 0.9999968080694094), ('C0142', 0.999994344111737), ('C0034', 0.999992794365829)]
Top 3 lookalikes for C0005: [('C0159', 0.9999921319550306), ('C0176', 0.9999782818287117), ('C0166', 0.9999756615037821)]


In [32]:
import pandas as pd

# Prepare the data for saving as Lookalike.csv
lookalike_data = []
for customer, lookalikes in top_lookalikes.items():
    for lookalike, score in lookalikes:
        lookalike_data.append([customer, lookalike, score])

# Create a DataFrame and save it to CSV
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been saved successfully.")


Lookalike.csv has been saved successfully.


# Task 2: Lookalike Model

## Objective
Build a Lookalike Model that recommends 3 similar customers for each of the first 20 customers based on their demographic profile and transaction history. The model uses both customer and product information to calculate similarity scores and generate personalized customer recommendations.

## Process

### 1. **Data Preprocessing**:
   - **Loaded and cleaned** the `Customers.csv`, `Transactions.csv`, and `Products.csv` datasets.
   - **Processed customer data** to create relevant features such as `Tenure` (days since signup) and one-hot encoded `Region`.
   - **Aggregated transaction data** to calculate customer metrics like `TotalAmountSpent` and `TransactionCount`.
   - **Integrated product data** to calculate the types of products purchased by each customer and aggregated them into product category features.

### 2. **Similarity Calculation**:
   - **Combined customer profile features**, transaction data, and product purchase patterns into a unified feature vector for each customer.
   - **Calculated the Cosine Similarity** between customers to measure their similarity based on their features.

### 3. **Lookalike Recommendations**:
   - For each of the first 20 customers, **identified the top 3 most similar customers** based on similarity scores.
   - **Created a mapping** of each customer to their top 3 lookalike customers with similarity scores.

## Deliverables
- **Lookalike.csv**: Contains the top 3 most similar customers for each of the first 20 customers, along with similarity scores.
- **Python Code**: The full implementation of the Lookalike Model and all relevant steps are provided in the Python script for reproducibility.

## Example Insights (Placeholder):
- Customers with similar purchasing behavior can be targeted with **similar marketing campaigns**.
- Certain **product categories** are preferred by specific customer segments, revealing opportunities for **personalized product recommendations**.
- **High-spending customers** can be grouped with lookalike customers for **retention strategies**.

## Next Steps
Use the generated lookalikes to:
- **Improve marketing efforts** by targeting lookalike customers with tailored campaigns.
- **Enhance product recommendations** based on similar customer behavior.
- **Build personalized customer experiences** leveraging the insights from the lookalike model.
