In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')



In [2]:

# Load the three raw tables; the date columns are converted to datetime
# right after reading so later date arithmetic works.
customers_df = pd.read_csv('Customers.csv').assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate'])
)
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv').assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'])
)

In [3]:
# Preview the first five rows of each input table, in load order.
for table in (customers_df, products_df, transactions_df):
    print(table.head(5))

  CustomerID        CustomerName         Region SignupDate
0      C0001    Lawrence Carroll  South America 2022-07-10
1      C0002      Elizabeth Lutz           Asia 2022-02-13
2      C0003      Michael Rivera  South America 2024-03-07
3      C0004  Kathleen Rodriguez  South America 2022-10-09
4      C0005         Laura Weber           Asia 2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272   

### Function: `create_customer_features`

This function generates a feature matrix for each customer by combining customer, transaction, and product data. Below is an explanation of how the function works:

1. **Basic Customer Features**:
   - Extracts `CustomerID` as a base feature.

2. **Region Encoding**:
   - Uses one-hot encoding to transform the `Region` column into numerical features for each region.

3. **Signup Recency**:
   - Calculates the number of days since a customer signed up using the reference date (`2024-12-31`).

4. **Transaction Features**:
   - Aggregates transaction data for each customer to calculate:
     - `TransactionCount`: Total number of transactions.
     - `TotalQuantity`: Total quantity of items purchased.
     - `AvgQuantity`: Average quantity per transaction.
     - `TotalSpend`: Total spending across all transactions.
     - `AvgSpend`: Average spending per transaction.
     - `AvgPrice`: Average price of purchased products.

5. **Category Preferences**:
   - Merges transaction data with product information to identify product categories.
   - Calculates total spending by category for each customer.
   - Converts category spending into percentages to show preferences.

6. **Feature Merging**:
   - Combines all generated features (basic, region, transaction, and category preferences) into a single feature matrix.
   - Handles missing data by filling `NaN` values with `0`.

7. **Return Value**:
   - Outputs a complete feature matrix for all customers, ready for further analysis.


- After generating the feature matrix:
  - `StandardScaler` standardizes every feature column so that features measured on different scales contribute comparably.
  - A similarity matrix is then computed with cosine similarity to measure how similar customers are based on their behavior and preferences.


In [4]:
def create_customer_features(customers_df, transactions_df, products_df,
                             reference_date='2024-12-31'):
    """Build one feature row per customer, with missing values filled by 0.

    Args:
        customers_df: Customer table; the columns ``CustomerID``, ``Region``
            and ``SignupDate`` (datetime) are read.
        transactions_df: Transaction table; the columns ``TransactionID``,
            ``CustomerID``, ``ProductID``, ``Quantity``, ``TotalValue`` and
            ``Price`` are read.
        products_df: Product table; the columns ``ProductID`` and
            ``Category`` are read.
        reference_date: Date from which signup age is measured. Anything
            accepted by ``pd.to_datetime``; defaults to ``'2024-12-31'``.

    Returns:
        DataFrame with one row per row of ``customers_df`` containing, in
        order: ``CustomerID``, ``DaysSinceSignup``, one ``Region_*`` dummy
        column per region, the six transaction aggregates
        (``TransactionCount``, ``TotalQuantity``, ``AvgQuantity``,
        ``TotalSpend``, ``AvgSpend``, ``AvgPrice``) and one ``Pct_<category>``
        column per product category holding that category's share of the
        customer's spend. Customers without transactions get 0 in every
        transaction and category column.
    """
    # Identifier plus signup age in whole days relative to the reference date.
    base_features = customers_df[['CustomerID']].copy()
    snapshot_date = pd.to_datetime(reference_date)
    base_features['DaysSinceSignup'] = (snapshot_date - customers_df['SignupDate']).dt.days

    # One-hot encode the region; rows share the index of customers_df.
    region_features = pd.get_dummies(customers_df['Region'], prefix='Region')

    # Named aggregation ties every output column to its source column and
    # function explicitly instead of relying on positional renaming.
    transaction_metrics = (
        transactions_df
        .groupby('CustomerID')
        .agg(
            TransactionCount=('TransactionID', 'count'),
            TotalQuantity=('Quantity', 'sum'),
            AvgQuantity=('Quantity', 'mean'),
            TotalSpend=('TotalValue', 'sum'),
            AvgSpend=('TotalValue', 'mean'),
            AvgPrice=('Price', 'mean'),
        )
        .fillna(0)
        .reset_index()
    )

    # Only the columns needed for category spend take part in the merge, so
    # columns present in both tables (e.g. Price) cannot collide.
    trans_products = transactions_df[['CustomerID', 'ProductID', 'TotalValue']].merge(
        products_df[['ProductID', 'Category']], on='ProductID'
    )
    category_spend = (
        trans_products
        .groupby(['CustomerID', 'Category'])['TotalValue']
        .sum()
        .unstack(fill_value=0)
    )

    # Convert spend per category into a share of the customer's total spend;
    # a zero total yields NaN, which is mapped to 0.
    category_totals = category_spend.sum(axis=1)
    category_percentages = category_spend.div(category_totals, axis=0).fillna(0)
    category_percentages.columns = [f'Pct_{col}' for col in category_percentages.columns]

    final_features = (
        base_features
        .merge(region_features, left_index=True, right_index=True)
        .merge(transaction_metrics, on='CustomerID', how='left')
        .merge(category_percentages.reset_index(), on='CustomerID', how='left')
    )

    # Left merges leave NaN for customers without transactions.
    return final_features.fillna(0)


# Assemble the per-customer feature table from the three input tables.
feature_matrix = create_customer_features(customers_df, transactions_df, products_df)

# Keep the IDs aside for lookups; every remaining column is a model feature.
customer_ids = feature_matrix['CustomerID']
features = feature_matrix.drop(columns='CustomerID')

# Standardize the feature columns, then compare every pair of customers.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
similarity_matrix = cosine_similarity(scaled_features)

### Function: `get_top_lookalikes`

This function identifies the top N most similar customers for a given customer based on a pre-computed similarity matrix. Below is the breakdown of its components:

1. **Parameters**:
   - `customer_id`: The ID of the customer for whom we want to find similar customers.
   - `n`: The number of similar customers to retrieve (default is 3).

2. **Steps**:
   - **Find the Customer Index**:
     - Locate the position of the given `customer_id` in the `customer_ids` Series.
   - **Retrieve Similarities**:
     - Use the similarity matrix to fetch similarity scores for the given customer.
   - **Sort Similarities**:
     - Sort similarity scores in descending order and exclude the customer itself.
   - **Extract Top N Similar Customers**:
     - Retrieve the IDs and similarity scores of the top N most similar customers.

3. **Error Handling**:
   - If the `customer_id` is not found or any other issue occurs, the function prints an error message and returns `None`.

4. **Return Value**:
   - A list of dictionaries containing:
     - `similar_customer`: The ID of the similar customer.
     - `similarity_score`: The similarity score rounded to 4 decimal places.

---

### Usage Example
1. **Generate Lookalikes for First 20 Customers**:
   - The code generates lookalike data for customers with IDs `C0001` to `C0020`.
   - For each target customer:
     - Calls `get_top_lookalikes` to get the top 3 similar customers.
   - The results include:
     - `CustomerID`: Target customer ID.
     - `Similar_Customer_1` to `Similar_Customer_3`: The IDs of the top 3 similar customers.
     - `Score_1` to `Score_3`: The similarity scores of the corresponding customers.

2. **Output**:
   - A DataFrame (`output_df`) is created from the results.
   - The DataFrame is saved to a CSV file named `Lookalike.csv`.

---

### Output File: `Lookalike.csv`
- This file contains the following columns:
  - `CustomerID`: Target customer ID.
  - `Similar_Customer_1`, `Similar_Customer_2`, `Similar_Customer_3`: The IDs of the most similar customers.
  - `Score_1`, `Score_2`, `Score_3`: The corresponding similarity scores.



In [5]:
def get_top_lookalikes(customer_id, n=3, ids=None, sim_matrix=None):
    """Get the top N most similar customers for the given customer ID.

    Args:
        customer_id: ID of the customer to find lookalikes for.
        n: Maximum number of lookalikes to return (default 3).
        ids: Optional sequence of customer IDs ordered like the rows of
            ``sim_matrix``. Defaults to the module-level ``customer_ids``.
        sim_matrix: Optional square matrix of pairwise similarities.
            Defaults to the module-level ``similarity_matrix``.

    Returns:
        A list of dicts with the keys ``similar_customer`` and
        ``similarity_score`` (rounded to 4 decimals), ordered from most to
        least similar and never containing ``customer_id`` itself. If the
        ID is unknown or any other error occurs, an error message is printed
        and ``None`` is returned.
    """
    try:
        id_values = np.asarray(customer_ids if ids is None else ids)
        scores = np.asarray(similarity_matrix if sim_matrix is None else sim_matrix)

        # Look the customer up by POSITION: matrix rows are positional, so an
        # index label would point at the wrong row for a non-default index.
        matches = np.flatnonzero(id_values == customer_id)
        if matches.size == 0:
            raise LookupError('customer ID not found')
        customer_pos = matches[0]

        similarities = scores[customer_pos]

        # Rank everyone by descending similarity and drop the customer
        # explicitly; with ties at the top it is not guaranteed to sort first.
        ranked = np.argsort(similarities)[::-1]
        ranked = ranked[ranked != customer_pos][:max(n, 0)]

        return [
            {
                'similar_customer': id_values[idx],
                'similarity_score': round(similarities[idx], 4)
            }
            for idx in ranked
        ]
    except Exception as e:
        print(f"Error processing customer {customer_id}: {str(e)}")
        return None

# Generate lookalikes for first 20 customers (C0001-C0020)
# Number of lookalikes requested per customer.
TOP_N = 3

# Generate lookalikes for the first 20 customers (C0001-C0020).
target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)]
results = []

for cust_id in target_customers:
    lookalikes = get_top_lookalikes(cust_id, n=TOP_N)
    if not lookalikes:
        # Lookup failed (get_top_lookalikes prints the reason) or nothing matched.
        continue

    # Build the row from whatever was returned instead of indexing [0..2],
    # so fewer than TOP_N lookalikes no longer raises IndexError.
    row = {'CustomerID': cust_id}
    for rank, match in enumerate(lookalikes, start=1):
        row[f'Similar_Customer_{rank}'] = match['similar_customer']
        row[f'Score_{rank}'] = match['similarity_score']
    results.append(row)


output_df = pd.DataFrame(results)
output_df.to_csv('Lookalike.csv', index=False)


In [6]:


# Show a sample of the recommendations, then the size of the inputs and outputs.
print("\nFirst few rows of lookalike recommendations:", output_df.head(), sep="\n")

print(f"\nFeature matrix shape: {features.shape}")
print(f"\nNumber of customers processed: {len(results)}")


First few rows of lookalike recommendations:
  CustomerID Similar_Customer_1  Score_1 Similar_Customer_2  Score_2  \
0      C0001              C0192   0.8948              C0120   0.8907   
1      C0002              C0106   0.9303              C0159   0.9264   
2      C0003              C0091   0.7725              C0031   0.7589   
3      C0004              C0113   0.9203              C0165   0.8623   
4      C0005              C0007   0.9310              C0140   0.8262   

  Similar_Customer_3  Score_3  
0              C0112   0.7914  
1              C0134   0.8904  
2              C0195   0.7331  
3              C0104   0.8265  
4              C0186   0.7292  

Feature matrix shape: (200, 15)

Number of customers processed: 20
