# Trial Similarity Matrix Using the March 2024 Dataset

## Preliminaries

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

### Loading the Dataset

In [2]:
# Location of the processed March 2024 product-flag file, relative to the
# notebook's working directory.
file_path = '../Data/Product Flags/cust_info_prod_202403_processed.parquet'
# Read the parquet file into a DataFrame.
march_2024 = pd.read_parquet(file_path)
# Bare last expression: render the frame as the cell's output.
march_2024

Unnamed: 0,CUST_NUM,IB,BUILDUP_F,CASH_MANAGEMENT,CORPORATE_LOANS,RETAIL_LOAN,TRADE_SERVICES,INVESTMENT_FUNDS,SECURITIES,INSURANCE,CORPORATE_FINANCE,FOREX,REMITTANCE,CORPORATE_CARDS,BB,DEPOSITS,RETAIL_PRODUCTS,INVESTMENTS_AND_SECURITIES
0,1357.977909,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,1437.499217,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2,1606.444521,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
3,2522.626409,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,3201.700173,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298163,13943.279528,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
298164,13976.468939,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
298165,14002.523487,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
298166,14037.928765,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0


## Data Exploration

In [10]:
def get_info_and_summary(df):
    """
    Report the structure of a DataFrame and compute its descriptive statistics.

    The structural report is still written to stdout, exactly as a plain
    ``df.info()`` call would do, so the notebook output is unchanged. It is also
    returned as text: ``DataFrame.info()`` itself returns ``None``, so without
    capturing the report the first element of the result would carry nothing.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.

    Returns:
    tuple: ``(df_info, df_summary)`` where ``df_info`` is the text produced by
    ``df.info()`` and ``df_summary`` is the DataFrame returned by ``df.describe()``.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in `io`.
    import io

    # Capture the report instead of letting info() write straight to stdout.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    df_info = info_buffer.getvalue()

    # Echo the captured text so the cell still displays the report.
    print(df_info, end='')

    df_summary = df.describe()

    return df_info, df_summary

In [4]:
# Print the structural report for the March 2024 frame and keep the
# describe() table so it can be shown as the cell output.
march_info, march_summary = get_info_and_summary(march_2024)
march_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298168 entries, 0 to 298167
Data columns (total 18 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CUST_NUM                    298168 non-null  float64
 1   IB                          298168 non-null  float64
 2   BUILDUP_F                   298168 non-null  int64  
 3   CASH_MANAGEMENT             298168 non-null  int64  
 4   CORPORATE_LOANS             298168 non-null  int64  
 5   RETAIL_LOAN                 298168 non-null  int64  
 6   TRADE_SERVICES              298168 non-null  int64  
 7   INVESTMENT_FUNDS            298168 non-null  int64  
 8   SECURITIES                  298168 non-null  int64  
 9   INSURANCE                   298168 non-null  int64  
 10  CORPORATE_FINANCE           298168 non-null  int64  
 11  FOREX                       298168 non-null  int64  
 12  REMITTANCE                  298168 non-null  int64  
 13  CORPORATE_CARD

Unnamed: 0,CUST_NUM,IB,BUILDUP_F,CASH_MANAGEMENT,CORPORATE_LOANS,RETAIL_LOAN,TRADE_SERVICES,INVESTMENT_FUNDS,SECURITIES,INSURANCE,CORPORATE_FINANCE,FOREX,REMITTANCE,CORPORATE_CARDS,BB,DEPOSITS,RETAIL_PRODUCTS,INVESTMENTS_AND_SECURITIES
count,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0,298168.0
mean,9427.115676,1.393221,0.0,0.0,0.001147,0.000446,3.4e-05,0.0,0.0,0.075062,0.0,0.007345,0.0,2.7e-05,0.968213,0.998226,0.169488,0.023644
std,3333.481089,0.820836,0.0,0.0,0.033848,0.021115,0.005791,0.0,0.0,0.263491,0.0,0.085387,0.0,0.00518,0.175434,0.042084,0.375183,0.151939
min,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7070.257492,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
50%,9999.601142,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,12247.079284,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,14142.078419,11.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


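The following product flags are 0 for every customer (their `max` in the summary above is 0), so no customer owns them:
- BUILDUP_F
- CASH_MANAGEMENT
- INVESTMENT_FUNDS
- SECURITIES
- CORPORATE_FINANCE
- REMITTANCE

Note that RETAIL_LOAN does **not** belong to this group: its `max` is 1 and its mean is non-zero, so a small number of customers do hold it.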

In [11]:
def count_ones_zeros(df, id_column='CUST_NUM'):
    """
    Count how many rows hold 0 and how many hold 1 in every binary flag column.

    A column is included only when its non-null values are exactly the set
    {0, 1}. Merely having two distinct values is not enough: a column holding,
    say, 2 and 5 is not a flag and would add spurious count columns to the
    result. Columns with a single distinct value (e.g. all zeros) are left out.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    id_column (str): Name of the identifier column to skip. Default is 'CUST_NUM'.

    Returns:
    pd.DataFrame: One row per binary column, with the observed values (0 and 1)
    as columns and integer counts as cells. Empty if no column qualifies.
    """
    counts = {
        column: df[column].value_counts()
        for column in df.columns
        # Skip the identifier and keep genuine 0/1 flags only.
        if column != id_column and set(df[column].dropna().unique()) == {0, 1}
    }

    # Rows become the flag columns; a value missing for a column counts as 0.
    return pd.DataFrame(counts).T.fillna(0).astype(int)

In [12]:
# Tabulate, per binary product flag, how many customers have 0 and how many have 1.
ones_zeros_count = count_ones_zeros(march_2024)
ones_zeros_count

Unnamed: 0,0,1
CORPORATE_LOANS,297826,342
RETAIL_LOAN,298035,133
TRADE_SERVICES,298158,10
INSURANCE,275787,22381
FOREX,295978,2190
CORPORATE_CARDS,298160,8
BB,9478,288690
DEPOSITS,529,297639
RETAIL_PRODUCTS,247632,50536
INVESTMENTS_AND_SECURITIES,291118,7050


## Sparse Matrix

In [6]:
# Customer identifier followed by the product flags retained for the
# similarity analysis.
columns_to_keep = [
    'CUST_NUM',
    'CORPORATE_LOANS',
    'RETAIL_LOAN',
    'TRADE_SERVICES',
    'INSURANCE',
    'FOREX',
    'CORPORATE_CARDS',
    'BB',
    'DEPOSITS',
    'RETAIL_PRODUCTS',
    'INVESTMENTS_AND_SECURITIES',
]

# Column subset of the March 2024 frame; every row is kept.
march_2024_sparse_matrix = march_2024[columns_to_keep]

march_2024_sparse_matrix

Unnamed: 0,CUST_NUM,CORPORATE_LOANS,RETAIL_LOAN,TRADE_SERVICES,INSURANCE,FOREX,CORPORATE_CARDS,BB,DEPOSITS,RETAIL_PRODUCTS,INVESTMENTS_AND_SECURITIES
0,1357.977909,0,0,0,0,0,0,1,1,0,0
1,1437.499217,0,0,0,0,0,0,1,1,0,0
2,1606.444521,0,0,0,0,0,0,1,1,0,0
3,2522.626409,0,0,0,0,0,0,1,1,0,0
4,3201.700173,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
298163,13943.279528,0,0,0,0,0,0,1,1,0,0
298164,13976.468939,0,0,0,0,0,0,1,1,0,0
298165,14002.523487,0,0,0,0,0,0,1,1,0,0
298166,14037.928765,0,0,0,0,0,0,1,1,0,0


## Similarity Matrix

In [13]:
def calculate_product_similarity(df, sample_size=20000, random_state=42):
    """
    Calculate the cosine similarity between products based on customer interactions.

    Each product column is treated as a vector over the sampled customers. A
    product that none of the sampled customers holds is an all-zero vector and
    gets similarity 0 with every product, itself included; pass
    ``sample_size=None`` to use every row when rare products matter.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing customer interactions,
        with a 'CUST_NUM' column and one column per product.
    sample_size (int or None): The number of rows to sample for similarity
        calculation. Default is 20,000. Values larger than the number of rows
        are capped at the number of rows; None disables sampling.
    random_state (int): Seed for reproducibility. Default is 42.

    Returns:
    pd.DataFrame: A DataFrame containing the cosine similarity between products,
    indexed and labelled by product name.
    """
    if sample_size is None:
        # Use the full frame as-is.
        sampled_df = df
    else:
        # DataFrame.sample raises when asked for more rows than exist
        # (without replacement), so cap the request at the frame length.
        effective_size = min(sample_size, len(df))
        sampled_df = df.sample(n=effective_size, random_state=random_state)

    # The identifier is not a product and must not enter the similarity.
    sampled_df_no_id = sampled_df.drop(columns=['CUST_NUM'])

    # Transpose so that products (columns) become the rows being compared
    sampled_df_transposed = sampled_df_no_id.T

    # Calculate cosine similarity between products
    similarity_matrix_products = cosine_similarity(sampled_df_transposed)

    # Label both axes with the product names
    similarity_df_products = pd.DataFrame(similarity_matrix_products,
                                          index=sampled_df_no_id.columns,
                                          columns=sampled_df_no_id.columns)

    return similarity_df_products

In [8]:
# Build the product-by-product cosine similarity table from the reduced frame,
# using the function's default sample size and seed.
similarity_df_products = calculate_product_similarity(march_2024_sparse_matrix)
similarity_df_products

Unnamed: 0,CORPORATE_LOANS,RETAIL_LOAN,TRADE_SERVICES,INSURANCE,FOREX,CORPORATE_CARDS,BB,DEPOSITS,RETAIL_PRODUCTS,INVESTMENTS_AND_SECURITIES
CORPORATE_LOANS,1.0,0.0,0.0,0.093169,0.0,0.0,0.039351,0.038763,0.078026,0.016412
RETAIL_LOAN,0.0,1.0,0.0,0.027277,0.0,0.0,0.026882,0.02648,0.063962,0.024025
TRADE_SERVICES,0.0,0.0,1.0,0.036084,0.056254,0.0,0.01016,0.010009,0.024175,0.0
INSURANCE,0.093169,0.027277,0.036084,1.0,0.077136,0.0,0.28157,0.276822,0.319283,0.20299
FOREX,0.0,0.0,0.056254,0.077136,1.0,0.0,0.090306,0.088958,0.092478,0.096546
CORPORATE_CARDS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BB,0.039351,0.026882,0.01016,0.28157,0.090306,0.0,1.0,0.983843,0.420272,0.159843
DEPOSITS,0.038763,0.02648,0.010009,0.276822,0.088958,0.0,0.983843,1.0,0.411333,0.157137
RETAIL_PRODUCTS,0.078026,0.063962,0.024175,0.319283,0.092478,0.0,0.420272,0.411333,1.0,0.217442
INVESTMENTS_AND_SECURITIES,0.016412,0.024025,0.0,0.20299,0.096546,0.0,0.159843,0.157137,0.217442,1.0


## Recommendation Strength

In [9]:
def generate_recommendation_strength(df, similarity_df, random_state=42):
    """
    Generate a recommendation strength table for a random client based on product similarity.

    The strength of a product the client does not hold is the average of its
    similarity to each product the client does hold. If the client holds no
    product at all there is nothing to average, and every strength is
    reported as 0.0 rather than NaN.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing customer product usage (with 'CUST_NUM').
    similarity_df (pd.DataFrame): The product similarity matrix, indexed and
        labelled by the product column names of ``df``.
    random_state (int): Seed for reproducibility. Default is 42.

    Returns:
    tuple: ``(client_id, recommendation_df)`` where ``client_id`` is the sampled
    client's 'CUST_NUM' value and ``recommendation_df`` has the columns
    'Product' and 'Recommendation Strength', sorted by strength in descending order.
    """
    # Select a random client
    random_client = df.sample(n=1, random_state=random_state)
    client_id = random_client['CUST_NUM'].values[0]

    # Split the client's flags into products held (1) and not held (0)
    client_products = random_client.drop(columns=['CUST_NUM']).iloc[0]
    current_products = client_products[client_products == 1].index.tolist()
    non_used_products = client_products[client_products == 0].index.tolist()

    if current_products:
        # One row per candidate product, one column per held product;
        # the row mean is the average similarity with the held products.
        strengths = similarity_df.loc[non_used_products, current_products].mean(axis=1)
    else:
        # No held products: the mean of an empty selection would be NaN.
        strengths = pd.Series(0.0, index=non_used_products)

    # Assemble the table and rank by recommendation strength
    recommendation_df = pd.DataFrame({
        'Product': strengths.index,
        'Recommendation Strength': strengths.values,
    })
    recommendation_df = recommendation_df.sort_values(by='Recommendation Strength', ascending=False).reset_index(drop=True)

    return client_id, recommendation_df

# Example usage: score a client drawn with the function's default seed from the
# reduced product frame, using the similarity table computed above.
client_id, recommendation_df = generate_recommendation_strength(march_2024_sparse_matrix, similarity_df_products)

# Display the results
print(f"Recommendations for Client ID: {client_id}")
print(recommendation_df)


Recommendations for Client ID: 4606.847512128006
                      Product  Recommendation Strength
0                   INSURANCE                 0.292558
1  INVESTMENTS_AND_SECURITIES                 0.178141
2                       FOREX                 0.090581
3             CORPORATE_LOANS                 0.052046
4                 RETAIL_LOAN                 0.039108
5              TRADE_SERVICES                 0.014781
6             CORPORATE_CARDS                 0.000000


In [15]:
def generate_recommendation_strength_for_client(df, similarity_df, client_id):
    """
    Generate a recommendation strength table for a specified client based on product similarity.

    The strength of a product the client does not hold is the average of its
    similarity to each product the client does hold. If the client holds no
    product at all there is nothing to average, and every strength is
    reported as 0.0 rather than NaN.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing customer product usage (with 'CUST_NUM').
    similarity_df (pd.DataFrame): The product similarity matrix, indexed and
        labelled by the product column names of ``df``.
    client_id: The 'CUST_NUM' value of the client to generate recommendations
        for. It is matched by exact equality, so it must be the stored value
        itself (for a float column, not a rounded display value).

    Returns:
    pd.DataFrame: A DataFrame with the columns 'Product' and
    'Recommendation Strength', sorted by strength in descending order.

    Raises:
    ValueError: If no row of ``df`` has the given client ID.
    """
    # Get the client's row(s) based on the provided client ID
    client_data = df[df['CUST_NUM'] == client_id]

    # If the client is not found, raise an error
    if client_data.empty:
        raise ValueError(f"Client ID {client_id} not found in the dataset.")

    # If several rows share the ID, the first one is used.
    client_products = client_data.drop(columns=['CUST_NUM']).iloc[0]

    # Split the client's flags into products held (1) and not held (0)
    current_products = client_products[client_products == 1].index.tolist()
    non_used_products = client_products[client_products == 0].index.tolist()

    if current_products:
        # One row per candidate product, one column per held product;
        # the row mean is the average similarity with the held products.
        strengths = similarity_df.loc[non_used_products, current_products].mean(axis=1)
    else:
        # No held products: the mean of an empty selection would be NaN.
        strengths = pd.Series(0.0, index=non_used_products)

    # Assemble the table and rank by recommendation strength
    recommendation_df = pd.DataFrame({
        'Product': strengths.index,
        'Recommendation Strength': strengths.values,
    })
    recommendation_df = recommendation_df.sort_values(by='Recommendation Strength', ascending=False).reset_index(drop=True)

    return recommendation_df

# Example usage:

# Pick a client ID from the full March 2024 frame. The reduced frame passed
# below is only a column subset of it, so it contains a row for the same ID.
random_client = march_2024.sample(n=1, random_state=42)
client_id = random_client['CUST_NUM'].values[0]
recommendation_df = generate_recommendation_strength_for_client(march_2024_sparse_matrix, similarity_df_products, client_id)

# Display the results
print(f"Recommendations for Client ID: {client_id}")
print(recommendation_df)


Recommendations for Client ID: 4606.847512128006
                      Product  Recommendation Strength
0                   INSURANCE                 0.292558
1  INVESTMENTS_AND_SECURITIES                 0.178141
2                       FOREX                 0.090581
3             CORPORATE_LOANS                 0.052046
4                 RETAIL_LOAN                 0.039108
5              TRADE_SERVICES                 0.014781
6             CORPORATE_CARDS                 0.000000
