In [1]:
# Primary Libraries
import ast
import re
from fuzzywuzzy import process
from time import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import math
import dataframe_image as dfi

# http://shakedzy.xyz/dython/modules/nominal/
from dython.nominal import associations
import scipy.stats as ss

# Libraries for ML Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA

# Libraries for Binary Classification
import category_encoders as ce
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_score, LearningCurveDisplay
from sklearn.metrics import classification_report, make_scorer, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, RocCurveDisplay, precision_recall_curve
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost.sklearn import XGBClassifier

# Libraries for Recommender System
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Libraries for Customer Segmentation
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, silhouette_visualizer
from kmodes.kprototypes import KPrototypes
from kneed import KneeLocator

import joblib
import warnings
# NOTE(review): this silences every warning for the whole notebook, including library deprecation
# warnings — consider narrowing the filter to the specific categories that are known to be noise
warnings.filterwarnings('ignore')

# Display options: show up to 100 columns and use a 300-character wide console for wide data frames
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 300)
# Plot theme applied to every seaborn/matplotlib figure created afterwards
sns.set_theme(style="whitegrid", palette="pastel")

  from pandas import MultiIndex, Int64Index


**Project Structure:**
<ul>
<li>04.01: Introduction</li>
<li>04.02: Preprocessing</li>

<li>04.03: User-based Collaborative Filtering | KNN-based Model
<ul>
<li>04.03A: Preprocessing for KNN-based Model</li>
<li>04.03B: Creating KNN-based Model</li>
<li>04.03C: Testing the KNN-based Model</li>
</ul>
</li>

<li>04.04: User-based Collaborative Filtering | Pearson Correlation-based Model
<ul>
<li>04.04A: Preprocessing for Pearson Correlation-based Model</li>
<li>04.04B: Creating Pearson Correlation-based Model</li>
<li>04.04C: Testing the Pearson Correlation-based Model</li>
</ul>
</li>
    
</ul>

## 04: Recommender System

#### 04.01: Introduction
The goal of the recommender system is to create a recommendation engine for new customers with similar characteristics. 

The models that will be built are based on **user-based collaborative filtering**. These models provide insurance product recommendations for a target customer by assessing their similarity to other customers who have prior experience with the company. Despite the absence of explicit rating data, historical purchase data of customers can be leveraged to generate ratings, essentially generating ratings from implicit data.

**Dataset Description:**
1. 'Customer ID': Customer unique ID
2. 'City Code': The code of the City
3. 'Accomodation Ownership': Accommodation ownership type (the column name keeps the dataset's original spelling)
4. 'Married_Insured': Column combined from 'Married' and 'Insurance Type' columns
5. 'Plan Code': Unique product code per insurance product
6. 'Policy Duration': Duration of active policy
7. 'Policy Type': Type of insurance policy (e.g., a customer may buy an insurance policy only, or an insurance policy plus a health rider)
8. 'Policy Category': Grouping category for the policy type (e.g., policy category A covers policies of type Health and Life ('Jiwa')).
9. 'isLoyal' : Customer who has spent more than a year purchasing our insurance products
10. 'prodScore' : Products purchased by customers and their corresponding scores assigned by customers 

**The models to be built are as follows:**
1. K-Nearest Neighbors (KNN) based on Cosine Distance
2. Pearson Correlation based model

<hr>

**Notes for 'Married_Insured' column:**
1. Variable 1 = ('Married': 'No') & ('Insurance Type': 'Joint'), assumption: Single customers who purchased insurance for themselves and another family member or friend.
2. Variable 2 = ('Married': 'No') & ('Insurance Type': 'Personal'), assumption: Single customers who purchased insurance for themselves.
3. Variable 3 = ('Married': 'Yes') & ('Insurance Type': 'Joint'), assumption: Married customers who purchased insurance for themselves and their partners.

**Notes for 'prodScore' column:**
Each product starts with a score of 0, then deducts 1.5 for the first response (the most recent response) of '0,' and deducts 1 for subsequent responses. It adds 1.5 for the first response (the most recent response) of '1' and adds 1 for subsequent responses.

#### 04.02: Preprocessing

In [2]:
# Establishing constants
# Seed passed to np.random.seed before each random customer draw, so the sampled test customers are reproducible
RANDOM_STATE = 2023

In [3]:
# Reading the cleaned dataset; 'raw_df' stays untouched and 'df' is the working frame without the stale CSV index column
raw_df = pd.read_csv('data-cleaned.csv')
df = raw_df.drop(columns=['Unnamed: 0'])

# Grouping columns by target dtype: anything that is neither string-like nor float is treated as an integer
list_str = ['Customer ID', 'City Code', 'Accomodation Ownership', 'Married_Insured', 'Plan Code', 'Policy Duration', 'Policy Type', 'Policy Category', 'isLoyal', 'prodScore']
list_float = ['Premium Amount', 'premium_lifetime', 'premium_LTYes', 'premium_LTNo']
list_int = [col for col in df.columns if (col not in list_str) and (col not in list_float)]

# Casting every column in a single pass through one column -> dtype mapping
dtypeByColumn = {
    col: 'str' if col in list_str else ('float' if col in list_float else 'int32')
    for col in df.columns
}
df = df.astype(dtypeByColumn)

# 'prodScore' is stored as the text of a dict; literal_eval parses it back into a real dict without executing code
df['prodScore'] = df['prodScore'].apply(ast.literal_eval)
df.head()

Unnamed: 0,Customer ID,City Code,Accomodation Ownership,Age,Married_Insured,Plan Code,Policy Duration,Policy Type,Policy Category,Premium Amount,Response,customer_lifetime,isLoyal,premium_lifetime,purchase_frequency,prodScore,premium_LTYes,premium_LTNo,numProdTried,impression
0,81716,C2,Rented,75,2,X2,4.0,3,21,44689.0,0,0,0,92908.0,2,"{'X2': -1.5, 'X1': -1.5}",0.0,92908.0,2,-3
1,80492,C13,Owned,75,2,X1,6.0,3,18,15141.0,1,0,0,15141.0,1,{'X1': 1.5},15141.0,0.0,1,1
2,75895,C14,Owned,75,2,X2,1.0,3,8,33206.0,1,52,1,76905.0,2,{'X2': 0.5},33206.0,43699.0,1,0
3,85258,C4,Owned,75,2,X1,8.0,2,20,31657.0,0,50,1,98867.0,3,"{'X1': -1.5, 'X2': 0.5}",30585.0,68282.0,2,-1
4,80292,C3,Owned,75,2,X1,2.0,2,16,37167.0,0,0,0,37167.0,1,{'X1': -1.5},0.0,37167.0,1,-1


In [4]:
# Checking the resulting dtypes and non-null counts of the working frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15188 entries, 0 to 15187
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             15188 non-null  object 
 1   City Code               15188 non-null  object 
 2   Accomodation Ownership  15188 non-null  object 
 3   Age                     15188 non-null  int32  
 4   Married_Insured         15188 non-null  object 
 5   Plan Code               15188 non-null  object 
 6   Policy Duration         15188 non-null  object 
 7   Policy Type             15188 non-null  object 
 8   Policy Category         15188 non-null  object 
 9   Premium Amount          15188 non-null  float64
 10  Response                15188 non-null  int32  
 11  customer_lifetime       15188 non-null  int32  
 12  isLoyal                 15188 non-null  object 
 13  premium_lifetime        15188 non-null  float64
 14  purchase_frequency      15188 non-null

#### 04.03: User-based Collaborative Filtering | KNN-based Model

> 04.03A: Preprocessing for KNN-based Model

In [5]:
# Flagging customers who have tried exactly one insurance product and keeping their row labels
triedSingleProduct = df['numProdTried'].eq(1)
custIdx_triedOnce = df.index[triedSingleProduct.to_numpy()]

# Keeping only customers who have tried 'more than' one product,
# as these customers can provide recommendations to other similar customers who have only purchased 1 product in the past
rsdf1 = df.loc[~triedSingleProduct]

In [6]:
# Previewing the customers kept after removing those with numProdTried == 1
rsdf1.head()

Unnamed: 0,Customer ID,City Code,Accomodation Ownership,Age,Married_Insured,Plan Code,Policy Duration,Policy Type,Policy Category,Premium Amount,Response,customer_lifetime,isLoyal,premium_lifetime,purchase_frequency,prodScore,premium_LTYes,premium_LTNo,numProdTried,impression
0,81716,C2,Rented,75,2,X2,4.0,3,21,44689.0,0,0,0,92908.0,2,"{'X2': -1.5, 'X1': -1.5}",0.0,92908.0,2,-3
3,85258,C4,Owned,75,2,X1,8.0,2,20,31657.0,0,50,1,98867.0,3,"{'X1': -1.5, 'X2': 0.5}",30585.0,68282.0,2,-1
7,84172,C4,Owned,75,2,X1,3.0,1,21,16071.0,0,5,1,32886.0,2,"{'X1': -1.5, 'X4': 1.5}",16815.0,16071.0,2,0
10,75746,C4,Owned,75,2,X1,4.0,3,14,16319.0,1,0,0,50529.0,2,"{'X1': 1.5, 'X5': 1.5}",50529.0,0.0,2,3
12,76482,C1,Owned,75,2,X2,4.0,2,17,29452.0,0,45,1,131899.0,4,"{'X2': -3.5, 'X1': 1.5}",34307.0,97592.0,2,-2


In [7]:
# Building the empty M x N rating matrix: one row per remaining customer (M) and one column per unique,
# alphabetically sorted 'Plan Code' (N); every cell starts at 0 and is filled from 'prodScore' afterwards
custIdxAbove1 = rsdf1['Customer ID'].to_numpy()
sortedPlanCode = np.sort(rsdf1['Plan Code'].unique())
matrixZeros = np.zeros((len(custIdxAbove1), sortedPlanCode.size))
custScoring = pd.DataFrame(data=matrixZeros, index=custIdxAbove1, columns=sortedPlanCode)

In [8]:
# Mapping the 'prodScore', insurance products customers have tried and also the ratings to the 'custScoring' data frame
# Each customer's score dictionary is fetched once (first row per 'Customer ID', as before) instead of
# re-filtering the whole 'rsdf1' frame on every iteration
firstProdScore = rsdf1.drop_duplicates(subset='Customer ID').set_index('Customer ID')['prodScore']
for cust, dictUserProd in firstProdScore.items():
    for prod, rating in dictUserProd.items():
        custScoring.loc[cust, prod] = rating

# Dropping duplicate data
# NOTE(review): drop_duplicates compares the rating columns only (the index is ignored), so a customer whose
# rating vector equals an earlier customer's is removed as well — confirm this is the intended de-duplication
custScoring.drop_duplicates(inplace=True)

# Keeping only customers without any negative rating, as there is no point in recommending insurance products that other customers dislike
# Every product column is checked at once instead of hard-coding 'X1'..'X9', so the filter follows whatever plan codes the matrix holds
custScoring = custScoring[~(custScoring < 0).any(axis=1)]
# custScoring.head()  # This matrix of customer product scoring has not been standardized

# Replacing the zeros in the matrix with 'NaN' values to create a standardized version of 'custScoring' at a later time
# NOTE(review): a product whose score nets out to exactly 0 becomes indistinguishable from a product never tried
custScoringNan = custScoring.mask(custScoring == 0)

# Standardizing the customers' ratings using their mean to mitigate variations in rating styles, such as customers who are 'tough critics' and those who are more generous in giving product ratings
custScoringNanStd = custScoringNan.subtract(custScoringNan.mean(axis=1), axis=0)

# Filling the 'Nan' values back with zeros
custScoringStd = custScoringNanStd.fillna(0)

# custScoringNan.head()  # Checking
# custScoringNanStd.head()  # Checking

In [9]:
# Flattening 'custScoring' into long format: one (customer, product, rating) record per non-zero cell,
# visited row by row and then column by column
scoreValues = custScoring.to_numpy()
rowPos, colPos = np.nonzero(scoreValues)
kncust = custScoring.index[rowPos].tolist()
knprod = custScoring.columns[colPos].tolist()
knrate = scoreValues[rowPos, colPos].tolist()

# dtype=object keeps the ratings numeric; a plain np.array over mixed str/float lists would turn every rating into a string
mat_flatten = np.array([kncust, knprod, knrate], dtype=object).T

# Building the frame column by column so that 'rating' stays float and the default mean aggregation of pivot_table
# operates on numbers rather than on strings
flattenCustScoring = pd.DataFrame({'Customer ID': kncust, 'Plan Code': knprod, 'rating': knrate})

# Creating a 'Plan Code' encoding mapper
dictMapper = {
    'X1': 0,
    'X2': 1,
    'X3': 2,
    'X4': 3,
    'X5': 4,
    'X6': 5,
    'X7': 6,
    'X8': 7,
    'X9': 8,
}

# Creating a reverse version of the 'Plan Code' encoding mapper
dictMapperRev = dict(zip(dictMapper.values(), dictMapper.keys()))

# Encoding the 'Plan Code'
flattenCustScoring['Plan Code'] = flattenCustScoring['Plan Code'].map(dictMapper)
# flattenCustScoring  # Checking

# Pivoting with encoded plan codes as rows and customers as columns: each row is one product's rating vector,
# so nearest neighbours computed on this matrix are products that were rated alike by the same customers
CustScoringPivoted = flattenCustScoring.pivot_table(index='Plan Code', columns='Customer ID', values='rating').fillna(0)
# CustScoringPivoted  # checking

# Storing the matrix in compressed sparse row format: only the non-zero ratings are kept in memory
# (the shape is unchanged; it is not necessary for a dataset of this size)
minCustScoring = csr_matrix(CustScoringPivoted.values)
# print(minCustScoring)  # Checking

> 04.03B: Creating KNN-based Model 

In [10]:
def recSysKNN(tesProd: str, numNeighbors: int = 3):
    """
    Print the insurance products whose rating vectors are closest (cosine distance) to a target product.

    tesProd = 'Plan Code' of the target product, a key of 'dictMapper' --> string
    numNeighbors = Number of neighbours requested from the model; the target product itself is normally one of
                   them and is not listed --> integer (default 3)

    Prints a header line followed by one numbered line per neighbouring product; returns None.
    Raises KeyError when 'tesProd' is not in 'dictMapper' or has no row in 'CustScoringPivoted'.
    """
    nn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=numNeighbors)  # cosine (metric distance)
    nn.fit(minCustScoring)

    targetCode = dictMapper[tesProd]
    targetVector = CustScoringPivoted.loc[targetCode, :].values.reshape(1, -1)
    distances, positions = nn.kneighbors(targetVector, n_neighbors=numNeighbors)

    print(f"Hi! If you've just bought {dictMapperRev[targetCode]} You might want to try these products: ")

    counter = 1
    for dist, pos in zip(distances[0], positions[0]):
        # kneighbors returns row positions of the fitted matrix; translate them through the pivot's index
        # so the right product is named even when some plan code has no row in the pivot
        neighborCode = CustScoringPivoted.index[pos]

        # Skip the target itself by identity instead of testing the floating-point distance for exact zero
        if neighborCode == targetCode:
            continue

        print(f"{counter}.: Insurance Product {dictMapperRev[neighborCode]} with distance {round(dist,3)} to the target product ({dictMapperRev[targetCode]})")
        counter += 1

> 04.03C: Testing the KNN-based Model 

In [11]:
# Generating a randomly selected customer
# The seed is reset right before sampling so the same customer is drawn on every run
# (presumably Series.sample falls back to NumPy's global random state when no random_state is passed — confirm)
np.random.seed(RANDOM_STATE)
SampleKNN = pd.Series(np.array(custScoring.index)).sample().values[0]
# SampleKNN  # checking

# Helper that looks up the 'Plan Code' stored on a customer's row of 'df'
def lastProdBought(customer):
    """
    Return the 'Plan Code' of the first row of 'df' whose 'Customer ID' equals `customer`.

    Raises IndexError when no row matches.
    NOTE(review): this is the customer's latest product only if 'df' holds the most recent purchase per customer — confirm.
    """
    isTargetCustomer = df['Customer ID'] == customer
    planCodes = df.loc[isTargetCustomer, 'Plan Code'].values
    return planCodes[0]

In [12]:
# Tester
# Both calls pass a 'Plan Code' to recSysKNN; the first one derives it from a customer's row via lastProdBought
# NOTE(review): '72266' is hard-coded; presumably it is the customer drawn into SampleKNN — confirm, or pass SampleKNN directly
recSysKNN(lastProdBought('72266'))  # Testing using 'Customer ID' as the argument
print()
recSysKNN('X2')  # Testing using 'Plan Code' as the argument

Hi! If you've just bought X1 You might want to try these products: 
1.: Insurance Product X4 with distance 0.55 to the target product (X1)
2.: Insurance Product X2 with distance 0.619 to the target product (X1)

Hi! If you've just bought X2 You might want to try these products: 
1.: Insurance Product X1 with distance 0.619 to the target product (X2)
2.: Insurance Product X4 with distance 0.684 to the target product (X2)


#### 04.04: User-based Collaborative Filtering | Pearson Correlation-based Model

> 04.04A: Preprocessing for Pearson Correlation-based Model

In [13]:
# Creating association among customers using 'Pearson Correlation'
# Transposing puts customers in the columns, so corr() yields a customer x customer correlation matrix
USimilarPearson = custScoringNanStd.T.corr()

# Removing customers who have no correlation with others:
# counting the defined (non-NaN) correlations per customer once and reusing the result
validCorrCounts = USimilarPearson.notna().sum()
custCorrIdx = validCorrCounts.index
custCorrVal = validCorrCounts.values
custCorrIdxVal = list(zip(custCorrIdx, custCorrVal))

custCorrIdx_noCorr = [cust for cust, nValid in custCorrIdxVal if nValid == 0]
noCorrLookup = set(custCorrIdx_noCorr)  # set membership avoids rescanning the list for every customer
custCorrIdx_hasCorr = [cust for cust in custCorrIdx if cust not in noCorrLookup]

# Dropping them from both axes so the matrix stays square
USimilarPearson = USimilarPearson.drop(index=custCorrIdx_noCorr, columns=custCorrIdx_noCorr)

> 04.04B: Creating Pearson Correlation-based Model 

In [14]:
def recSysPearson(randomCustTest: str, numSimilar:int, similarityThreshold:float, numRecProdGlobal:int=2):
    '''
    Recommend insurance products to one customer from the ratings of Pearson-similar customers.

    randomCustTest = Target Customer Index --> string
    numSimilar = Maximum number of similar customers to use --> integer
    similarityThreshold = The threshold a correlation must exceed for another customer to count as similar --> float [0-1]
    numRecProdGlobal = Number of popular products returned by the fallback --> integer (default 2)

    Returns a DataFrame with columns 'recProducts', 'ScoresStandardized' and 'PredictedRatings' when the similar
    customers rated at least one product the target customer has not tried. Otherwise returns a list with the most
    frequently rated products the target customer has not tried. The recommended products are printed in both cases.
    '''
    # Products the target customer already has a rating for
    prodHaveTried = custScoringNanStd[custScoringNanStd.index==randomCustTest].dropna(axis=1, how='all').columns.values
    triedLookup = set(prodHaveTried)

    def _recommendPopular():
        # Fallback: rank products by how many customers in 'custScoringNan' rated them (ties keep column order)
        # and return the top ones the target customer has not tried
        prodFrequency = sorted(custScoringNan.notna().sum().items(), key=lambda x: x[1], reverse=True)
        recProdGlobal_targetCust = [prod for prod, _ in prodFrequency if prod not in triedLookup][:numRecProdGlobal]
        print(f"Hi {randomCustTest}! You might want to try these products: {recProdGlobal_targetCust}")
        return recProdGlobal_targetCust

    # RECOMMENDING FROM POPULAR PRODUCTS
    # A customer that is absent from the correlation matrix has no neighbours to learn from
    if randomCustTest not in USimilarPearson.index:
        return _recommendPopular()

    # Correlation of every other customer with the target: keep those above the threshold, strongest first
    simToTarget = USimilarPearson[randomCustTest].drop(index=randomCustTest)
    simToTarget = simToTarget[simToTarget > similarityThreshold].dropna().sort_values(ascending=False)[:numSimilar]

    # RECOMMENDING FROM POPULAR PRODUCTS
    # If there are no similar enough customers
    if simToTarget.empty:
        return _recommendPopular()

    # Products that similar customers have tried
    df_simProdPearson = custScoringNanStd[custScoringNanStd.index.isin(simToTarget.index)]

    # Dropping the insurance products target customer has tried,
    # then the products none of the similar customers rated, then the customers left without any rating
    df_recProdPearson = df_simProdPearson.drop(columns=prodHaveTried).dropna(axis=1, how='all').dropna(axis=0, how='all')

    # RECOMMENDING FROM POPULAR PRODUCTS
    # If the target customer has already tried all of the products rated by similar customers
    if df_recProdPearson.shape[1] == 0:
        return _recommendPopular()

    # RECOMMENDING FROM SIMILAR CUSTOMERS
    # Each mean-centred rating is weighted by its customer's correlation with the target and averaged per product
    # (missing ratings are skipped); adding the target's own mean rating gives the predicted rating
    weights = simToTarget.reindex(df_recProdPearson.index)
    meanWeighted = df_recProdPearson.mul(weights, axis=0).mean(axis=0)
    targetMean = custScoringNan.loc[randomCustTest].mean()

    df_fromPearson = pd.DataFrame({
        'recProducts': meanWeighted.index,
        'ScoresStandardized': meanWeighted.round(3).values,
        'PredictedRatings': (meanWeighted + targetMean).round(3).values,
    })
    print(f"Hi {randomCustTest}! You might want to try these products: {list(df_fromPearson['recProducts'])}")
    return df_fromPearson

> 04.04C: Testing the Pearson Correlation-based Model 

In [15]:
# A randomly selected customer who is to receive a recommendation
# Drawn from the customers still present in 'USimilarPearson'; the row label of the sampled row is the customer id
np.random.seed(RANDOM_STATE) # Setting up pseudo randomization
randomCustTest = USimilarPearson.sample().index.values[0]

# randomCustTest  # output: '87759', '83683', '87818', '84798', '81519', '77874'

In [16]:
# The value returned by the call is the cell output, displayed below the printed greeting
recSysPearson(randomCustTest, 3, 0.5)  # randomCustTest, similar to 3 other customers, 0.5 threshold for Pearson correlation

Hi 87759! You might want to try these products: ['X1']


Unnamed: 0,recProducts,ScoresStandardized,PredictedRatings
0,X1,0.0,1.0
