In [70]:
# Packages
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_score, recall_score

from lightgbm import LGBMClassifier

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

In [71]:
# Print more rows and columns of pandas.DataFrame
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

### **<font color='green'>Version of some libraries</font>**

In [72]:
# I am using the lateset sklearn version
import sklearn
sklearn.__version__

'0.24.1'

In [73]:
import imblearn
imblearn.__version__

'0.8.0'

In [74]:
import xgboost
xgboost.__version__

'1.4.1'

In [75]:
# Change path if needed
path = r'C:\Users\user\Desktop\KUL - Mstat\Big Data Platforms and Technologies\project'
data = pd.read_csv(path + r'\ctrain.csv')
data_test = pd.read_csv(path + r'\ctest.csv')

### **<font color='blue'>Some transformations</font>**
- <font color='blue'>Some transformations are not made at the stage of data cleaning for the sake of exploratory data analysis. Hence, they are done in this section.</font>
- <font color='blue'>They are mainly age-related variables.</font>

In [76]:
# Weight of evidence & Information value
def get_information_value(data, features):
    # cross tab
    tab = pd.crosstab(data[features], data['fraud'])
    # weight of evidence
    tab['all'] = tab[['Y', 'N']].sum(axis = 1) 
    tab['share'] = tab['all'] / tab['all'].sum(axis = 0)
    tab['Y_rate'] = tab['Y'] / tab['all']
    tab['N_dist'] = tab['N'] / tab['N'].sum()
    tab['Y_dist'] = tab['Y'] / tab['Y'].sum()
    tab['WoE'] = np.log(tab['N_dist'] / tab['Y_dist'])
    tab = tab.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    # information value
    tab['IV'] = tab['WoE'] * (tab['N_dist'] - tab['Y_dist'])
    return tab[np.abs(tab['IV']) > 0.01].index.values # threshold 0.01

In [77]:
# apply get_information_value
claim_postal_code_list = get_information_value(data, 'claim_postal_code')
policy_holder_postal_code_list = get_information_value(data, 'policy_holder_postal_code')
driver_postal_code_list = get_information_value(data, 'driver_postal_code')
third_party_1_postal_code_list = get_information_value(data, 'third_party_1_postal_code')
third_party_2_postal_code_list = get_information_value(data, 'third_party_2_postal_code')
repair_postal_code_list = get_information_value(data, 'repair_postal_code')
claim_vehicle_brand_list = get_information_value(data, 'claim_vehicle_brand')
policy_coverage_type_list = get_information_value(data, 'policy_coverage_type')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [78]:
def handle_age(value):
    # A simple program to discretize age
    if pd.isna(value):
        return 'unknown'
    else:
        if value <= 20:
            return '<=20'
#        elif value <= 30:
#            return '<=30'
        elif value <= 40:
            return '<=40'
#        elif value <= 50:
#            return '<=50'
        elif value <= 60:
            return '<=60'
 #       elif value <= 70:
 #           return '<=70'
        elif value <= 80:
            return '<=80'
        else:
            return '>80'

def handle_policy_coverage(value):
    # A simple program to discretize policy_coverage_1000
    if pd.isna(value):
        return 'unknown'
    else:
        if value <= 20:
            return '<=20'
        elif value <= 40:
            return '<=40'
        elif value <= 60:
            return '<=60'
        elif value <= 80:
            return '<=80'
        else:
            return '>80'

def handle_categorical_grouping(value, grouping_list):
        if value == 'unknown':
            return value
        elif value in grouping_list:
            return str(value)
        else:
            return 'other'
        
def transform(x_dataset):
        x_dataset['driver_age'] = x_dataset['driver_age'].apply(lambda x: handle_age(x))
        x_dataset['policy_holder_age'] = x_dataset['policy_holder_age'].apply(lambda x: handle_age(x))
        x_dataset['repair_age'] = x_dataset['repair_age'].apply(lambda x: handle_age(x))
        x_dataset['third_party_1_age'] = x_dataset['third_party_1_age'].apply(lambda x: handle_age(x))
        x_dataset['third_party_2_age'] = x_dataset['third_party_2_age'].apply(lambda x: handle_age(x))
        x_dataset['third_party_3_age'] = x_dataset['third_party_3_age'].apply(lambda x: handle_age(x))
        
        x_dataset['policy_coverage_1000'] = x_dataset['policy_coverage_1000'].apply(lambda x: handle_policy_coverage(x))
        
        x_dataset['claim_postal_code'] = x_dataset['claim_postal_code'].apply(lambda x: handle_categorical_grouping(x, claim_postal_code_list))
        x_dataset['policy_holder_postal_code'] = x_dataset['policy_holder_postal_code'].apply(lambda x: handle_categorical_grouping(x, policy_holder_postal_code_list))
        x_dataset['driver_postal_code'] = x_dataset['driver_postal_code'].apply(lambda x: handle_categorical_grouping(x, driver_postal_code_list))
        x_dataset['third_party_1_postal_code'] = x_dataset['third_party_1_postal_code'].apply(lambda x: handle_categorical_grouping(x, third_party_1_postal_code_list))
        x_dataset['third_party_2_postal_code'] = x_dataset['third_party_2_postal_code'].apply(lambda x: handle_categorical_grouping(x, third_party_2_postal_code_list))
        x_dataset['third_party_3_postal_code'] = x_dataset['third_party_3_postal_code'].apply(lambda x: x if x == 'unknown' else 'other')
        x_dataset['repair_postal_code'] = x_dataset['repair_postal_code'].apply(lambda x: handle_categorical_grouping(x, repair_postal_code_list))
        # x_dataset['claim_vehicle_brand'] = x_dataset['claim_vehicle_brand'].apply(lambda x: handle_categorical_grouping(x, claim_vehicle_brand_list))
        x_dataset['policy_coverage_type'] = x_dataset['policy_coverage_type'].apply(lambda x: handle_categorical_grouping(x, policy_coverage_type_list))        
        
        return x_dataset.drop(['third_party_1_id_known', 'third_party_2_id_known', 'third_party_3_id_known'], axis = 1)

In [79]:
# Some transformation
data = transform(data)
data_test = transform(data_test)

# Sanity check: fraud, claim_amount only
set(data.columns) - set(data_test.columns)

{'claim_amount', 'fraud'}

In [80]:
# Create dataset
X, y = pd.get_dummies(data.drop(['claim_id', 'fraud'], axis = 1), drop_first = True), data['fraud'].apply(lambda x: 1 if x == 'Y' else 0)
X_test = pd.get_dummies(data_test.drop(['claim_id'], axis = 1))
cv = 5

### **<font color='blue'>Random forest model with cross validation</font>**
- <font color='blue'>Based on the results of cross validation, it is found that kNN = 7 for SMOTE, 500 trees and 90 number of features are the best hyperparameters in terms of weighted precision.</font>

In [117]:
# Stratified cross-validation for imbalanced dataset
skf = StratifiedKFold(n_splits = cv)

def gridcv(X, y, k_neighbors, max_features, n_estimators):
    precision_weighted_score = []
    recall_weighted_score = []
    total_sum = np.array([])
    predicted_prob = np.array([])
    true_lbl = np.array([])
    for train_index, test_index in skf.split(X, y):
        w, Xc = X[['claim_amount']], X.drop(['claim_amount'], axis = 1)
        
        # train-test split
        w_train, Xc_train, y_train = w.iloc[train_index], Xc.iloc[train_index], y[train_index]
        w_test, Xc_test, y_test = w.iloc[test_index], Xc.iloc[test_index], y[test_index]
        
        # pipeline
        pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median')), 
                         ('upsampling', SMOTE(random_state = 99, k_neighbors = k_neighbors)),
                         ('classifier',  RandomForestClassifier(random_state = 99, max_features = max_features, 
                                                                n_estimators = n_estimators, 
                                                                n_jobs = -1))])
        pipe.fit(Xc_train, y_train)
        calibrated_pipe = CalibratedClassifierCV(base_estimator = pipe, cv = 3, n_jobs = -1)
        calibrated_pipe.fit(Xc_train, y_train)
        y_pred = calibrated_pipe.predict_proba(Xc_test)[:, 1]
        y_predt = calibrated_pipe.predict(Xc_test)
        
        precision_weighted_score.append(precision_score(y_test, y_predt, sample_weight = w_test.to_numpy().ravel()))
        recall_weighted_score.append(recall_score(y_test, y_predt, sample_weight = w_test.to_numpy().ravel()))
        total_sum = np.concatenate((w_test.to_numpy().ravel(), total_sum))
        predicted_prob = np.concatenate((y_pred, predicted_prob))
        true_lbl = np.concatenate((y_test, true_lbl))
        
    max_ind = np.argsort(-predicted_prob)[:100]
    return {'k_neighbors': k_neighbors, 'max_features': max_features, 'n_estimators': n_estimators, 
            'cv_precision_score': np.mean(precision_weighted_score), 'cv_recall_score': np.mean(recall_weighted_score),
           'sum_top100': np.dot(true_lbl[max_ind], total_sum[max_ind])}
        
# hyperparameter
smote_knn_param = [3, 5, 7, 9]
rf_param_nfeatures = [10, 30, 50, 70]
i = 1
ttl = len(smote_knn_param) * len(rf_param_nfeatures)

for k in smote_knn_param:
    for m in rf_param_nfeatures:
        results = gridcv(X, y, k, m, 200)
        print(results)
        print(i/ttl)
        i += 1

{'k_neighbors': 3, 'max_features': 10, 'n_estimators': 200, 'cv_precision_score': 0.8372879472890933, 'cv_recall_score': 0.3072769942449564, 'sum_top100': 695579.64}
0.0625
{'k_neighbors': 3, 'max_features': 30, 'n_estimators': 200, 'cv_precision_score': 0.8413509369209983, 'cv_recall_score': 0.3302931544874211, 'sum_top100': 705090.85}
0.125
{'k_neighbors': 3, 'max_features': 50, 'n_estimators': 200, 'cv_precision_score': 0.818573724296256, 'cv_recall_score': 0.3031683008388451, 'sum_top100': 678342.48}
0.1875
{'k_neighbors': 3, 'max_features': 70, 'n_estimators': 200, 'cv_precision_score': 0.8177762450494497, 'cv_recall_score': 0.304540936321018, 'sum_top100': 658331.3300000001}
0.25
{'k_neighbors': 5, 'max_features': 10, 'n_estimators': 200, 'cv_precision_score': 0.8172075563503786, 'cv_recall_score': 0.29325611315768085, 'sum_top100': 650263.64}
0.3125
{'k_neighbors': 5, 'max_features': 30, 'n_estimators': 200, 'cv_precision_score': 0.8215184876117986, 'cv_recall_score': 0.32416342

### **<font color='blue'>Gradient boosting model with cross validation</font>**

In [103]:
# Stratified cross-validation for imbalanced dataset
skf = StratifiedKFold(n_splits = cv)

def gridcv_gb(X, y, k, nl, lr):
    precision_weighted_score = []
    recall_weighted_score = []
    total_sum = np.array([])
    predicted_prob = np.array([])
    true_lbl = np.array([])
    for train_index, test_index in skf.split(X, y):
        w, Xc = X[['claim_amount']], X.drop(['claim_amount'], axis = 1)
        
        # train-test split
        w_train, Xc_train, y_train = w.iloc[train_index], Xc.iloc[train_index], y[train_index]
        w_test, Xc_test, y_test = w.iloc[test_index], Xc.iloc[test_index], y[test_index]
        
        # pipeline
        pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median')), 
                         ('upsampling', SMOTE(random_state = 99, k_neighbors = k)),
                         ('classifier',  LGBMClassifier(random_state = 99, num_leaves = nl, learning_rate = lr, max_depth = 7))])
        pipe.fit(Xc_train, y_train)
        calibrated_pipe = CalibratedClassifierCV(base_estimator = pipe, cv = 3, n_jobs = -1)
        calibrated_pipe.fit(Xc_train, y_train)
        y_pred = calibrated_pipe.predict_proba(Xc_test)[:, 1]
        y_predt = calibrated_pipe.predict(Xc_test)
        
        precision_weighted_score.append(precision_score(y_test, y_predt, sample_weight = w_test.to_numpy().ravel()))
        recall_weighted_score.append(recall_score(y_test, y_predt, sample_weight = w_test.to_numpy().ravel()))
        total_sum = np.concatenate((w_test.to_numpy().ravel(), total_sum))
        predicted_prob = np.concatenate((y_pred, predicted_prob))
        true_lbl = np.concatenate((y_test, true_lbl))
        
    max_ind = np.argsort(-predicted_prob)[:100]
    return {'k_neighbors': k, 'number of leaves': nl, 'learning rate': lr, 
            'cv_precision_score': np.mean(precision_weighted_score), 'cv_recall_score': np.mean(recall_weighted_score),
           'sum_top100': np.dot(true_lbl[max_ind], total_sum[max_ind])}
        
# hyperparameter
num_leaves = [10, 20, 30]
learning_rate = [0.05, 0.1, 0.2]
knn = [3, 5, 7]

i = 1
ttl = len(num_leaves) * len(learning_rate) * len(knn)

for nl in num_leaves:
    for lr in learning_rate:
        for k in knn:
            results = gridcv_gb(X, y, k, nl, lr)
            print(results)
            print(i/ttl)
            i += 1

{'k_neighbors': 3, 'number of leaves': 10, 'learning rate': 0.05, 'cv_precision_score': 0.7576120449139067, 'cv_recall_score': 0.33999376827068, 'sum_top100': 648385.4700000001}
0.037037037037037035
{'k_neighbors': 5, 'number of leaves': 10, 'learning rate': 0.05, 'cv_precision_score': 0.7875654193770003, 'cv_recall_score': 0.3526475787649548, 'sum_top100': 681458.95}
0.07407407407407407
{'k_neighbors': 7, 'number of leaves': 10, 'learning rate': 0.05, 'cv_precision_score': 0.7783172275985322, 'cv_recall_score': 0.32123263751772674, 'sum_top100': 647498.27}
0.1111111111111111
{'k_neighbors': 3, 'number of leaves': 10, 'learning rate': 0.1, 'cv_precision_score': 0.8517468958537935, 'cv_recall_score': 0.33659857782630276, 'sum_top100': 755360.23}
0.14814814814814814
{'k_neighbors': 5, 'number of leaves': 10, 'learning rate': 0.1, 'cv_precision_score': 0.8646423496415865, 'cv_recall_score': 0.3036309145812015, 'sum_top100': 743757.6799999999}
0.18518518518518517
{'k_neighbors': 7, 'number

### **<font color='blue'>Prediction</font>**

#### **<font color='blue'>Random forest</font>**

In [89]:
# Align features
X_train = X.drop(['claim_amount'], axis = 1)
X_test = X_test.reindex(columns = X_train.columns, fill_value = 0)

In [132]:
# Pipeline
pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median', add_indicator = True)), 
                ('upsampling', SMOTE(random_state = 99, k_neighbors = 3)),
                ('classifier',  RandomForestClassifier(random_state = 99, max_features = 30, 
                                                        n_estimators = 500, 
                                                        n_jobs = -1))])
pipe.fit(X_train, y)
calibrated_pipe = CalibratedClassifierCV(base_estimator = pipe, cv = 3, n_jobs = -1)
calibrated_pipe.fit(X_train, y)

Pipeline(steps=[('imputer',
                 SimpleImputer(add_indicator=True, strategy='median')),
                ('upsampling', SMOTE(k_neighbors=3, random_state=99)),
                ('classifier',
                 RandomForestClassifier(max_features=30, n_estimators=500,
                                        n_jobs=-1, random_state=99))])

In [145]:
# Make prediction
y_pred = calibrated_pipe.predict_proba(X_test)

In [146]:
# Dataframe which contains the results
pred = pd.DataFrame()
pred['ID'] = data_test['claim_id']
pred['PROB'] = y_pred[:, 1]

In [148]:
# Export data
pred.to_csv(path + r'\results_randomforest_v2.csv', index = False)

---

#### **<font color='blue'>Light GBM</font>**

In [104]:
# Pipeline
pipe2 = Pipeline([('imputer', SimpleImputer(strategy = 'median', add_indicator = True)), 
                ('upsampling', SMOTE(random_state = 99, k_neighbors = 7)),
                ('classifier',   LGBMClassifier(random_state = 99, num_leaves = 20, learning_rate = 0.1, max_depth = 7))])
pipe2.fit(X_train, y)
calibrated_pipe2 = CalibratedClassifierCV(base_estimator = pipe2, cv = 3, n_jobs = -1)
calibrated_pipe2.fit(X_train, y)

CalibratedClassifierCV(base_estimator=Pipeline(steps=[('imputer',
                                                       SimpleImputer(add_indicator=True,
                                                                     strategy='median')),
                                                      ('upsampling',
                                                       SMOTE(k_neighbors=7,
                                                             random_state=99)),
                                                      ('classifier',
                                                       LGBMClassifier(max_depth=7,
                                                                      num_leaves=20,
                                                                      random_state=99))]),
                       cv=3, n_jobs=-1)

In [110]:
# Make prediction
y_pred = calibrated_pipe2.predict_proba(X_train)

In [106]:
# Dataframe which contains the results
pred = pd.DataFrame()
pred['ID'] = data_test['claim_id']
pred['PROB'] = y_pred[:, 1]

In [115]:
# Export data
pred.to_csv(path + r'\results_lightgbm_v2.csv', index = False)

---

In [114]:
pred[pred['PROB']>0.5]

Unnamed: 0,ID,PROB
840,66309,0.9582
2406,67875,0.652193
2409,67878,0.850049
3858,69327,0.81343
4113,69582,0.897359
4345,69814,0.955568
4519,69988,0.953728
4946,70415,0.856759
5423,70892,0.935269
6904,72374,0.960168
