In [1]:
# Packages
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

In [2]:
# Print more rows and columns of pandas.DataFrame
# Both display limits are raised to the same value.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

### **<font color='green'>Version of some libraries</font>**

In [3]:
# scikit-learn version used for this notebook (the latest release when it was written)
import sklearn
sklearn.__version__

'0.24.1'

In [4]:
# imbalanced-learn version used for this notebook
import imblearn
imblearn.__version__

'0.8.0'

In [5]:
# xgboost version used for this notebook
import xgboost
xgboost.__version__

'1.4.1'

In [80]:
# Change path if needed
# `path` is kept as a plain string because later cells build file names by
# concatenating onto it.
path = r'C:\Users\user\Desktop\KUL - Mstat\Big Data Platforms and Technologies\project'
train_csv, test_csv = path + r'\ctrain.csv', path + r'\ctest.csv'
data = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)

### **<font color='blue'>Some transformations</font>**
- <font color='blue'>Some transformations were deliberately left out of the data-cleaning stage so that the untransformed values stayed available for exploratory data analysis; they are applied in this section instead.</font>
- <font color='blue'>They mainly concern the age-related variables.</font>

In [81]:
# Grouping categorical features based on training data - data
# Only the rows labelled as fraud ('Y') are used to rank the levels of a feature.
fraud_claims = data[data['fraud'] == 'Y']


def top_fraud_levels(column, n_levels):
    """Return the `n_levels` most frequent values of `column` among the fraud rows."""
    return fraud_claims[column].value_counts().head(n_levels).index.values


# Postal code
claim_postal_code_list = top_fraud_levels('claim_postal_code', 20)
policy_holder_postal_code_list = top_fraud_levels('policy_holder_postal_code', 20)
driver_postal_code_list = top_fraud_levels('driver_postal_code', 18)
third_party_1_postal_code_list = top_fraud_levels('third_party_1_postal_code', 16)
third_party_2_postal_code_list = top_fraud_levels('third_party_2_postal_code', 18)
repair_postal_code_list = top_fraud_levels('repair_postal_code', 21)

# Vehicle brand
claim_vehicle_brand_list = top_fraud_levels('claim_vehicle_brand', 22)

# Policy coverage
policy_coverage_type_list = top_fraud_levels('policy_coverage_type', 21)

In [82]:
def handle_age(value):
    """Discretize a numeric age into one of five band labels.

    Missing values (anything `pd.isna` reports as missing) map to 'unknown'.
    Otherwise the label of the first upper bound in 20, 40, 60, 80 that is
    greater than or equal to `value` is returned, and '>80' when none is.
    """
    # A simple program to discretize age
    if pd.isna(value):
        return 'unknown'
    for upper_bound in (20, 40, 60, 80):
        if value <= upper_bound:
            return '<=' + str(upper_bound)
    return '>80'

def handle_policy_coverage(value):
    """Discretize a policy_coverage_1000 value into one of five band labels.

    Missing values map to 'unknown'; values above every listed bound map to '>80'.
    """
    # A simple program to discretize policy_coverage_1000
    if pd.isna(value):
        return 'unknown'
    # (inclusive upper bound, label) pairs, checked from the smallest bound up.
    bands = ((20, '<=20'), (40, '<=40'), (60, '<=60'), (80, '<=80'))
    return next((label for bound, label in bands if value <= bound), '>80')

def handle_categorical_grouping(value, grouping_list):
    """Collapse a categorical level to itself, 'unknown', or 'other'.

    'unknown' is passed through unchanged. A value contained in
    `grouping_list` is returned as a string. Everything else becomes 'other'.
    """
    if value == 'unknown':
        return value
    return str(value) if value in grouping_list else 'other'
        
def transform(x_dataset):
    """Discretize and group the claim features and return a new DataFrame.

    The input frame is left untouched: all assignments are made on a copy, so
    calling this function does not silently rewrite a frame that other code
    still holds a reference to.

    Steps applied to the copy:
    - the six ``*_age`` columns are banded with ``handle_age``;
    - ``policy_coverage_1000`` is banded with ``handle_policy_coverage``;
    - postal-code, vehicle-brand and coverage-type columns are reduced to the
      levels in the corresponding module-level ``*_list`` (others -> 'other');
    - ``third_party_3_postal_code`` keeps only 'unknown' vs 'other';
    - the three ``third_party_*_id_known`` columns are dropped.

    Parameters
    ----------
    x_dataset : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        Transformed copy without the ``third_party_*_id_known`` columns.
    """
    # Copy first: the column assignments below would otherwise modify the
    # caller's frame in place even though a new frame is returned.
    x_dataset = x_dataset.copy()

    age_columns = ['driver_age', 'policy_holder_age', 'repair_age',
                   'third_party_1_age', 'third_party_2_age', 'third_party_3_age']
    for column in age_columns:
        x_dataset[column] = x_dataset[column].apply(handle_age)

    x_dataset['policy_coverage_1000'] = x_dataset['policy_coverage_1000'].apply(handle_policy_coverage)

    # Column -> levels that keep their own category (computed from the training data).
    grouping_lists = {
        'claim_postal_code': claim_postal_code_list,
        'policy_holder_postal_code': policy_holder_postal_code_list,
        'driver_postal_code': driver_postal_code_list,
        'third_party_1_postal_code': third_party_1_postal_code_list,
        'third_party_2_postal_code': third_party_2_postal_code_list,
        'repair_postal_code': repair_postal_code_list,
        'claim_vehicle_brand': claim_vehicle_brand_list,
        'policy_coverage_type': policy_coverage_type_list,
    }
    for column, grouping_list in grouping_lists.items():
        x_dataset[column] = x_dataset[column].apply(handle_categorical_grouping, args = (grouping_list,))

    # No grouping list exists for the third third-party: every known code becomes 'other'.
    x_dataset['third_party_3_postal_code'] = x_dataset['third_party_3_postal_code'].apply(
        lambda x: x if x == 'unknown' else 'other')

    return x_dataset.drop(['third_party_1_id_known', 'third_party_2_id_known', 'third_party_3_id_known'], axis = 1)

In [83]:
# Some transformation
# The training frame is transformed first, then the test frame.
data, data_test = transform(data), transform(data_test)

# Sanity check: fraud, claim_amount only
set(data.columns).difference(data_test.columns)

{'claim_amount', 'fraud'}

In [84]:
# Create dataset
# Features: everything except the identifier and the label.
X = data.drop(['claim_id', 'fraud'], axis = 1)
# Label: 1 where fraud == 'Y', 0 otherwise (64-bit integers).
y = data['fraud'].eq('Y').astype('int64')
cv = 5

### **<font color='blue'>Random forest model with cross validation</font>**
- <font color='blue'>Based on the cross-validation results, the best hyperparameters in terms of weighted precision are k = 7 nearest neighbours for SMOTE, 500 trees, and 90 features considered per split (`max_features`).</font>

In [64]:
# Stratified cross-validation for imbalanced dataset
# `cv` folds; no shuffle or random_state is passed, so the library defaults apply.
# This splitter is shared by the grid-search functions defined below.
skf = StratifiedKFold(n_splits = cv)

def gridcv(X, y, k_neighbors, max_features, n_estimators):
    """Cross-validate one SMOTE + calibrated random-forest configuration.

    Uses the module-level splitter ``skf``. In every fold a fresh pipeline
    (median imputation + scaling for numeric columns, one-hot encoding for
    object columns, SMOTE, calibrated random forest) is fitted on the training
    part and scored on the held-out part.

    Parameters
    ----------
    X : pandas.DataFrame
        Features including ``claim_amount``. That column is removed from the
        model inputs and used only as ``sample_weight`` when scoring.
    y : array-like
        0/1 labels aligned row-by-row with ``X``.
    k_neighbors : int
        Forwarded to ``SMOTE``.
    max_features, n_estimators
        Forwarded to ``RandomForestClassifier``.

    Returns
    -------
    dict
        The three hyperparameters ('k_neighbors', 'max_features', 'ntree'),
        the fold-averaged claim-amount-weighted precision and recall
        ('cv_precision_score', 'cv_recall_score'), and 'sum_top100': the sum of
        the (up to) 100 largest claim amounts among true positives pooled
        over all folds.
    """
    # Loop-invariant work: split off the scoring weights and detect column types once.
    claim_weights = X['claim_amount'].to_numpy()
    Xc = X.drop(['claim_amount'], axis = 1)
    # skf.split yields positions, so the labels are indexed positionally; a plain
    # `y[train_index]` on a Series would be a label lookup instead.
    labels = np.asarray(y)

    is_categorical = Xc.dtypes == np.dtype('O')
    cont_features = Xc.columns[~is_categorical]
    cat_features = Xc.columns[is_categorical]

    precision_weighted_score = []
    recall_weighted_score = []
    # Seeded with an empty float array so the pooled result is float even if no fold
    # contributes anything.
    caught_amounts = [np.array([])]
    for train_index, test_index in skf.split(X, y):
        # train-test split
        Xc_train, y_train = Xc.iloc[train_index], labels[train_index]
        Xc_test, y_test = Xc.iloc[test_index], labels[test_index]
        w_test = claim_weights[test_index]

        # pipeline
        cont_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
            ('scaler', StandardScaler())])
        cat_transformer = OneHotEncoder(handle_unknown = 'ignore')
        preprocessor = ColumnTransformer(transformers = [('cont', cont_transformer, cont_features),
            ('cat', cat_transformer, cat_features)])
        forest = RandomForestClassifier(random_state = 99, max_features = max_features,
                                        n_estimators = n_estimators, n_jobs = -1)
        pipe = Pipeline([('preprocessor', preprocessor),
                         ('upsampling', SMOTE(random_state = 99, k_neighbors = k_neighbors)),
                         ('classifier', CalibratedClassifierCV(base_estimator = forest, cv = 2))])

        pipe.fit(Xc_train, y_train)

        # Evaluate the fitted model once and derive both views from the same
        # probabilities: the 0.5-threshold mask, and the class labels exactly
        # as the calibrated classifier's predict() picks them (the class with
        # the highest probability).
        proba = pipe.predict_proba(Xc_test)
        y_pred = proba[:, 1] > 0.5
        y_predt = pipe.named_steps['classifier'].classes_[np.argmax(proba, axis = 1)]

        precision_weighted_score.append(precision_score(y_test, y_predt, sample_weight = w_test))
        recall_weighted_score.append(recall_score(y_test, y_predt, sample_weight = w_test))
        # Claim amounts of the true positives in this fold.
        caught_amounts.append(w_test[y_pred & y_test.astype(bool)])

    total_sum = np.concatenate(caught_amounts)
    # Descending sort; slicing already copes with fewer than 100 entries.
    top_amounts = -np.sort(-total_sum)[:100]
    return {'k_neighbors': k_neighbors, 'max_features': max_features, 'ntree': n_estimators,
            'cv_precision_score': np.mean(precision_weighted_score), 'cv_recall_score': np.mean(recall_weighted_score),
            'sum_top100': np.sum(top_amounts)}
        
# hyperparameter
smote_knn_param = [3, 5, 7, 9]
rf_param_nfeatures = [10, 30, 50, 70, 90]
rf_param_ntrees = [250, 500]
i = 1
ttl = len(smote_knn_param) * len(rf_param_nfeatures) * len(rf_param_ntrees)

# Every result dict is kept (not only printed) so the grid can be ranked
# afterwards, e.g. with pd.DataFrame(rf_grid_results).
rf_grid_results = []
for k in smote_knn_param:
    for m in rf_param_nfeatures:
        for n in rf_param_ntrees:
            results = gridcv(X, y, k, m, n)
            rf_grid_results.append(results)
            print(results)
            # fraction of the grid completed so far
            print(i/ttl)
            i += 1

{'k_neighbors': 3, 'max_features': 10, 'ntree': 250, 'cv_precision_score': 0.7455737219433592, 'cv_recall_score': 0.2264879385423261, 'sum_top100': 464370.61}
0.025
{'k_neighbors': 3, 'max_features': 10, 'ntree': 500, 'cv_precision_score': 0.7475211806717527, 'cv_recall_score': 0.2356801778874867, 'sum_top100': 487753.94000000006}
0.05
{'k_neighbors': 3, 'max_features': 30, 'ntree': 250, 'cv_precision_score': 0.7153429108777435, 'cv_recall_score': 0.26832584723001046, 'sum_top100': 557906.56}
0.075
{'k_neighbors': 3, 'max_features': 30, 'ntree': 500, 'cv_precision_score': 0.7013834734636387, 'cv_recall_score': 0.2633603686927912, 'sum_top100': 548792.89}
0.1
{'k_neighbors': 3, 'max_features': 50, 'ntree': 250, 'cv_precision_score': 0.6620747566708567, 'cv_recall_score': 0.2932621487179879, 'sum_top100': 611006.22}
0.125
{'k_neighbors': 3, 'max_features': 50, 'ntree': 500, 'cv_precision_score': 0.6659478720222229, 'cv_recall_score': 0.3075232360238219, 'sum_top100': 633126.89}
0.15
{'k_

### **<font color='blue'>Gradient boosting model with cross validation</font>**

In [None]:
def gridcv_xgb(X, y, k_neighbors, max_features, n_estimators):
    """Cross-validate one SMOTE + calibrated XGBoost configuration.

    Uses the module-level splitter ``skf``. In every fold a fresh pipeline
    (median imputation + scaling for numeric columns, one-hot encoding for
    object columns, SMOTE, calibrated ``XGBClassifier``) is fitted on the
    training part and scored on the held-out part. The per-fold scores are
    printed before the summary is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Features including ``claim_amount``. That column is removed from the
        model inputs and used only as ``sample_weight`` when scoring.
    y : array-like
        0/1 labels aligned row-by-row with ``X``.
    k_neighbors : int
        Forwarded to ``SMOTE``.
    max_features
        Only echoed in the result; see the review note in the body.
    n_estimators : int
        Forwarded to ``XGBClassifier`` (number of boosting rounds).

    Returns
    -------
    dict
        'k_neighbors', 'max_features', 'ntree', and the fold-averaged
        claim-amount-weighted 'cv_precision_score' and 'cv_recall_score'.
    """
    # Loop-invariant work: split off the scoring weights and detect column types once.
    claim_weights = X['claim_amount'].to_numpy()
    Xc = X.drop(['claim_amount'], axis = 1)
    # skf.split yields positions, so the labels are indexed positionally; a plain
    # `y[train_index]` on a Series would be a label lookup instead.
    labels = np.asarray(y)

    is_categorical = Xc.dtypes == np.dtype('O')
    cont_features = Xc.columns[~is_categorical]
    cat_features = Xc.columns[is_categorical]

    precision_weighted_score = []
    recall_weighted_score = []
    for train_index, test_index in skf.split(X, y):
        # train-test split
        Xc_train, y_train = Xc.iloc[train_index], labels[train_index]
        Xc_test, y_test = Xc.iloc[test_index], labels[test_index]
        w_test = claim_weights[test_index]

        # pipeline
        cont_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
            ('scaler', StandardScaler())])
        cat_transformer = OneHotEncoder(handle_unknown =  'ignore')
        preprocessor = ColumnTransformer(transformers = [('cont', cont_transformer, cont_features),
            ('cat', cat_transformer, cat_features)])
        # n_estimators is now forwarded: previously the model was always built with
        # its defaults, so the reported 'ntree' did not describe the fitted model.
        # NOTE(review): max_features is still not forwarded; XGBClassifier has no
        # parameter of that name (column subsampling is given as fractions, e.g.
        # colsample_bynode). TODO decide on a mapping or drop it from the grid.
        booster = XGBClassifier(n_estimators = n_estimators)
        pipe = Pipeline([('preprocessor', preprocessor),
                         ('upsampling', SMOTE(random_state = 99, k_neighbors = k_neighbors)),
                         ('classifier', CalibratedClassifierCV(base_estimator = booster, cv = 2))])
        pipe.fit(Xc_train, y_train)
        y_pred = pipe.predict(Xc_test)
        precision_weighted_score.append(precision_score(y_test, y_pred, sample_weight = w_test))
        recall_weighted_score.append(recall_score(y_test, y_pred, sample_weight = w_test))
    print(precision_weighted_score)
    print(recall_weighted_score)
    return {'k_neighbors': k_neighbors, 'max_features': max_features, 'ntree': n_estimators,
            'cv_precision_score': np.mean(precision_weighted_score), 'cv_recall_score': np.mean(recall_weighted_score)}
        
# hyperparameter
smote_knn_param = [3, 5, 9]
rf_param_nfeatures = [10, 30, 50, 70]
rf_param_ntrees = [50, 100, 150, 200]
i = 1
ttl = len(smote_knn_param) * len(rf_param_nfeatures) * len(rf_param_ntrees)

# NOTE(review): gridcv_xgb does not pass max_features on to the model, so the
# rf_param_nfeatures axis repeats the same fit. It is kept here so the printed
# progress and result rows stay comparable with earlier runs.
# Every result dict is kept (not only printed) so the grid can be ranked
# afterwards, e.g. with pd.DataFrame(xgb_grid_results).
xgb_grid_results = []
for k in smote_knn_param:
    for m in rf_param_nfeatures:
        for n in rf_param_ntrees:
            results = gridcv_xgb(X, y, k, m, n)
            xgb_grid_results.append(results)
            print(results)
            # fraction of the grid completed so far
            print(i/ttl)
            i += 1

### **<font color='blue'>Prediction</font>**

#### **<font color='blue'>Random forest</font>**

In [104]:
# Model inputs: the test frame without its identifier, the training frame
# without the claim amount.
X_test = data_test.drop(['claim_id'], axis = 1)
X_train = X.drop(['claim_amount'], axis = 1)
# Identify continuous and categorical features
is_categorical = X_train.dtypes == np.dtype('O')
cont_features = X_train.columns[~is_categorical]
cat_features = X_train.columns[is_categorical]

# Pipeline
cont_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
                                     ('scaler', StandardScaler())])
cat_transformer = OneHotEncoder(handle_unknown =  'ignore')
preprocessor = ColumnTransformer(transformers = [('cont', cont_transformer, cont_features),
                                                 ('cat', cat_transformer, cat_features)])
final_forest = RandomForestClassifier(random_state = 99, max_features = 90,
                                      n_estimators = 500, n_jobs = -1)
calibrated_forest = CalibratedClassifierCV(base_estimator = final_forest, cv = 5)
pipe = Pipeline([('preprocessor', preprocessor),
                 ('upsampling', SMOTE(random_state = 99, k_neighbors = 7)),
                 ('classifier', calibrated_forest)])
# fit
pipe.fit(X_train, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['claim_num_injured', 'claim_num_third_parties', 'claim_num_vehicles',
       'claim_vehicle_cyl', 'claim_vehicle_id_known', 'claim_vehicle_load',
       'claim_vehicle_power', 'days_before_occured', 'da...
       'third_party_2_vehicle_type', 'third_party_3_age',
       'third_party_3_country', 'third_party_3_form', 'third_party_3_injured',
       'third_party_3_postal_code', 'third_party_3_vehicle_type'],
      dtype='object'))])),
                ('upsampling', SMOTE(k_neighbors=7, random_state=99)),

In [105]:
# Make prediction
# Class probabilities from the fitted pipeline; a later cell uses column 1 as PROB.
y_pred = pipe.predict_proba(X_test)

In [106]:
# Dataframe which contains the results
# ID is the claim identifier of the test set, PROB is column 1 of the predicted probabilities.
pred = pd.DataFrame({'ID': data_test['claim_id'], 'PROB': y_pred[:, 1]})

In [108]:
# Export data
# The file is written next to the input data, without the index column.
results_csv = path + r'\results_randomforest.csv'
pred.to_csv(results_csv, index = False)

---

In [107]:
# Rows whose predicted probability exceeds 0.5
pred.loc[pred['PROB'] > 0.5]

Unnamed: 0,ID,PROB
552,66021,0.552708
840,66309,0.99456
1283,66752,0.812249
2409,67878,0.998919
3168,68637,0.535155
3395,68864,0.509014
3404,68873,0.920987
4113,69582,0.999151
4345,69814,0.977088
4346,69815,0.99098


In [98]:
# Re-run the cross-validation for the configuration used in the final model
# (SMOTE k_neighbors = 7, max_features = 90, n_estimators = 500).
results = gridcv(X, y, 7, 90, 500)

In [99]:
# Display the cross-validation summary computed in the previous cell
results

{'k_neighbors': 7,
 'max_features': 90,
 'ntree': 500,
 'cv_precision_score': 0.6470959471828133,
 'cv_recall_score': 0.36234619307638044,
 'sum_top100': 746958.2199999999}