In [1]:
# Packages
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, top_k_accuracy_score, precision_score, make_scorer

In [2]:
# I am using the lateset sklearn version
import sklearn
sklearn.__version__

'0.24.1'

In [3]:
# Print more rows and columns of pandas.DataFrame
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [4]:
# Change path if needed
path = r'C:\Users\user\Desktop\KUL - Mstat\Big Data Platforms and Technologies\project'
data = pd.read_csv(path + r'\ctrain.csv')
data_test = pd.read_csv(path + r'\ctest.csv')

In [5]:
def handle_age(value):
    # A simple program to discretize age
    if pd.isna(value):
        return 'unknown'
    else:
        if value <= 20:
            return '<=20'
        elif value <= 40:
            return '<=40'
        elif value <= 60:
            return '<=60'
        else:
            return '>60'
        
def transform(x_dataset):
        x_dataset['driver_age'] = x_dataset['driver_age'].apply(lambda x: handle_age(x))
        x_dataset['policy_holder_age'] = x_dataset['policy_holder_age'].apply(lambda x: handle_age(x))
        x_dataset['repair_age'] = x_dataset['repair_age'].apply(lambda x: handle_age(x))
        x_dataset['third_party_1_age'] = x_dataset['third_party_1_age'].apply(lambda x: handle_age(x))
        x_dataset['third_party_2_age'] = x_dataset['third_party_2_age'].apply(lambda x: handle_age(x))
        x_dataset['third_party_3_age'] = x_dataset['third_party_3_age'].apply(lambda x: handle_age(x))
        return x_dataset.drop(['third_party_1_id_known', 'third_party_2_id_known', 'third_party_3_id_known'], axis = 1)

In [6]:
# Some transformation
data = transform(data)
data_test = transform(data_test)

In [7]:
# Create dataset
X, y = data.drop(['claim_id', 'fraud'], axis = 1), data['fraud'].apply(lambda x: 1 if x == 'Y' else 0)
cv = 5

### **<font color='blue'>Random forest model</font>**


In [23]:
# Stratified cross-validation for imbalanced dataset
skf = StratifiedKFold(n_splits = cv)
score = []

for train_index, test_index in skf.split(X, y):
    w, Xc = X[['claim_amount']], X.drop(['claim_amount'], axis = 1)
    # train-test split
    w_train, Xc_train, y_train = w.iloc[train_index], Xc.iloc[train_index], y[train_index]
    w_test, Xc_test, y_test = w.iloc[test_index], Xc.iloc[test_index], y[test_index]
    cont_features = Xc_train.columns[Xc_train.dtypes != np.dtype('O')]
    cat_features = Xc_train.columns[Xc_train.dtypes == np.dtype('O')]
    # pipeline
    cont_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())])
    cat_transformer = OneHotEncoder(handle_unknown =  'ignore')
    preprocessor = ColumnTransformer(transformers = [('cont', cont_transformer, cont_features),
        ('cat', cat_transformer, cat_features)])
    pipe = Pipeline([('preprocessor', preprocessor), ('upsampling', SMOTE(random_state = 99)),
                    ('classifier', RandomForestClassifier())])
    pipe_weight = Pipeline([('upsampling', SMOTE(random_state = 99))])
    # fit
    sample_weights = np.array(pipe_weight.fit_resample(w_train, y_train)[0])
    pipe.fit(Xc_train, y_train, **{'classifier__sample_weight': sample_weights.ravel()})
    pipe_prob_calibration = CalibratedClassifierCV(pipe, cv = 'prefit')
    pipe_prob_calibration.fit(Xc_train, y_train, sample_weight = w_train.to_numpy().ravel())
    y_pred = pipe_prob_calibration.predict(Xc_test)
    score.append(precision_score(y_test, y_pred, sample_weight = w_test.to_numpy().ravel()))

ValueError: could not convert string to float: 'unknown'

In [22]:
score

array([1974.31, 1967.3 , 1938.82, ...,  662.69,  564.  ,  193.  ])

In [None]:
X_test = data_test.drop(['claim_id'], axis = 1)
w_f, X_train_f = X[['claim_amount']], X.drop(['claim_amount'], axis = 1)
cont_features = X_train_f.columns[X_train_f.dtypes != np.dtype('O')]
cat_features = X_train_f.columns[X_train_f.dtypes == np.dtype('O')]
# pipeline
cont_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())])
cat_transformer = OneHotEncoder(handle_unknown =  'ignore')
preprocessor = ColumnTransformer(transformers = [('cont', cont_transformer, cont_features),
        ('cat', cat_transformer, cat_features)])
pipe = Pipeline([('preprocessor', preprocessor), ('upsampling', SMOTE(random_state = 99)),
                    ('classifier', RandomForestClassifier())])
pipe_weight = Pipeline([('upsampling', SMOTE(random_state = 99))])
# fit
sample_weights = np.array(pipe_weight.fit_resample(w_f, y)[0])
pipe.fit(X_train_f, y, **{'classifier__sample_weight': sample_weights.ravel()})

In [None]:
# Make prediction
y_pred = pipe.predict_proba(X_test)

In [None]:
# Dataframe which contains the results
pred = pd.DataFrame()
pred['ID'] = data_test['claim_id']
pred['PROB'] = y_pred[:, 1]

In [None]:
# Export data
pred.to_csv(path + r'\results.csv', index = False)