In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer, KNNImputer #, IterativeImputer
from statsmodels.imputation.mice import MICE
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, auc
%matplotlib inline

In [3]:
# Read the full labelled training set from the local Data directory into memory.
training_data = pd.read_csv(filepath_or_buffer = "./Data/train.csv")

In [4]:
# Dimensions of the raw training frame as (rows, columns).
training_data.shape

(957919, 120)

In [5]:
# Preview the first five rows to inspect the column names and the scale of the values.
training_data.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [14]:
# Shuffle the labelled rows and hold out 10% of them as a local test set.
# The remaining 90% is what the cross-validated model search is trained on.
# random_state is fixed so the same partition is produced on every run.
train_data, test_data = train_test_split(
    training_data,
    test_size = 0.1,
    random_state = 42,
)

In [None]:
# Separate the target from the predictors; `id` is dropped (presumably just a row identifier - confirm).
y_train = train_data.claim.values
X_train = train_data.drop(columns = ['id', 'claim']).values

# Preprocessing: replace missing values in each column with the column median
# (none of the columns appear to be correlated), then standardise every feature.
# SimpleImputer defaults to strategy='mean', so the median has to be requested explicitly.
# TODO: may switch to KNN imputation later, although 118 KNN imputations seems excessive.
# Columns where simple imputation looks inappropriate: f2, f5, f29, f40, f42, f65, f70, f74, f75, f91
c_pipe = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
                           ('scaler', StandardScaler())])

# The classifier is kept outside c_pipe so the fitted preprocessing can be applied on its own.
xgbc = XGBClassifier(objective = 'binary:logistic', eval_metric = 'auc', use_label_encoder = False)

# Only n_estimators is searched for now. Candidate grids for a wider search:
#   'learning_rate': [0.01, 0.05, 0.1, 0.3]
#   'min_split_loss': [0, 0.01, 0.05, 0.1]
#   'max_depth': [5, 6, 7, 8]
#   'subsample': [0.5, 0.7, 0.85, 1]
#   'colsample_bytree': [0.5, 0.7, 0.85, 1]
#   'reg_alpha': [0, 0.1, 0.2, 0.3]
#   'reg_lambda': [1, 0.85, 0.7, 0.5]
param_grid = {'n_estimators': [50, 100, 150]}

# `scoring` takes a scorer name or a callable with signature (estimator, X, y).
# A bare metric such as roc_auc_score has signature (y_true, y_score) and is rejected,
# so the built-in 'roc_auc' scorer is used; it scores the estimator's probabilities.
gscv = GridSearchCV(xgbc, param_grid, cv = 5, verbose = 3, return_train_score = True, scoring = 'roc_auc', n_jobs = 2)

In [None]:
# Hyper-parameter note (presumably the XGBoost settings in play - confirm):
# eta = 0.3, max_depth = 10, subsample = 1, alpha = 0, lambda = 1

# Fit the imputer/scaler on the training features and store the result under a new name.
# Overwriting X_train would make this cell non-idempotent: a second run would re-fit
# c_pipe on already-transformed data and break every later c_pipe.transform call.
# NOTE(review): preprocessing is fitted on all training rows before cross-validation,
# so each validation fold has already influenced the imputation and scaling statistics.
X_train_prepared = c_pipe.fit_transform(X_train)
gscv.fit(X = X_train_prepared, y = y_train)

In [None]:
# Evaluate the refitted best model on the 10% hold-out split.
y_test = test_data.claim.values

# Drop the non-feature columns. 'claim_preds' is added to test_data by a later cell,
# so it is excluded as well (errors='ignore') to keep this cell safe to re-run.
X_test_raw = test_data.drop(columns = ['id', 'claim', 'claim_preds'], errors = 'ignore').values

# Apply the preprocessing that was fitted on the training split (transform only, no re-fit).
X_test = c_pipe.transform(X_test_raw)

# Hard class labels, kept for any downstream use.
y_preds = gscv.predict(X_test)

# ROC AUC is a ranking metric and must be computed from scores, not 0/1 labels:
# use the predicted probability of the positive class (column 1).
y_proba = gscv.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_proba)

In [None]:
# Store the model's positive-class probability for every hold-out row.
# c_pipe only imputes and scales (its last step is a StandardScaler, which has no predict),
# so the features are transformed by c_pipe and then scored by the fitted grid search.
# 'claim_preds' is excluded from the features so that re-running this cell does not
# feed earlier predictions back into the model.
holdout_features = test_data.drop(columns = ['id', 'claim', 'claim_preds'], errors = 'ignore').values
holdout_proba = gscv.predict_proba(c_pipe.transform(holdout_features))[:, 1]

# assign() returns a new frame instead of writing into the one produced by train_test_split.
test_data = test_data.assign(claim_preds = holdout_proba)

In [None]:
# ROC AUC of the stored hold-out predictions against the true claim labels.
roc_auc_score(y_true = test_data['claim'], y_score = test_data['claim_preds'])

In [None]:
# Load the unlabelled test set; from here on `test_data` no longer refers to the hold-out split.
# NOTE(review): the training file is read from "./Data/train.csv" but this path has no
# directory prefix - confirm the file location.
test_data = pd.read_csv("test.csv")

# c_pipe only imputes and scales (it has no predict method), so apply the fitted
# preprocessing first and let the fitted grid search produce the predictions.
submission_features = c_pipe.transform(test_data.drop(columns = ['id']).values)

# NOTE(review): this stores hard 0/1 labels; if the predictions are to be scored by
# ROC AUC, gscv.predict_proba(submission_features)[:, 1] is the appropriate output - confirm.
test_data['claim'] = gscv.predict(submission_features)