In [0]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_curve, auc
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve, GridSearchCV




In [0]:
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [0]:
from google.colab import files
uploaded = files.upload()

Saving test_QaJU1Mh.csv to test_QaJU1Mh.csv
Saving train_jRxnrHD.csv to train_jRxnrHD.csv


In [0]:
import io
test = pd.read_csv(io.BytesIO(uploaded['test_QaJU1Mh.csv']))

train = pd.read_csv(io.BytesIO(uploaded['train_jRxnrHD.csv']))

In [0]:
# Filling Missing Values


In [0]:
test['application_underwriting_score'].fillna(test['application_underwriting_score'].mean(), inplace=True)
test['Count_3-6_months_late'].fillna(0, inplace=True)
test['Count_6-12_months_late'].fillna(0, inplace=True)
test['Count_more_than_12_months_late'].fillna(0, inplace=True)
#test.isnull().sum()

train['application_underwriting_score'].fillna(train['application_underwriting_score'].mean(), inplace=True)
train['Count_3-6_months_late'].fillna(0, inplace=True)
train['Count_6-12_months_late'].fillna(0, inplace=True)
train['Count_more_than_12_months_late'].fillna(0, inplace=True)
#train.isnull().sum()

In [0]:
# Outlier Removal

In [0]:
train.loc[train['age_in_days']>34000,'age_in_days']=np.mean(train['age_in_days'])
test.loc[test['age_in_days']>34000,'age_in_days']=np.mean(test['age_in_days'])
train['no_of_premiums_paid']=np.sqrt(train['no_of_premiums_paid'])
test['no_of_premiums_paid']=np.log(test['no_of_premiums_paid'])
train['premium']=np.power(train['premium'],1/2)
train['Income']=np.log(train['Income'])
test['Income']=np.log(test['Income'])
train['application_underwriting_score']=np.sqrt(train['application_underwriting_score'])
test['application_underwriting_score']=np.sqrt(test['application_underwriting_score'])
                          

In [0]:
# Linear Regression

In [0]:
train_set=train.copy()

train_set=train_set.drop('target',axis=1)
train_set=train_set.drop('id',axis=1)
train_set=train_set.drop('Count_3-6_months_late',axis=1)
train_set=train_set.drop('Count_6-12_months_late',axis=1)
train_set=train_set.drop('Count_more_than_12_months_late',axis=1)


train_set=pd.get_dummies(train_set)


In [0]:
x1=train_set.drop('premium',axis=1)
y1=train_set['premium']


In [0]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test= train_test_split(x1,y1, random_state=42, stratify=y1)

In [0]:
from sklearn.linear_model import LinearRegression
lreg=LinearRegression()


lreg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
pred=lreg.predict(x_test)

In [0]:
lreg.score(x_test,y_test)

0.48322054163235295

In [0]:
lreg.score(x_train,y_train)

0.48499256881273883

In [0]:
rmse_test=np.sqrt(np.mean(np.power((np.array(y_test)-np.array(pred)),2)))
rmse_train=np.sqrt(np.mean(np.power((np.array(y_train)-np.array(lreg.predict(x_train))),2)))

In [0]:
print(rmse_test)
print(rmse_train)

28.70763703936227
28.65980641298938


In [0]:
# Prediction on actual Test Data

In [0]:
test_run=test.copy()
test_run=test_run.drop('id', axis=1)
test_run=test_run.drop('Count_3-6_months_late',axis=1)
test_run=test_run.drop('Count_6-12_months_late',axis=1)
test_run=test_run.drop('Count_more_than_12_months_late',axis=1)

In [0]:
test_run=pd.get_dummies(test_run)
prediction=lreg.predict(test_run)
test['premium']=prediction
test['premium']=np.power(test['premium'],1/3)

In [0]:
# XGBoost Model

In [0]:
dtrain=train.copy()
#train1=train1.drop('id',axis=1)
#train1=train1.drop('sourcing_channel',axis=1)
#train1=train1.drop('perc_premium_paid_by_cash_credit',axis=1)

dtrain=pd.get_dummies(dtrain)


In [0]:
target = dtrain['target']
IDcol = dtrain['id']

In [0]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
 if useTrainCV:
  xgb_param = alg.get_xgb_params()
  xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
  cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
  metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
  alg.set_params(n_estimators=cvresult.shape[1])
  
  #Fit the algorithm on the data
  alg.fit(dtrain[predictors], dtrain['target'],eval_metric='auc')
  
  #Predict training set:
  dtrain_predictions = alg.predict(dtrain[predictors])
  dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
  #Print model report:
  print ("\nModel Report")
  print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions))
  print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['target'], dtrain_predprob))
    
         
  featimp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
  featimp.plot(kind='bar', title='Feature Importances')
  plt.ylabel('Feature Importance Score')
  
 
        
    

In [0]:
train2=dtrain.copy()
train2=train2.drop('target', axis=1)
train2=train2.drop('id', axis=1)
train2=pd.get_dummies(train2)

In [0]:
 #Choose all predictors except target & IDcols
  predictors = train2.copy()
  xgb1 = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, 
  scale_pos_weight=1, seed=27)
  modelfit(xgb1, train2, predictors)
  


ValueError: ignored