In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Reading Data

In [2]:
cleaned = pd.read_csv('datasets/cleaned_Izends_Data_Thru201712_ver5.csv',index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
cleaned[cleaned['EstimateTotal'] < 0]

Unnamed: 0,DisplayName,DivisionName,City,Zip,State,LossYearMo,StartedFlag,CommOrRes,NoteCount,PhotoCount,...,Estimate_MaterialSaleTax,Estimate_OverHead,Estimate_Profit,Estimate_PctOverhead,Estimate_PctProfit,Estimate_Deductible,Estimate_BaseSvcCharge,CleanAddressFranchisorID,CleanAddressContactID,LossMo


In [4]:
cleaned.head()

Unnamed: 0,DisplayName,DivisionName,City,Zip,State,LossYearMo,StartedFlag,CommOrRes,NoteCount,PhotoCount,...,Estimate_MaterialSaleTax,Estimate_OverHead,Estimate_Profit,Estimate_PctOverhead,Estimate_PctProfit,Estimate_Deductible,Estimate_BaseSvcCharge,CleanAddressFranchisorID,CleanAddressContactID,LossMo
2,Consulting,Inventory,dallas,75243,TX,201712,0,Residential,1.0,0,...,0,0,0,0,0,0,0,7,66150,12
12,Contents,Lyons Textile Restoration,charlotte,28227,NC,201405,0,Residential,20.0,99,...,0,0,0,0,0,0,0,7,2058,5
20,Contents,Lyons Textile Restoration,statesville,28677,NC,201410,0,Residential,7.0,6,...,0,0,0,0,0,0,0,7,3658,10
21,Contents,Lyons Textile Restoration,woodruff,29388,SC,201410,1,Residential,18.0,81,...,0,0,0,0,0,0,0,7,3704,10
25,Contents,Lyons Textile Restoration,hillsborough,27278,NC,201412,1,Residential,25.0,109,...,0,0,0,0,0,0,0,7,4139,12


# MODEL 1(XGBoost)

## Using the following features:
    
1) CommOrRes  
2) JobCount  
3) ClaimCount  
4) NoteCount  
5) PhotoCount  
6) PolicyHolderType  

#### Making Dummies

In [5]:
cleaned_XY = cleaned[['CommOrRes','PolicyHolderType','NoteCount','PhotoCount','JobCount','ClaimCount','EstimateTotal','LossYearMo']]

In [6]:
cleaned_XY.nunique(), cleaned_XY.dtypes

(CommOrRes                3
 PolicyHolderType         2
 NoteCount              378
 PhotoCount            1313
 JobCount                21
 ClaimCount               8
 EstimateTotal       506482
 LossYearMo             177
 dtype: int64, CommOrRes            object
 PolicyHolderType     object
 NoteCount           float64
 PhotoCount            int64
 JobCount              int64
 ClaimCount            int64
 EstimateTotal       float64
 LossYearMo            int64
 dtype: object)

In [7]:
cleaned_XY_dummies = pd.get_dummies(cleaned_XY,sparse=True)

#### Separating 2017 Test Data

In [9]:
test2017=cleaned_XY_dummies[cleaned_XY_dummies["LossYearMo"].apply(lambda x:x>=201701)]

In [10]:
learning=cleaned_XY_dummies[cleaned_XY_dummies["LossYearMo"].apply(lambda x:x<201701)]

In [11]:
learning.columns

Index(['NoteCount', 'PhotoCount', 'JobCount', 'ClaimCount', 'EstimateTotal',
       'LossYearMo', 'CommOrRes_Commercial', 'CommOrRes_Other',
       'CommOrRes_Residential', 'PolicyHolderType_Company',
       'PolicyHolderType_Individual'],
      dtype='object')

In [12]:
X = learning[learning.columns.difference(['LossYearMo','EstimateTotal'])]

In [13]:
X.head()

Unnamed: 0,ClaimCount,CommOrRes_Commercial,CommOrRes_Other,CommOrRes_Residential,JobCount,NoteCount,PhotoCount,PolicyHolderType_Company,PolicyHolderType_Individual
12,1,0,0,1,1,20.0,99,0,1
20,1,0,0,1,1,7.0,6,0,1
21,1,0,0,1,1,18.0,81,0,1
25,1,0,0,1,1,25.0,109,0,1
26,1,0,0,1,1,16.0,49,0,1


In [14]:
y_bins = learning['EstimateTotal']

In [15]:
y_bins.head()

12    5149.060
20     211.520
21    5010.550
25   11523.380
26     895.130
Name: EstimateTotal, dtype: float64

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y_bins, test_size=0.33, random_state=42)

In [17]:
test2017_X = test2017[test2017.columns.difference(['LossYearMo','EstimateTotal'])]

In [19]:
test2017_y = test2017['EstimateTotal']

#### Modelling

In [20]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [21]:
alg = XGBRegressor( colsample_bytree=0.8, gamma=0, learning_rate=0.1,
    max_depth=5, min_child_weight=1, n_estimators=5000, objective="reg:linear",
    reg_alpha=0, reg_lambda=0,
    seed=7, silent=True, subsample=0.8,n_jobs=-1)

In [22]:
eval_set = [(X_val,y_val)]

In [23]:
alg.fit(X_train, y_train,eval_metric='rmse', early_stopping_rounds=50, eval_set = eval_set, verbose = True)

[0]	validation_0-rmse:8902.4
Will train until validation_0-rmse hasn't improved in 50 rounds.
[1]	validation_0-rmse:8578.52
[2]	validation_0-rmse:8305.14
[3]	validation_0-rmse:8076.28
[4]	validation_0-rmse:7884.21
[5]	validation_0-rmse:7725.07
[6]	validation_0-rmse:7603.35
[7]	validation_0-rmse:7493.74
[8]	validation_0-rmse:7403.05
[9]	validation_0-rmse:7327.86
[10]	validation_0-rmse:7265.56
[11]	validation_0-rmse:7212.59
[12]	validation_0-rmse:7169.46
[13]	validation_0-rmse:7137.8
[14]	validation_0-rmse:7108.83
[15]	validation_0-rmse:7085.25
[16]	validation_0-rmse:7065.07
[17]	validation_0-rmse:7050.31
[18]	validation_0-rmse:7036.97
[19]	validation_0-rmse:7026.17
[20]	validation_0-rmse:7017.69
[21]	validation_0-rmse:7010.03
[22]	validation_0-rmse:7005.5
[23]	validation_0-rmse:7000.54
[24]	validation_0-rmse:6996.58
[25]	validation_0-rmse:6992.35
[26]	validation_0-rmse:6988.27
[27]	validation_0-rmse:6984.62
[28]	validation_0-rmse:6982.74
[29]	validation_0-rmse:6980.19
[30]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=5000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=7, silent=True,
       subsample=0.8)

In [24]:
test2017_preds = alg.predict(test2017_X)

In [25]:
test2017_preds

array([6032.6577, 6985.013 , 6609.43  , ..., 5435.741 , 3782.5723,
       3654.7236], dtype=float32)

In [29]:
from sklearn.metrics import mean_squared_error
test2017_mse = mean_squared_error(test2017_y, test2017_preds)

In [30]:
test2017_rmse = test2017_mse ** (0.5)

In [33]:
pct_error = test2017_rmse / test2017_y.mean()
pct_error * 100

125.84689446485409

# MODEL 2 (Keras): with 1lakh data

## Using the following features:
    
1) CommOrRes  
2) JobCount  
3) ClaimCount  
4) NoteCount  
5) PhotoCount  
6) PolicyHolderType  
7) Display Name  
8) Division Name  
9) Month

In [55]:
cleaned_XY_mod2 = cleaned[['DisplayName', 'DivisionName','LossMo','CommOrRes','PolicyHolderType','NoteCount','PhotoCount','JobCount','ClaimCount','EstimateTotal','LossYearMo']]

In [58]:
cleaned_XY_mod2_sample = cleaned_XY_mod2.sample(100000)

In [59]:
cleaned_XY_mod2_sample.nunique(), cleaned_XY_mod2_sample.dtypes

(DisplayName            17
 DivisionName          991
 LossMo                 12
 CommOrRes               3
 PolicyHolderType        2
 NoteCount             248
 PhotoCount            704
 JobCount                9
 ClaimCount              3
 EstimateTotal       86955
 LossYearMo            141
 dtype: int64, DisplayName          object
 DivisionName         object
 LossMo                int64
 CommOrRes            object
 PolicyHolderType     object
 NoteCount           float64
 PhotoCount            int64
 JobCount              int64
 ClaimCount            int64
 EstimateTotal       float64
 LossYearMo            int64
 dtype: object)

In [60]:
cleaned_XY_dummies_mod2_sample = pd.get_dummies(cleaned_XY_mod2_sample,sparse=True)

#### Separating 2017 Test Data

In [61]:
test2017_mod2=cleaned_XY_dummies_mod2_sample[cleaned_XY_dummies_mod2_sample["LossYearMo"].apply(lambda x:x>=201701)]

In [62]:
learning_mod2=cleaned_XY_dummies_mod2_sample[cleaned_XY_dummies_mod2_sample["LossYearMo"].apply(lambda x:x<201701)]

In [63]:
learning_mod2.columns

Index(['LossMo', 'NoteCount', 'PhotoCount', 'JobCount', 'ClaimCount',
       'EstimateTotal', 'LossYearMo', 'DisplayName_BioHazard',
       'DisplayName_Carpet Cleaning', 'DisplayName_Consulting',
       ...
       'DivisionName_xDemo', 'DivisionName_xDuct', 'DivisionName_xFire',
       'DivisionName_xMold Remediation', 'DivisionName_xWater Mitigation',
       'CommOrRes_Commercial', 'CommOrRes_Other', 'CommOrRes_Residential',
       'PolicyHolderType_Company', 'PolicyHolderType_Individual'],
      dtype='object', length=1020)

In [64]:
X_mod2 = learning_mod2[learning_mod2.columns.difference(['LossYearMo','EstimateTotal'])]

In [65]:
X_mod2.head()

Unnamed: 0,ClaimCount,CommOrRes_Commercial,CommOrRes_Other,CommOrRes_Residential,DisplayName_BioHazard,DisplayName_Carpet Cleaning,DisplayName_Consulting,DisplayName_Contents,DisplayName_Default,DisplayName_Fire Damage,...,DivisionName_xDuct,DivisionName_xFire,DivisionName_xMold Remediation,DivisionName_xWater Mitigation,JobCount,LossMo,NoteCount,PhotoCount,PolicyHolderType_Company,PolicyHolderType_Individual
395941,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,11,12.0,1,1,0
504582,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,7,6.0,1,0,1
1472532,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,8,15.0,157,1,0
400546,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,12,11.0,1,0,1
205886,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,12,55.0,10,0,1


In [66]:
y_mod2 = learning_mod2['EstimateTotal']

In [67]:
y_mod2.head()

395941    6000.000
504582    8552.490
1472532   2812.580
400546    1004.012
205886    3505.550
Name: EstimateTotal, dtype: float64

In [68]:
from sklearn.model_selection import train_test_split
X_train_mod2, X_val_mod2, y_train_mod2, y_val_mod2 = train_test_split(X_mod2, y_mod2, test_size=0.33, random_state=42)

In [69]:
test2017_X_mod2 = test2017_mod2[test2017_mod2.columns.difference(['LossYearMo','EstimateTotal'])]

In [70]:
test2017_y_mod2 = test2017_mod2['EstimateTotal']

#### Modelling

In [71]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 

from sklearn import metrics
from sklearn.metrics import roc_auc_score

In [72]:
model = Sequential()
model.add(Dense(512, input_shape=(X_train_mod2.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.25))
# if(best_params['num_layers'] == 'two_hidden'):
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Dense(1))
model.add(Activation('linear'))
model.compile(loss='mse', metrics=['mse'],
              optimizer='rmsprop')

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
history = History()

In [73]:
model.fit(X_train_mod2, y_train_mod2,
          batch_size=128,
          epochs=500,
          callbacks=[early_stop, history],
          verbose=2,
          validation_data=(X_val_mod2,y_val_mod2))

Train on 43645 samples, validate on 21497 samples
Epoch 1/500
 - 3s - loss: 56167184.1386 - mean_squared_error: 56167184.1386 - val_loss: 51104214.3415 - val_mean_squared_error: 51104214.3415
Epoch 2/500
 - 3s - loss: 49928807.2850 - mean_squared_error: 49928807.2850 - val_loss: 48434070.1488 - val_mean_squared_error: 48434070.1488
Epoch 3/500
 - 3s - loss: 47877746.0581 - mean_squared_error: 47877746.0581 - val_loss: 46903122.2273 - val_mean_squared_error: 46903122.2273
Epoch 4/500
 - 3s - loss: 47103133.1203 - mean_squared_error: 47103133.1203 - val_loss: 46523960.2331 - val_mean_squared_error: 46523960.2331
Epoch 5/500
 - 3s - loss: 46739512.7495 - mean_squared_error: 46739512.7495 - val_loss: 46169448.6420 - val_mean_squared_error: 46169448.6420
Epoch 6/500
 - 3s - loss: 46336959.2431 - mean_squared_error: 46336959.2431 - val_loss: 47163550.2175 - val_mean_squared_error: 47163550.2175
Epoch 7/500
 - 3s - loss: 46091191.5507 - mean_squared_error: 46091191.5507 - val_loss: 45786589.5

 - 3s - loss: 43193333.8516 - mean_squared_error: 43193333.8516 - val_loss: 43788320.7283 - val_mean_squared_error: 43788320.7283
Epoch 59/500
 - 3s - loss: 43165076.6558 - mean_squared_error: 43165076.6558 - val_loss: 43935864.8446 - val_mean_squared_error: 43935864.8446
Epoch 60/500
 - 3s - loss: 43222488.5373 - mean_squared_error: 43222488.5373 - val_loss: 43818424.3119 - val_mean_squared_error: 43818424.3119
Epoch 61/500
 - 3s - loss: 43179002.9903 - mean_squared_error: 43179002.9903 - val_loss: 43951816.8357 - val_mean_squared_error: 43951816.8357
Epoch 62/500
 - 3s - loss: 43250037.1814 - mean_squared_error: 43250037.1814 - val_loss: 43832587.6387 - val_mean_squared_error: 43832587.6387
Epoch 63/500
 - 3s - loss: 43122316.8542 - mean_squared_error: 43122316.8542 - val_loss: 43917646.9924 - val_mean_squared_error: 43917646.9924
Epoch 64/500
 - 3s - loss: 43097395.6158 - mean_squared_error: 43097395.6158 - val_loss: 44199740.4576 - val_mean_squared_error: 44199740.4576
Epoch 65/500

Epoch 116/500
 - 3s - loss: 42194633.5187 - mean_squared_error: 42194633.5187 - val_loss: 43800942.4782 - val_mean_squared_error: 43800942.4782
Epoch 117/500
 - 3s - loss: 42193181.6078 - mean_squared_error: 42193181.6078 - val_loss: 43943403.7839 - val_mean_squared_error: 43943403.7839
Epoch 118/500
 - 3s - loss: 42242774.8298 - mean_squared_error: 42242774.8298 - val_loss: 43810215.3563 - val_mean_squared_error: 43810215.3563
Epoch 119/500
 - 3s - loss: 42097015.3235 - mean_squared_error: 42097015.3235 - val_loss: 43993018.4566 - val_mean_squared_error: 43993018.4566
Epoch 120/500
 - 3s - loss: 42163947.0892 - mean_squared_error: 42163947.0892 - val_loss: 44423786.7220 - val_mean_squared_error: 44423786.7220
Epoch 121/500
 - 3s - loss: 42310217.0892 - mean_squared_error: 42310217.0892 - val_loss: 43888193.5348 - val_mean_squared_error: 43888193.5348
Epoch 122/500
 - 3s - loss: 42345330.1547 - mean_squared_error: 42345330.1547 - val_loss: 43691675.4927 - val_mean_squared_error: 436916

<keras.callbacks.History at 0x7f43b1a28a20>

In [76]:
loss,mse = model.evaluate(test2017_X_mod2,test2017_y_mod2)



In [79]:
rmse= mse ** 0.5

In [84]:
pct_error_mod2 = rmse/test2017_y_mod2.mean()
pct_error_mod2 * 100

119.24389558470088