In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
kaggle_json_path = '/content/drive/My Drive/TEMP/kaggle.json'
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/My Drive/TEMP/"

!kaggle competitions download -c allstate-claims-severity
!cp *.zip "/content/drive/My Drive/TEMP/digiledge-allstate-kaggle/"
!unzip "/content/drive/My Drive/TEMP/digiledge-allstate-kaggle/*.zip" -d "/content/drive/My Drive/TEMP/digiledge-allstate-kaggle/"

### **IMPORTS**

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

In [0]:
objects_dir = "/content/drive/My Drive/TEMP/digiledge-allstate-kaggle/objects/"  # to hold objects created on the fly
drive_data_dir = '/content/drive/My Drive/TEMP/digiledge-allstate-kaggle/'

train_data_path = drive_data_dir+'train.csv'
test_data_path = drive_data_dir+'test.csv'
submission_csv_path = drive_data_dir+'sample_submission.csv'

In [0]:

# Label Encode all categorical features
def get_labelEncoded_dataframes(drive_data_dir, object_dir):
  '''
  creates a label encoded dataframe out of the categorical features using sklearns's LabelEncoder
  saves new dataframe in object_dir
  skips creating new dataframe if already exists
  '''
  try:
    train_data = pd.read_csv(objects_dir+'train_label_encoded.csv')
    test_data = pd.read_csv(objects_dir + 'test_label_encoded.csv')
  except:
    print('Label Encoding categorical features . . .')
    train_data = pd.read_csv(drive_data_dir+'train.csv')
    test_data = pd.read_csv(drive_data_dir+'test.csv')
    cat_cols = [x for x in train_data.columns if x.startswith('cat')]

    for col in cat_cols:
        le = LabelEncoder()
        train_data[col] = le.fit_transform(train_data[col])
        # update::
        # Test data had some values in some cateogorical features that were unseen in train data
        # the next 2 lines fix that :|
        test_data[col] = test_data[col].map(lambda s: 'UNK' if s not in le.classes_ else s)
        le.classes_ = np.append(le.classes_, 'UNK')
        test_data[col] = le.transform(test_data[col])
    # save encoded train and test dataFrames to objects dir
    print('Saved Label Encoded features to', objects_dir + '*.csv')
    train_data.to_csv(objects_dir + 'train_label_encoded.csv', index=False)
    test_data.to_csv(objects_dir + 'test_label_encoded.csv', index=False)
  return train_data, test_data



train_data, test_data = get_labelEncoded_dataframes(drive_data_dir, objects_dir)
submission = pd.read_csv(submission_csv_path)


Since we will use train data pretty frequently to select features and build our model, using a handy name.  
"X" & "Y"

In [0]:
X = train_data.iloc[:,1:-1]
Y = train_data.iloc[:,-1]


# get categorical and continuous features names
cat_cols = [x for x in train_data.columns if x.startswith('cat')]
cont_cols = [x for x in train_data.columns if x.startswith('cont')]


# <br>  
  
### We make a benchmark xgboost model to see benchmark scores as we make changes to features.  
### This will validate the features using a 5 fold CV


In [0]:

def benchmark_xgb(X, Y, num_folds=5):
    dtrain = xgb.DMatrix(X, Y)
    params = {'eta': 0.01, 'seed':0, 'subsample': 0.5, 'colsample_bytree': 0.5, 
             'objective': 'reg:squarederror', 'max_depth':6, 'min_child_weight':3} 
    # Grid Search CV optimized settings
    num_rounds = 1000
    res = xgb.cv(params, dtrain, num_rounds, num_folds, metrics='mae' )
    return res


### Lets see MAE scores using only categorical features and only continuous features

In [0]:
"""
USAGE
results = benchmark_xgb(features, Y, num_folds=5)
print("Mean MAE : ", results['test-mae-mean'].mean())
"""

results_categorical_only = benchmark_xgb(X[cat_cols], Y, num_folds=2)
print("Mean MAE with only categorical features: ", results_categorical_only['test-mae-mean'].mean())

results_continuous_only = benchmark_xgb(X[cont_cols], Y, num_folds=2)
print("Mean MAE with only continuous features: ", results_continuous_only['test-mae-mean'].mean())

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean MAE with only categorical features:  1334.980274716999
Mean MAE with only continuous features:  1915.3965111769992


Since the last cell took pretty long to run on colab, I'll document it here:  
Mean MAE with only categorical features:  1334.980274716999  
Mean MAE with only continuous features:  1915.3965111769992  

# <br>  

### We will now try to reduce the dimensions of the features using PCA and see if the mean MAE improves/deteriorates
### Choosing n_components so as to retain 95~99% variance of the features




In [12]:
from sklearn.decomposition import PCA

retain_ratio = 0.99


def get_num_components(singular_values, retain_ratio = retain_ratio):
  '''
  function to choose num_components to retain for PCA
  '''
  k = 0
  retain_ratio = retain_ratio
  for i in range(len(singular_values)):
    if sum(singular_values[:i])/sum(singular_values) < retain_ratio:
      k -=- 1;
    else: break;
  return k-1


def get_reduced_features_pca(X, retain_ratio=retain_ratio):
  pca = PCA()
  pca.fit(X)
  sv = pca.singular_values_
  k = get_num_components(sv, retain_ratio)
  del(pca)
  pca = PCA(n_components=k)
  X_reduced = pca.fit_transform(X)  # all continuous features reduced after PCA
  print("Retaining {}% variance with {} components from PCA (out of a total of {} features)".format(
                                                                retain_ratio*100, k, X.shape[1]))
  return X_reduced


X_cat_reduced = get_reduced_features_pca(X[cat_cols])   # get reduced features for categorical features
X_cont_reduced = get_reduced_features_pca(X[cont_cols]) # get reduced features for continuous features

Retaining 99.0% variance with 74 components from PCA (out of a total of 116 features)
Retaining 99.0% variance with 12 components from PCA (out of a total of 14 features)


### We now see the CV scored on reduced dimensions features


In [13]:
results_categorical_only = benchmark_xgb(X_cat_reduced, Y, num_folds=2)
print("Mean MAE with reduced categorical features: ", results_categorical_only['test-mae-mean'].mean())

results_continuous_only = benchmark_xgb(X_cont_reduced, Y, num_folds=2)
print("Mean MAE with reduced continuous features: ", results_continuous_only['test-mae-mean'].mean())

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean MAE with reduced categorical features:  1367.4354515439989
Mean MAE with reduced continuous features:  1928.2954844269982


# <br>  
### That did not work out well enough, we use some other tests to discard a few features.


### Using f_regression and mutual_information_regression from sklearn to get scores for features. The features that do not score well on both these tests will be discarded.

In [71]:
    
from sklearn.feature_selection     import    f_regression, mutual_info_regression

# f_regression
##############
f_reg_res = {}
fval, pval = f_regression(X, Y)
for i,c in enumerate(X.columns):
  f_reg_res[c] = fval[i]

# sort the features according to f_regression scores
sorted_res = [[k,v] for k, v in sorted(f_reg_res.items(), key=lambda item: item[1])]
sorted(sorted_res, key = lambda x: x[1])

# remove features that scored too low
high_score_features_F = [x[0] for x in list(filter(lambda x: x[1]>100, sorted_res))]
print("features with f_regression score > 100")
print(high_score_features_F)



# mutual_information
####################
# sampling a subset of data, as mutual_info calculation is intensive
sample = train_data.sample(10000)
x = sample.iloc[:,:-1]
y = sample.iloc[:,-1]

mutinf_res = {}
mi = mutual_info_regression(x, y)
for i,c in enumerate(X.columns):
  mutinf_res[c] = mi[i]

# sort the features according to mutual_information scores
sorted_res = [[k,v] for k, v in sorted(mutinf_res.items(), key=lambda item: item[1])]
sorted(sorted_res, key = lambda x: x[1])

# remove features that scored too low
high_score_features_MI = [x[0] for x in list(filter(lambda x: x[1]>0.001, sorted_res))]
print("features with mutual_information score > 100")
print(high_score_features_MI)


# get intersection of features which score high on both of these tests
# i.e. we are discarding features that did not do well in both the tests
common_features_union = list(set(high_score_features_F).union(set(high_score_features_MI)))
print("# feautres selected: ", common_features_union.__len__())


features with f_regression score > 100
['cat32', 'cat49', 'cat114', 'cat112', 'cat61', 'cont8', 'cat20', 'cat34', 'cat52', 'cat104', 'cat83', 'cat116', 'cat99', 'cat51', 'cat19', 'cat47', 'cont4', 'cat58', 'cat67', 'cont6', 'cat18', 'cat84', 'cat59', 'cat33', 'cat95', 'cat46', 'cat43', 'cat44', 'cat30', 'cat53', 'cat26', 'cat78', 'cat66', 'cat100', 'cat65', 'cat71', 'cat106', 'cat45', 'cat75', 'cat17', 'cat85', 'cat29', 'cat102', 'cat8', 'cat41', 'cat76', 'cat25', 'cat24', 'cat94', 'cat38', 'cont12', 'cont11', 'cat14', 'cat82', 'cat4', 'cat5', 'cat50', 'cont3', 'cat105', 'cat6', 'cont7', 'cat28', 'cat40', 'cont2', 'cat111', 'cat103', 'cat73', 'cat36', 'cat23', 'cat90', 'cat16', 'cat3', 'cat9', 'cat13', 'cat1', 'cat11', 'cat72', 'cat2', 'cat81', 'cat89', 'cat7', 'cat10', 'cat12', 'cat57', 'cat87', 'cat101', 'cat79', 'cat80']
features with mutual_information score > 100
['cat18', 'cat19', 'cat52', 'cat72', 'cat48', 'cat46', 'cat47', 'cat31', 'cat79', 'cat89', 'cat97', 'cat42', 'cat67', '

### MAE scores with chosen features

In [14]:
results = benchmark_xgb(X[common_features_union], Y, num_folds=2)
print("Mean MAE with chosen feautures: ", results['test-mae-mean'].mean())


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean MAE with chosen feautures:  1312.4079062540006


## This is by far the best MAE we got with any subset of features.  
## We will use this subset of 116 out of 130 total feautres to train a MLP and see how it goes.

## Using MLP with union of features that scored high in the 2 tests.


In [72]:
print("Features that would be used: ", common_features_union)
print("# features: ", common_features_union.__len__())

Features that would be used:  ['cat39', 'cat97', 'cat115', 'cat78', 'cat34', 'cat23', 'cat1', 'cat51', 'cat110', 'cat91', 'cat75', 'cont4', 'cat81', 'cat50', 'cat53', 'cat25', 'cat87', 'cat49', 'cat33', 'cat46', 'cat109', 'cat19', 'cat77', 'cat29', 'cat107', 'cat48', 'cat65', 'cat100', 'cont11', 'cat96', 'cat74', 'cat102', 'cat82', 'cat32', 'cat104', 'cont6', 'cat14', 'cat10', 'cat59', 'cat88', 'cat76', 'cat16', 'cat4', 'cat11', 'cat31', 'cat111', 'cat61', 'cat80', 'cat84', 'cat90', 'cat89', 'cat79', 'cat44', 'cat57', 'cont14', 'cat9', 'cont8', 'cat113', 'cat28', 'cont12', 'cat101', 'cat86', 'cat47', 'cat93', 'cat103', 'cat40', 'cat36', 'cat24', 'cat38', 'cat67', 'cat12', 'cat66', 'cat5', 'cat15', 'cat85', 'cont2', 'cat13', 'cat112', 'cat114', 'cat43', 'cat73', 'cat92', 'cat2', 'cat54', 'cat6', 'cat45', 'cont5', 'cat41', 'cat42', 'cat7', 'cat37', 'cat58', 'cat105', 'cat3', 'cat95', 'cat83', 'cat98', 'cat116', 'cat27', 'cat20', 'cat30', 'cat71', 'cat17', 'cat52', 'cont7', 'cont3', 'cont

In [0]:

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LeakyReLU
from keras.preprocessing import text
from keras import utils
from tensorflow.nn import leaky_relu

# set hyperparameters for MLP
class NN:
    def __init__(self):
        self.in_shape = common_features_union.__len__()
        self.num_layers = 3
        self.nodes = [2048,1024, 1]
        self.activations = ['relu', 'relu', 'relu']
        self.dropouts = [0.2,0.15,0]
        self.loss = 'mean_squared_logarithmic_error'
        self.optimizer = keras.optimizers.RMSprop(0.001)



def sequential_MLP(nn):
    model = Sequential()
    for i in range(nn.num_layers):
        if i==0: # add input shape if first layer
            model.add(Dense(nn.nodes[i], activation=nn.activations[i], input_shape=(nn.in_shape,) ))
        else:
            model.add(Dense(nn.nodes[i], activation=nn.activations[i]))
        if(nn.dropouts[i] != 0): # skip adding dropout if dropout == 0
            model.add(Dropout(rate=nn.dropouts[i]))            
    model.compile(optimizer=nn.optimizer, loss=nn.loss, metrics=['mae'])

    return model


In [79]:


nn = NN()
model = sequential_MLP(nn)


for i in range(71):
  if i%10 == 0: verbose=True
  else: verbose = False
  model.fit(X[common_features_union], Y, epochs=1, batch_size=512, validation_split=0.25, verbose=verbose)

Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1


In [0]:
test_predictions = model.predict(test_data[common_features_union])
submission['loss'] = test_predictions
submission.to_csv(objects_dir+'submission_5.csv', index=False)

### This model got me a MAE of 1160 on the public test data.