In [1]:
%%bash
pip install pyspark

if [[ ! -f ./train.csv ]]; then 
   wget https://raw.githubusercontent.com/aatishsuman/health-insurance-cross-sell-prediction/main/data/train.csv
fi

if [[ ! -f ./test.csv ]]; then 
   wget https://raw.githubusercontent.com/aatishsuman/health-insurance-cross-sell-prediction/main/data/test.csv  
fi

Collecting pyspark
  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
Collecting py4j==0.10.9
  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=08a76dccf03ad677259107ce2ac0a51af5fc89591527e5e9ed99b8a1de54706b
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


--2020-11-29 17:21:28--  https://raw.githubusercontent.com/aatishsuman/health-insurance-cross-sell-prediction/main/data/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21432357 (20M) [text/plain]
Saving to: ‘train.csv’

     0K .......... .......... .......... .......... ..........  0% 3.85M 5s
    50K .......... .......... .......... .......... ..........  0% 10.4M 4s
   100K .......... .......... .......... .......... ..........  0% 4.44M 4s
   150K .......... .......... .......... .......... ..........  0% 22.7M 3s
   200K .......... .......... .......... .......... ..........  1% 5.99M 3s
   250K .......... .......... .......... .......... ..........  1% 21.8M 3s
   300K .......... .......... .......... .......... ..........  1% 28.9M 3s
   350K ...

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.sql import functions as fn
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
    MulticlassClassificationEvaluator, \
    RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import DenseVector
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import subprocess
from pyspark.sql import SparkSession
import glob

spark = SparkSession.builder.master("local").appName("mlpc_example").config("spark.master", "local[*]").getOrCreate()
sc = spark.sparkContext

%matplotlib inline

In [3]:
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [4]:
train = spark.read.csv('train.csv', inferSchema=True, header=True)
test = spark.read.csv('test.csv', inferSchema=True, header=True)

print(train.toPandas().shape, test.toPandas().shape)

(381109, 12) (127037, 11)


In [5]:
train.toPandas().head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [6]:
numerical_columns=['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
categorical_columns=['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Response']

In [7]:
feature_engineering_pipe = Pipeline(stages=[feature.StringIndexerModel.from_labels(['Male', 'Female'], inputCol='Gender', outputCol='Gender_Feature'), 
                                            feature.StringIndexerModel.from_labels(['< 1 Year', '1-2 Year', '> 2 Years'], inputCol='Vehicle_Age', outputCol='Vehicle_Age_Feature'), 
                                            feature.StringIndexerModel.from_labels(['No', 'Yes'], inputCol='Vehicle_Damage', outputCol='Vehicle_Damage_Feature')])
train_xformed = feature_engineering_pipe.fit(train).transform(train).select(numerical_columns + 
                                                                            ['Driving_License', 'Previously_Insured', 'Response', 
                                                                             fn.col('Gender_Feature').alias('Gender'), 
                                                                             fn.col('Vehicle_Age_Feature').alias('Vehicle_Age'), 
                                                                             fn.col('Vehicle_Damage_Feature').alias('Vehicle_Damage')])

In [8]:
train_xformed.toPandas().head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,Driving_License,Previously_Insured,Response,Gender,Vehicle_Age,Vehicle_Damage
0,44,28.0,40454.0,26.0,217,1,0,1,0.0,2.0,1.0
1,76,3.0,33536.0,26.0,183,1,0,0,0.0,1.0,0.0
2,47,28.0,38294.0,26.0,27,1,0,1,0.0,2.0,1.0
3,21,11.0,28619.0,152.0,203,1,1,0,0.0,0.0,0.0
4,29,41.0,27496.0,152.0,39,1,1,0,1.0,0.0,0.0


In [9]:
test_xformed = feature_engineering_pipe.fit(train).transform(test).select(numerical_columns + 
                                                                            ['Driving_License', 'Previously_Insured', 
                                                                             fn.col('Gender_Feature').alias('Gender'), 
                                                                             fn.col('Vehicle_Age_Feature').alias('Vehicle_Age'), 
                                                                             fn.col('Vehicle_Damage_Feature').alias('Vehicle_Damage')])

In [10]:
from pyspark.ml import feature, Pipeline, regression, classification, evaluation, tuning
def get_mse(features):
  train_df, validation_df = train_xformed.randomSplit([0.9, 0.1], 42)
  pipe = Pipeline(stages=[feature.VectorAssembler(inputCols=features, outputCol='features'), regression.LinearRegression(labelCol='Response')])
  evaluator = evaluation.RegressionEvaluator(labelCol='Response', metricName='mse')
  model = pipe.fit(train_df)
  return evaluator.evaluate(model.transform(validation_df)), dict(zip(model.stages[-2].getInputCols(), model.stages[-1].summary.pValues))

def get_stepwise_pred_list():
  predictors = numerical_columns + categorical_columns[:-1]
  while (len(predictors) > 1):
    initial_mse, p_values = get_mse(predictors) 
    predictors = list(sorted(p_values, key=p_values.get, reverse=True))
    predictors.pop(0)
    mse, p_values = get_mse(predictors)
    predictors = list(sorted(p_values, key=p_values.get, reverse=True))
    if (mse >= initial_mse):
      return predictors[::-1]
  return predictors[::-1]

best_predictors = get_stepwise_pred_list()
best_predictors

['Vehicle_Damage',
 'Vehicle_Age',
 'Previously_Insured',
 'Driving_License',
 'Policy_Sales_Channel',
 'Annual_Premium',
 'Age']

In [11]:
# getting baseline
train_xformed.groupBy('Response').count().withColumn('class percentage', fn.col('count') / train_xformed.count()).show()

+--------+------+-------------------+
|Response| count|   class percentage|
+--------+------+-------------------+
|       1| 46710|0.12256336113815208|
|       0|334399|  0.877436638861848|
+--------+------+-------------------+



In [12]:
# using best predictors
nn_train = train_xformed.withColumnRenamed('Response', 'label')
train_df, validation_df = nn_train.randomSplit([0.9, 0.1], 42)
nn_test = train_xformed.withColumnRenamed('Response', 'label')

In [13]:
print(train_df.toPandas().shape, validation_df.toPandas().shape)

(342678, 11) (38431, 11)


In [14]:
va = feature.VectorAssembler(inputCols=best_predictors, outputCol='features')
sc = feature.StandardScaler(withMean=True, inputCol='features')

In [15]:
## No hidden layer ##
model1 = classification.MultilayerPerceptronClassifier(seed=42).\
  setFeaturesCol('features').\
  setLayers([len(best_predictors),2]).\
  setLabelCol('label').\
  setMaxIter(100).\
  setBlockSize(32).\
  setTol(0.00000001).\
  setStepSize(0.02)

In [16]:
nn_pipe1 = Pipeline(stages=[va,sc, model1])

In [17]:
mlp_model1=nn_pipe1.fit(train_df)
evaluator_nn = evaluation.BinaryClassificationEvaluator(labelCol='label')
AUC1 = evaluator_nn.evaluate(mlp_model1.transform(validation_df))
print( 'AUC of Validation_df mlp_model1 WITH NO HIDDEN Layers:', AUC1 )
eval_accuracy = evaluation.MulticlassClassificationEvaluator(metricName="accuracy")
print( 'Accuracy of Validation_df mlp_model1 WITH NO HIDDEN Layers:', eval_accuracy.evaluate(mlp_model1.transform(validation_df)))
print( 'AUC of Test mlp_model1 WITH NO HIDDEN Layers:', evaluator_nn.evaluate(mlp_model1.transform(nn_test)))
print( 'Accuracy of Test mlp_model1 WITH NO HIDDEN Layers:', eval_accuracy.evaluate(mlp_model1.transform(nn_test)))

AUC of Validation_df mlp_model1 WITH NO HIDDEN Layers: 0.5271143819065076
Accuracy of Validation_df mlp_model1 WITH NO HIDDEN Layers: 0.12185475267362286
AUC of Test mlp_model1 WITH NO HIDDEN Layers: 0.5294172899503613
Accuracy of Test mlp_model1 WITH NO HIDDEN Layers: 0.12256336113815208


In [18]:
model2 = classification.MultilayerPerceptronClassifier(seed=42).\
    setFeaturesCol('features').\
    setLayers([len(best_predictors),4,2]).\
    setLabelCol('label').\
    setMaxIter(100).\
    setBlockSize(128).\
    setStepSize(0.02).\
    setTol(0.00000001)

In [19]:
nn_pipe2 = Pipeline(stages=[va,sc,model2])
mlp_model2=nn_pipe2.fit(train_df)
AUC2 = evaluator_nn.evaluate(mlp_model2.transform(validation_df))
print( 'AUC of mlp_model2 WITH One hidden layer with 4 neurons:' ,AUC2 )
print( 'Accuracy of mlp_model2 WITH One Hidden Layer:', eval_accuracy.evaluate(mlp_model2.transform(validation_df)))
print( 'AUC of Test mlp_model2 WITH One Hidden Layers:', evaluator_nn.evaluate(mlp_model2.transform(nn_test)))
print( 'Accuracy of Test mlp_model2 WITH One Hidden Layers:', eval_accuracy.evaluate(mlp_model2.transform(nn_test)))

AUC of mlp_model2 WITH One hidden layer with 4 neurons: 0.49987266350229037
Accuracy of mlp_model2 WITH One Hidden Layer: 0.8781452473263771
AUC of Test mlp_model2 WITH One Hidden Layers: 0.49994782636878426
Accuracy of Test mlp_model2 WITH One Hidden Layers: 0.877436638861848


In [20]:
model3 = classification.MultilayerPerceptronClassifier(seed=42).\
    setFeaturesCol('features').\
    setLayers([len(best_predictors), 5, 5, 2]).\
    setLabelCol('label').\
    setMaxIter(100).\
    setBlockSize(32).\
    setStepSize(0.02).\
    setTol(0.00000001)

nn_pipe3 = Pipeline(stages=[va, sc, model3])
mlp_model3=nn_pipe3.fit(train_df)
AUC3 = evaluator_nn.evaluate(mlp_model3.transform(validation_df))
print( 'AUC of mlp_model WITH Two hidden layers with 5, 5 neurons:' ,AUC3 )
print( 'Accuracy of mlp_model3 WITH Two Hidden Layers:', eval_accuracy.evaluate(mlp_model3.transform(validation_df)))
print( 'AUC of Test mlp_model3 WITH Two Hidden Layers:', evaluator_nn.evaluate(mlp_model1.transform(nn_test)))
print( 'Accuracy of Test mlp_model3 Two Hidden Layers:', eval_accuracy.evaluate(mlp_model1.transform(nn_test)))

AUC of mlp_model WITH Two hidden layers with 5, 5 neurons: 0.5
Accuracy of mlp_model3 WITH Two Hidden Layers: 0.8781452473263771
AUC of Test mlp_model3 WITH Two Hidden Layers: 0.529411491404177
Accuracy of Test mlp_model3 Two Hidden Layers: 0.12256336113815208


In [21]:
mlp_model3.transform(nn_test).show()

+---+-----------+--------------+--------------------+-------+---------------+------------------+-----+------+-----------+--------------+--------------------+-----------------------------------+--------------------+--------------------+----------+
|Age|Region_Code|Annual_Premium|Policy_Sales_Channel|Vintage|Driving_License|Previously_Insured|label|Gender|Vehicle_Age|Vehicle_Damage|            features|StandardScaler_2e3874ee59fc__output|       rawPrediction|         probability|prediction|
+---+-----------+--------------+--------------------+-------+---------------+------------------+-----+------+-----------+--------------+--------------------+-----------------------------------+--------------------+--------------------+----------+
| 44|       28.0|       40454.0|                26.0|    217|              1|                 0|    1|   0.0|        2.0|           1.0|[1.0,2.0,0.0,1.0,...|               [0.99013699997296...|[0.64696547232313...|[0.87736074793534...|       0.0|
| 76|       

In [22]:
enable_grid = True #Change to True for performing Grid search
if enable_grid:
  from pyspark.ml.classification import MultilayerPerceptronClassifier
  #Layers combinations
  layer1 = [7, 14, 5 ,2]
  layer2 = [7, 2]
  layer3 = [7, 5, 5, 2]

  NeuralBaseModel = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features", maxIter=100, layers=layer1, blockSize=32, seed=42, tol=0.00000001, stepSize=0.02)
  pop_response_pipe_grid = Pipeline(stages=[va, sc, NeuralBaseModel])

  grid = ParamGridBuilder().\
  addGrid(NeuralBaseModel.blockSize,[16,32,64,128]).\
  addGrid(NeuralBaseModel.tol,[0.0000001, 0.00000001, 0.000000001]).\
  addGrid(NeuralBaseModel.stepSize,[0.001, 0.01, 0.1, 0.2, 0.3]).\
  addGrid(NeuralBaseModel.layers, [layer1, layer2, layer3]).\
  build()



In [23]:
all_models = []

for j in range(len(grid)):
  print("Fitting model {}".format(j+1))
  model = pop_response_pipe_grid.fit(nn_train,grid[j])
  all_models.append(model)



Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8
Fitting model 9
Fitting model 10
Fitting model 11
Fitting model 12
Fitting model 13
Fitting model 14
Fitting model 15
Fitting model 16
Fitting model 17
Fitting model 18
Fitting model 19
Fitting model 20
Fitting model 21
Fitting model 22
Fitting model 23
Fitting model 24
Fitting model 25
Fitting model 26
Fitting model 27
Fitting model 28
Fitting model 29
Fitting model 30
Fitting model 31
Fitting model 32
Fitting model 33
Fitting model 34
Fitting model 35
Fitting model 36
Fitting model 37
Fitting model 38
Fitting model 39
Fitting model 40
Fitting model 41
Fitting model 42
Fitting model 43
Fitting model 44
Fitting model 45
Fitting model 46
Fitting model 47
Fitting model 48
Fitting model 49
Fitting model 50
Fitting model 51
Fitting model 52
Fitting model 53
Fitting model 54
Fitting model 55
Fitting model 56
Fitting model 57
Fitting model 58
Fitting model 59
Fittin

In [24]:
accuracies = [z.\
transform(validation_df).\
select(fn.avg(fn.expr('float(label = prediction)')).alias('accuracy')).\
first().\
accuracy for z in all_models]
  

In [25]:
import numpy as np

best_model_idx = np.argmax(accuracies)
print("Best model index :", best_model_idx)
print("Best model:", grid[best_model_idx])

best_model = all_models[best_model_idx]

best_model.\
transform(nn_test).\
select(fn.avg(fn.expr('float(label = prediction)')).alias('accuracy')).\
show()

pass

Best model index : 0
Best model: {Param(parent='MultilayerPerceptronClassifier_cec19731d2ce', name='blockSize', doc='block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data.'): 16, Param(parent='MultilayerPerceptronClassifier_cec19731d2ce', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 1e-07, Param(parent='MultilayerPerceptronClassifier_cec19731d2ce', name='stepSize', doc='Step size to be used for each iteration of optimization (>= 0).'): 0.001, Param(parent='MultilayerPerceptronClassifier_cec19731d2ce', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.'): [7, 14, 5, 2]}
+-----------------+
|         accuracy|
+-----------------+
|0.877436638861848|
+-----------------+



In [26]:
modelbest = classification.MultilayerPerceptronClassifier(seed=42).\
    setFeaturesCol('features').\
    setLayers([len(best_predictors), 14, 5, 2]).\
    setLabelCol('label').\
    setMaxIter(100).\
    setBlockSize(16).\
    setStepSize(0.001).\
    setTol(1e-07)

nn_modelbest = Pipeline(stages=[va, sc, modelbest])
mlp_modelbest=nn_modelbest.fit(train_df)
AUC_modelbest = evaluator_nn.evaluate(mlp_modelbest.transform(validation_df))
print( 'AUC of mlp_model WITH Two hidden layers with 14, 5 neurons:' ,AUC_modelbest )
print( 'Accuracy of mlp_model3 WITH Two Hidden Layers:', eval_accuracy.evaluate(mlp_modelbest.transform(validation_df)))
print( 'AUC of Test mlp_model3 WITH Two Hidden Layers:', evaluator_nn.evaluate(mlp_modelbest.transform(nn_test)))
print( 'Accuracy of Test mlp_model3 Two Hidden Layers:', eval_accuracy.evaluate(mlp_modelbest.transform(nn_test)))

AUC of mlp_model WITH Two hidden layers with 14, 5 neurons: 0.49914305311622326
Accuracy of mlp_model3 WITH Two Hidden Layers: 0.8781452473263771
AUC of Test mlp_model3 WITH Two Hidden Layers: 0.49902749800986446
Accuracy of Test mlp_model3 Two Hidden Layers: 0.877436638861848


In [27]:
mlp_modelbest.transform(nn_test).show()

+---+-----------+--------------+--------------------+-------+---------------+------------------+-----+------+-----------+--------------+--------------------+-----------------------------------+--------------------+--------------------+----------+
|Age|Region_Code|Annual_Premium|Policy_Sales_Channel|Vintage|Driving_License|Previously_Insured|label|Gender|Vehicle_Age|Vehicle_Damage|            features|StandardScaler_2e3874ee59fc__output|       rawPrediction|         probability|prediction|
+---+-----------+--------------+--------------------+-------+---------------+------------------+-----+------+-----------+--------------+--------------------+-----------------------------------+--------------------+--------------------+----------+
| 44|       28.0|       40454.0|                26.0|    217|              1|                 0|    1|   0.0|        2.0|           1.0|[1.0,2.0,0.0,1.0,...|               [0.99013699997296...|[0.68412645156291...|[0.87736071017485...|       0.0|
| 76|       