In [None]:
%%bash
# Environment setup: install PySpark and fetch the train/test CSVs if they are
# not already present in the working directory.
# Abort the cell on the first failing command instead of continuing silently.
set -euo pipefail

# Pin the version so a fresh run installs the same PySpark every time;
# -q keeps the build log out of the saved notebook output.
pip install -q pyspark==3.0.1

DATA_URL=https://raw.githubusercontent.com/aatishsuman/health-insurance-cross-sell-prediction/main/data

for name in train.csv test.csv; do
   if [[ ! -f "./${name}" ]]; then
      # Download to a temporary name and rename on success, so an interrupted
      # download never leaves a partial file that the -f guard would accept.
      wget -q -O "./${name}.part" "${DATA_URL}/${name}"
      mv "./${name}.part" "./${name}"
   fi
done

Collecting pyspark
  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
Collecting py4j==0.10.9
  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=4feed73bdd008aead55b907de7d2b29b0b137c41363040d79a3dde99c19106fa
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


--2020-11-17 16:01:40--  https://raw.githubusercontent.com/aatishsuman/health-insurance-cross-sell-prediction/main/data/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21432357 (20M) [text/plain]
Saving to: ‘train.csv’

     0K .......... .......... .......... .......... ..........  0% 3.52M 6s
    50K .......... .......... .......... .......... ..........  0% 9.90M 4s
   100K .......... .......... .......... .......... ..........  0% 3.98M 4s
   150K .......... .......... .......... .......... ..........  0% 16.4M 4s
   200K .......... .......... .......... .......... ..........  1% 5.71M 4s
   250K .......... .......... .......... .......... ..........  1% 27.2M 3s
   300K .......... .......... .......... .......... ..........  1% 30.8M 3s
   350K ...

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.ml import feature, Pipeline, regression, classification, evaluation
import numpy as np
import pandas as pd

# Create (or reuse, via getOrCreate) a Spark session against the local[*] master
# and keep a handle to its SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Load both CSVs with a header row and let Spark infer the column types.
train = spark.read.csv('train.csv', inferSchema=True, header=True)
test = spark.read.csv('test.csv', inferSchema=True, header=True)

# Print (rows, columns) for each dataset. count() and len(columns) are used
# instead of toPandas().shape so the full datasets are not collected to the
# driver just to read their dimensions.
print((train.count(), len(train.columns)), (test.count(), len(test.columns)))

(381109, 12) (127037, 11)


In [None]:
# Preview five rows; limit in Spark first so only those rows are collected
# to the driver and converted to pandas.
train.limit(5).toPandas()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [None]:
# Columns treated as numeric predictors.
numerical_columns = [
    'Age',
    'Region_Code',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
]

# Columns treated as categorical. 'Response' is the label and must stay last:
# later cells build the predictor list with categorical_columns[:-1].
categorical_columns = [
    'Gender',
    'Driving_License',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Response',
]

In [None]:
# Encode the three string columns as numeric indices. Each indexer is built from
# an explicit label list, so a label's index is its position in that list
# (e.g. 'Male' -> 0.0, '> 2 Years' -> 2.0, 'Yes' -> 1.0) and the result is
# written to '<column>_Feature'.
feature_engineering_pipe = Pipeline(stages=[
    feature.StringIndexerModel.from_labels(labels, inputCol=column, outputCol=column + '_Feature')
    for column, labels in [
        ('Gender', ['Male', 'Female']),
        ('Vehicle_Age', ['< 1 Year', '1-2 Year', '> 2 Years']),
        ('Vehicle_Damage', ['No', 'Yes']),
    ]
])

# Keep the numeric columns, the already-numeric flags and the label, then expose
# each indexed column under its original name.
train_xformed = (
    feature_engineering_pipe
    .fit(train)
    .transform(train)
    .select(
        numerical_columns
        + ['Driving_License', 'Previously_Insured', 'Response']
        + [fn.col(column + '_Feature').alias(column)
           for column in ('Gender', 'Vehicle_Age', 'Vehicle_Damage')]
    )
)

In [None]:
# Preview five transformed rows; limit in Spark first so only those rows are
# collected to the driver and converted to pandas.
train_xformed.limit(5).toPandas()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,Driving_License,Previously_Insured,Response,Gender,Vehicle_Age,Vehicle_Damage
0,44,28.0,40454.0,26.0,217,1,0,1,0.0,2.0,1.0
1,76,3.0,33536.0,26.0,183,1,0,0,0.0,1.0,0.0
2,47,28.0,38294.0,26.0,27,1,0,1,0.0,2.0,1.0
3,21,11.0,28619.0,152.0,203,1,1,0,0.0,0.0,0.0
4,29,41.0,27496.0,152.0,39,1,1,0,1.0,0.0,0.0


You can pick the features either by simply looking at the p-values (Part 1) or by using step-wise selection (Part 2), which sorts the features by p-value and then repeatedly removes the feature with the highest p-value for as long as doing so improves the validation MSE. Part 3 additionally ranks the features by random-forest feature importance. Also refer to the EDA notebook - https://github.com/aatishsuman/health-insurance-cross-sell-prediction/blob/main/Exploratory_Analysis.ipynb.

In [None]:
# Part 1
# Fit a linear regression of Response on every candidate predictor and rank the
# predictors by the p-value reported in the training summary.
pipe = Pipeline(stages=[
    feature.VectorAssembler(inputCols=numerical_columns + categorical_columns[:-1], outputCol='features'),
    regression.LinearRegression(labelCol='Response'),
])
model = pipe.fit(train_xformed)

# zip stops at the shorter sequence, so each assembler input column is paired
# with the p-value at the same position and any surplus trailing p-value is
# ignored (NOTE(review): Spark documents the last entry as the intercept's
# when fitIntercept is on - confirm).
p_values = dict(zip(model.stages[-2].getInputCols(), model.stages[-1].summary.pValues))

# Most significant (smallest p-value) first; the sort is stable, so ties keep
# the assembler's column order.
features_df = pd.DataFrame(
    sorted(p_values.items(), key=lambda item: item[1]),
    columns=['feature', 'p_values'],
)
features_df

Unnamed: 0,feature,p_values
0,Age,0.0
1,Annual_Premium,0.0
2,Policy_Sales_Channel,0.0
3,Driving_License,0.0
4,Previously_Insured,0.0
5,Vehicle_Age,0.0
6,Vehicle_Damage,0.0
7,Gender,1.432632e-12
8,Region_Code,0.2226673
9,Vintage,0.9445091


In [None]:
# Part 2
def get_mse(features):
  """Fit a linear regression on `features` and score it on a held-out split.

  Args:
    features: names of the `train_xformed` columns to assemble as predictors.

  Returns:
    A `(mse, p_values)` tuple: the MSE of the fitted model on the validation
    split, and a dict pairing each assembler input column with the p-value at
    the same position in the training summary.
  """
  # 90/10 split with a fixed seed (42), requested identically on every call.
  fit_split, holdout_split = train_xformed.randomSplit([0.9, 0.1], 42)
  assembler = feature.VectorAssembler(inputCols=features, outputCol='features')
  regressor = regression.LinearRegression(labelCol='Response')
  scorer = evaluation.RegressionEvaluator(labelCol='Response', metricName='mse')
  fitted = Pipeline(stages=[assembler, regressor]).fit(fit_split)
  validation_mse = scorer.evaluate(fitted.transform(holdout_split))
  p_value_by_feature = dict(zip(fitted.stages[-2].getInputCols(), fitted.stages[-1].summary.pValues))
  return validation_mse, p_value_by_feature

def get_stepwise_pred_list():
  """Backward step-wise feature selection guided by p-values and validation MSE.

  Starts from every predictor (`numerical_columns` plus `categorical_columns`
  without its last entry, the label) and repeatedly tries dropping the predictor
  with the highest p-value. A drop is kept only if it strictly lowers the
  validation MSE returned by `get_mse`; the first drop that does not improve the
  MSE is rejected and the search stops with that predictor still included.

  Returns:
    The selected predictor names ordered by ascending p-value.
  """
  predictors = numerical_columns + categorical_columns[:-1]
  best_mse, p_values = get_mse(predictors)
  while len(predictors) > 1:
    # Candidate set = current predictors minus the one with the highest p-value
    # (the first element after a descending sort).
    candidates = sorted(p_values, key=p_values.get, reverse=True)[1:]
    candidate_mse, candidate_p_values = get_mse(candidates)
    if candidate_mse >= best_mse:
      # Removing the predictor did not help: keep the current set unchanged.
      break
    # The removal helped; reuse this fit as the baseline for the next round
    # instead of refitting the same model.
    predictors, best_mse, p_values = candidates, candidate_mse, candidate_p_values
  # Descending sort then reverse, so ties are ordered exactly as before.
  return sorted(p_values, key=p_values.get, reverse=True)[::-1]

# Run the step-wise selection and display the predictors it returns.
best_predictors = get_stepwise_pred_list()
best_predictors

['Vehicle_Damage',
 'Vehicle_Age',
 'Previously_Insured',
 'Driving_License',
 'Policy_Sales_Channel',
 'Annual_Premium',
 'Age']

In [None]:
# Part 3
# Fit a random forest classifier on every candidate predictor and rank the
# predictors by the forest's feature importances.
pipe = Pipeline(stages=[
    feature.VectorAssembler(inputCols=numerical_columns + categorical_columns[:-1], outputCol='features'),
    # Explicit seed (same value used for randomSplit in Part 2) so the forest,
    # and therefore the importances below, are reproducible across runs.
    classification.RandomForestClassifier(labelCol='Response', seed=42),
])
model = pipe.fit(train_xformed)

# Pair each assembler input column with the importance at the same position.
feature_importances = dict(zip(model.stages[-2].getInputCols(), model.stages[-1].featureImportances))

# Most important first; the sort is stable, so ties keep the assembler's order.
features_df = pd.DataFrame(
    sorted(feature_importances.items(), key=lambda item: item[1], reverse=True),
    columns=['feature', 'feature_importances'],
)
features_df

Unnamed: 0,feature,feature_importances
0,Vehicle_Damage,0.41999
1,Age,0.280382
2,Policy_Sales_Channel,0.140696
3,Vehicle_Age,0.079464
4,Previously_Insured,0.075979
5,Region_Code,0.003271
6,Vintage,0.000178
7,Annual_Premium,4e-05
8,Gender,0.0
9,Driving_License,0.0
