In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Ins').getOrCreate()

# Importing Libraries

In [3]:
import numpy as np

from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline

In [4]:
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import time

# Exploring the dataset

In [5]:
df = spark.read.csv('insurance.csv' , header=True , inferSchema=True)

In [6]:
df.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

# Feature Engineering  

In [7]:

index = [StringIndexer(inputCol="sex" , outputCol="index_sex"),
        StringIndexer(inputCol="smoker" , outputCol="index_smoke"),
        StringIndexer(inputCol="region" , outputCol="index_region") 
        ]

In [8]:
pipe = Pipeline(stages=index)
Dfnew = pipe.fit(df).transform(df)

In [11]:
assemble = VectorAssembler(inputCols=["age" , "bmi" , "index_sex" , "index_smoke"] , outputCol="comp_feat")

In [12]:
compiled = assemble.transform(Dfnew)

In [17]:
Dffinal = compiled.select("comp_feat" , "charges")

In [18]:
Train_df, Test_df = Dffinal.randomSplit([0.7,0.3])

In [19]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="comp_feat" , labelCol="charges")

In [20]:
regressor = lr.fit(Train_df)

In [21]:
predict = regressor.evaluate(Test_df)

# Model Evaluation 

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

In [23]:
model_evaluation = RegressionEvaluator(labelCol="charges" , predictionCol="predict")

In [24]:
trainingSummary = regressor.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RSME: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 1
objectiveHistory: [0.0]
+-------------------+
|          residuals|
+-------------------+
| 1651.3056251910098|
| -9734.225757227308|
|   11570.9221281807|
|  977.9243111982626|
|  909.7623742703913|
| 1198.8826810974888|
| -9656.782977824323|
| 228.41537416125811|
| 18.552871790898052|
| 490.71115061019145|
|-223.25567417732805|
| -7575.620589289607|
| -8688.866199898363|
|-108.89599901941119|
|  8889.373139716856|
| -730.9560652461048|
|  4712.883895165788|
|   -1207.7976120531|
| -708.5031486490134|
| -1831.511710189551|
+-------------------+
only showing top 20 rows

RSME: 6304.551817
r2: 0.734373
