#Heart Attack classification based on Indicators

Building a classification model to predict whether a person will have heartattack or not using PySpark Databricks
 


#Mounting my Storage and uploading dataset into container

In [None]:
# Mount Blob to DBFS
dbutils.fs.mount(
  source = "wasbs://datawarehouse@dsba6190storagegamma.blob.core.windows.net/",
  mount_point = "/mnt/dsba6190-gamma-rg/",
  extra_configs = 
  {"fs.azure.account.key.dsba6190storagegamma.blob.core.windows.net":"Rj44tI7HgH2BzVuCK8BeUYjHsip7j+jjUJOOEJOljbwgSQQ4pX/8mvRqqEMWhwyLGtPOOHWRg9Ev+ASttotkmg=="}
)

#Reading data as a Spark dataframe.

In [None]:
dataset = sqlContext.read.format('csv') \
                    .options(header='true', inferSchema='true', delimiter= ',') \
                    .load('/mnt/dsba6190-gamma-rg/heart_2020_cleaned.csv')


#Shaping Data for Machine Learning

Pre-processing - Encoding, Vectorization and so on and making pipelines to make operationalization of my code easier

In [None]:
dataset.show()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer 
from pyspark.ml.feature import VectorAssembler

label = "HeartDisease"
categoricalColumns = ["Smoking",
                     "AlcoholDrinking",
                     "Stroke",
                     "DiffWalking",
                     "Sex",
                     "AgeCategory",
                     "Race",
                     "Diabetic",
                     "PhysicalActivity",
                     "GenHealth",
                     "Asthma",
                     "KidneyDisease",
                      "SkinCancer"]

numericalColumns = ["BMI",
                    "PhysicalHealth",
                    "MentalHealth",
                    "SleepTime"]

#categoricalColumnsclassVec = ["col1classVec",
#                              "col2classVec"]
categoricalColumnsclassVec = [c + "classVec" for c in categoricalColumns]

In [None]:
stages = []
for categoricalColumn in categoricalColumns:
  print(categoricalColumn)
  ## Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalColumn, outputCol = categoricalColumn+"Index").setHandleInvalid("skip")
  ## Use OneHotEncoder to convert categorical variables into binary SparseVectors
  encoder = OneHotEncoder(inputCol=categoricalColumn+"Index", outputCol=categoricalColumn+"classVec")
  ## Add stages
  stages += [stringIndexer, encoder]

Smoking
AlcoholDrinking
Stroke
DiffWalking
Sex
AgeCategory
Race
Diabetic
PhysicalActivity
GenHealth
Asthma
KidneyDisease
SkinCancer


In [None]:
len(stages)

Out[34]: 26

In [None]:
## Convert label into label indices using the StringIndexer
label_stringIndexer = StringIndexer(inputCol = label, outputCol = "label").setHandleInvalid("skip")
stages += [label_stringIndexer]

In [None]:
stages

Out[36]: [StringIndexer_50f2b357c4a8,
 OneHotEncoder_b10524bf886a,
 StringIndexer_7ddf8bb3d4a5,
 OneHotEncoder_93a3226b4ed4,
 StringIndexer_d28866c56f09,
 OneHotEncoder_276c5ac3445f,
 StringIndexer_16c305e82b92,
 OneHotEncoder_75a36dc5561d,
 StringIndexer_ba9ddecadbac,
 OneHotEncoder_3e487577e02c,
 StringIndexer_c1713174c6d2,
 OneHotEncoder_792cc35c3036,
 StringIndexer_2bf0ea0ddcec,
 OneHotEncoder_3878ae36ea7b,
 StringIndexer_37859f7dd7d7,
 OneHotEncoder_5fcbaae6bbce,
 StringIndexer_64b54f7cce16,
 OneHotEncoder_ccf58d5168d4,
 StringIndexer_a2f0178ecc26,
 OneHotEncoder_918dacb40dee,
 StringIndexer_e744d12f177a,
 OneHotEncoder_6f1ffc0f1f73,
 StringIndexer_188c066f488b,
 OneHotEncoder_ebcf9a9e1f0e,
 StringIndexer_bdd58ff03360,
 OneHotEncoder_3de95f934c43,
 StringIndexer_bd65c99d5506]

In [None]:
assemblerInputs = categoricalColumnsclassVec + numericalColumns
assembler = VectorAssembler(inputCols = assemblerInputs,
                            outputCol = "features")
stages += [assembler]

In [None]:
assembler

Out[38]: VectorAssembler_5da02236f26f

In [None]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol = "features",
                        outputCol = "scaledFeatures",
                        withStd = True,
                        withMean = True)
stages += [scaler]

In [None]:
prepPipeline = Pipeline().setStages(stages)
pipelineModel = prepPipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)

In [None]:
from pyspark.ml import PipelineModel
pipelineModel = PipelineModel.load("/mnt/dsba6190-gamma-rg/pipeline")


In [None]:
display(dataset)

HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,SmokingIndex,SmokingclassVec,AlcoholDrinkingIndex,AlcoholDrinkingclassVec,StrokeIndex,StrokeclassVec,DiffWalkingIndex,DiffWalkingclassVec,SexIndex,SexclassVec,AgeCategoryIndex,AgeCategoryclassVec,RaceIndex,RaceclassVec,DiabeticIndex,DiabeticclassVec,PhysicalActivityIndex,PhysicalActivityclassVec,GenHealthIndex,GenHealthclassVec,AsthmaIndex,AsthmaclassVec,KidneyDiseaseIndex,KidneyDiseaseclassVec,SkinCancerIndex,SkinCancerclassVec,label,features,scaledFeatures
No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes,1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 12, indices -> List(3), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(1, 2, 3, 4, 8, 17, 23, 25, 26, 31, 33, 34, 35, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 16.6, 3.0, 30.0, 5.0))","Map(vectorType -> dense, length -> 37, values -> List(-1.1934716832320853, 0.27031932768108335, 0.19803997771650705, 0.4015776806811404, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, 3.121994972417104, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, -2.3190019983436607, 2.6149009849551113, -0.14718525748761555, 0.5382552206183824, 1.3448837201731314, -0.6409860832689794, -0.5140485302667079, -0.34874481726436696, -2.5415108194357328, 0.19555408545895056, -3.1184142825401184, -1.8447472744465305, -0.04675097545350068, 3.2810637449205613, -1.4603512374543075))"
No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No,0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 12, indices -> List(5), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(0, 1, 3, 4, 10, 17, 22, 25, 26, 30, 31, 32, 33, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 20.34, 7.0))","Map(vectorType -> dense, length -> 37, values -> List(0.8378890651922588, 0.27031932768108335, -5.0494697309462095, 0.4015776806811404, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, 3.4986205769646466, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, 0.4312186335807422, -0.38242246216979797, -0.14718525748761555, 0.5382552206183824, 1.3448837201731314, -0.6409860832689794, -0.5140485302667079, -0.34874481726436696, 0.3934655187573756, 0.19555408545895056, 0.32067479891806167, -1.2563361603921912, -0.4240691151634072, -0.4900378220779533, -0.06760042559061336))"
No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No,1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 12, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(1, 2, 3, 5, 17, 23, 25, 29, 31, 32, 33, 34, 35, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 26.58, 20.0, 30.0, 8.0))","Map(vectorType -> dense, length -> 37, values -> List(-1.1934716832320853, 0.27031932768108335, 0.19803997771650705, 0.4015776806811404, -1.050737572298622, 2.8920790632385596, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, -2.3190019983436607, 2.6149009849551113, -0.14718525748761555, 0.5382552206183824, -0.7435563818617944, -0.6409860832689794, -0.5140485302667079, 2.8674171586002757, -2.5415108194357328, 0.19555408545895056, 0.32067479891806167, -0.2746021091357531, 2.091385149569303, 3.2810637449205613, 0.6287749803412337))"
No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes,0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 12, indices -> List(7), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(0, 1, 2, 3, 4, 12, 17, 22, 27, 30, 31, 33, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 24.21, 6.0))","Map(vectorType -> dense, length -> 37, values -> List(0.8378890651922588, 0.27031932768108335, 0.19803997771650705, 0.4015776806811404, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, 3.7264736500045914, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, 0.4312186335807422, -0.38242246216979797, -0.14718525748761555, -1.8578489064126542, -0.7435563818617944, 1.5600913952715747, -0.5140485302667079, -0.34874481726436696, 0.3934655187573756, 0.19555408545895056, -3.1184142825401184, -0.6474722536033423, -0.4240691151634072, -0.4900378220779533, -0.7639758315224605))"
No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No,0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",9.0,"Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(0, 1, 2, 4, 14, 17, 22, 25, 26, 30, 31, 32, 33, 34, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 23.71, 28.0, 8.0))","Map(vectorType -> dense, length -> 37, values -> List(0.8378890651922588, 0.27031932768108335, 0.19803997771650705, -2.490170447970634, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, 3.7714637934472774, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, 0.4312186335807422, -0.38242246216979797, -0.14718525748761555, 0.5382552206183824, 1.3448837201731314, -0.6409860832689794, -0.5140485302667079, -0.34874481726436696, 0.3934655187573756, 0.19555408545895056, 0.32067479891806167, -0.726136841043762, 3.0975668554623867, -0.4900378220779533, 0.6287749803412337))"
Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No,1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 12, indices -> List(7), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",3.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 37, indices -> List(1, 2, 4, 12, 19, 22, 29, 30, 31, 32, 33, 34, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 28.87, 6.0, 12.0))","Map(vectorType -> dense, length -> 37, values -> List(-1.1934716832320853, 0.27031932768108335, 0.19803997771650705, -2.490170447970634, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, 3.7264736500045914, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, -1.813218980790539, -0.30639949428912294, 3.59736906669698, -0.18809788366517577, -0.16087740626092414, 0.4312186335807422, -0.38242246216979797, -0.14718525748761555, -1.8578489064126542, -0.7435563818617944, -0.6409860832689794, -0.5140485302667079, 2.8674171586002757, 0.3934655187573756, 0.19555408545895056, 0.32067479891806167, 0.08568170134136971, 0.3305671642564058, -0.4900378220779533, 3.4142766040686223))"
No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes,0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 12, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(0, 1, 2, 3, 4, 7, 17, 22, 25, 29, 31, 33, 34, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 21.63, 15.0, 4.0))","Map(vectorType -> dense, length -> 37, values -> List(0.8378890651922588, 0.27031932768108335, 0.19803997771650705, 0.4015776806811404, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, 3.048664245582078, -0.320307009406408, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, 0.4312186335807422, -0.38242246216979797, -0.14718525748761555, 0.5382552206183824, -0.7435563818617944, -0.6409860832689794, -0.5140485302667079, 2.8674171586002757, -2.5415108194357328, 0.19555408545895056, -3.1184142825401184, -1.0533815247959084, 1.4625215833861254, -0.4900378220779533, -2.1567266433861545))"
No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No,1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 12, indices -> List(5), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(1, 2, 4, 10, 17, 23, 27, 31, 32, 33, 34, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 31.64, 5.0, 9.0))","Map(vectorType -> dense, length -> 37, values -> List(-1.1934716832320853, 0.27031932768108335, 0.19803997771650705, -2.490170447970634, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, 3.4986205769646466, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, -2.3190019983436607, 2.6149009849551113, -0.14718525748761555, -1.8578489064126542, -0.7435563818617944, 1.5600913952715747, -0.5140485302667079, -0.34874481726436696, -2.5415108194357328, 0.19555408545895056, 0.32067479891806167, 0.521483515761295, 0.2047944510197703, -0.4900378220779533, 1.325150386273081))"
No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No,0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 12, indices -> List(5), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",3.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(0, 1, 2, 3, 4, 10, 17, 24, 29, 30, 32, 33, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 26.45, 5.0))","Map(vectorType -> dense, length -> 37, values -> List(0.8378890651922588, 0.27031932768108335, 0.19803997771650705, 0.4015776806811404, 0.9517094461647004, -0.34577093195957226, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, 3.4986205769646466, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, -2.3190019983436607, -0.38242246216979797, 6.7941374704657855, -1.8578489064126542, -0.7435563818617944, -0.6409860832689794, -0.5140485302667079, 2.8674171586002757, 0.3934655187573756, -5.1136588154108225, 0.32067479891806167, -0.29505490187026207, -0.4240691151634072, -0.4900378220779533, -1.4603512374543075))"
No,40.69,No,No,No,0.0,0.0,Yes,Male,65-69,White,No,Yes,Good,10.0,No,No,No,0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 12, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 37, indices -> List(0, 1, 2, 5, 17, 22, 25, 27, 30, 31, 32, 33, 36), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 40.69, 10.0))","Map(vectorType -> dense, length -> 37, values -> List(0.8378890651922588, 0.27031932768108335, 0.19803997771650705, -2.490170447970634, -1.050737572298622, 2.8920790632385596, -0.34312967856033766, -0.3280114805839616, -0.320307009406408, -0.293618725726171, -0.2858260422924588, -0.2704125568427208, -0.26834937448049073, -0.2655397840254846, -0.26514820975723175, -0.2620545971233728, -0.24958653540225773, 0.5515036427430173, -0.30639949428912294, -0.2779800611103095, -0.18809788366517577, -0.16087740626092414, 0.4312186335807422, -0.38242246216979797, -0.14718525748761555, 0.5382552206183824, -0.7435563818617944, 1.5600913952715747, -0.5140485302667079, -0.34874481726436696, 0.3934655187573756, 0.19555408545895056, 0.32067479891806167, 1.9453125484328917, -0.4240691151634072, -0.4900378220779533, 2.021525792204928))"


#Logistic Regression and Random Forest Models from Spark MLlib with 5-fold cross validation

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics


In [None]:
train, test = dataset.randomSplit([0.70, 0.30], seed = 1337)

In [None]:
lr = LogisticRegression(labelCol="label", featuresCol="scaledFeatures")

In [None]:
lrparamGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.5, 1.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [50,100,150])
             .build())

In [None]:
lrevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName = "areaUnderROC")

In [None]:
# Create 5-fold CrossValidator
lrcv = CrossValidator(estimator = lr,
                    estimatorParamMaps = lrparamGrid,
                    evaluator = lrevaluator,
                    numFolds = 5)

In [None]:
lrcvModel = lrcv.fit(train)
print(lrcvModel)

CrossValidatorModel_50585365d4b5


In [None]:
lrpredictions = lrcvModel.transform(test)

In [None]:
lrcvModel.bestModel.extractParamMap()


Out[92]: {Param(parent='LogisticRegression_06ec265816fa', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_06ec265816fa', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_06ec265816fa', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',
 Param(parent='LogisticRegression_06ec265816fa', name='featuresCol', doc='features column name.'): 'scaledFeatures',
 Param(parent='LogisticRegression_06ec265816fa', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_06ec265816fa', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LogisticRegression_06ec265816fa', name='maxBlockSizeInMB', doc='maximum mem

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
rf = RandomForestClassifier(labelCol="label", featuresCol="scaledFeatures")

In [None]:
rfparamGrid = (ParamGridBuilder()
               .addGrid(rf.maxDepth, [5,10])
               .addGrid(rf.maxBins, [5,10])
               .addGrid(rf.numTrees, [10,20])
             .build())

In [None]:
rfevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

In [None]:
# Create 5-fold CrossValidator
rfcv = CrossValidator(estimator = rf,
                      estimatorParamMaps = rfparamGrid,
                      evaluator = rfevaluator,
                      numFolds = 5)

In [None]:
rfcvModel = rfcv.fit(train)
print(rfcvModel)

CrossValidatorModel_f2d87dba8541


In [None]:
rfpredictions = rfcvModel.transform(test)

#Model evaluation
`

In [None]:
#Logistic Regression
print('Accuracy:', lrevaluator.evaluate(lrpredictions))
print('AUC:', BinaryClassificationMetrics(lrpredictions['label','prediction'].rdd).areaUnderROC)
print('PR:', BinaryClassificationMetrics(lrpredictions['label','prediction'].rdd).areaUnderPR)

Accuracy: 0.8314196167801845
AUC: 0.781310833908434
PR: 0.02037859655186337


In [None]:
#Random Forest
print('Accuracy:', rfevaluator.evaluate(rfpredictions))
print('AUC:', BinaryClassificationMetrics(rfpredictions['label','prediction'].rdd).areaUnderROC)
print('PR:', BinaryClassificationMetrics(rfpredictions['label','prediction'].rdd).areaUnderPR)

Accuracy: 0.8173242333374205
AUC: 0.7751955815462314
PR: 0.027315534025121328


In [None]:
dtparamGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [5,10])
             .addGrid(dt.maxBins, [30,40])
             .build())

In [None]:
dtcvModel = dtcv.fit(train)
print(dtcvModel)

CrossValidatorModel_d146ea2f4d2b


#Saving Transformation Pipeline and Models to my mounted storage

In [None]:
pipelineModel.save("/mnt/dsba6190-gamma-rg/Anulitha-V/pipeline")
display(dbutils.fs.ls("/mnt/dsba6190-gamma-rg/Anulitha-V/pipeline"))

path,name,size,modificationTime
dbfs:/mnt/dsba6190-gamma-rg/Anulitha-V/pipeline/metadata/,metadata/,0,0
dbfs:/mnt/dsba6190-gamma-rg/Anulitha-V/pipeline/stages/,stages/,0,0


In [None]:
lrcvModel.save("/mnt/dsba6190-gamma-rg/Anulitha-V/trainedmodels/lrcv")
rfcvModel.save("mnt/dsba6190-gamma-rg/Anulitha-V/trainedmodels/rfcv")
display(dbutils.fs.ls("mnt/dsba6190-gamma-rg/Anulitha-V/trainedmodels"))

path,name,size,modificationTime
dbfs:/mnt/dsba6190-gamma-rg/Anulitha-V/trainedmodels/lrcv/,lrcv/,0,0
dbfs:/mnt/dsba6190-gamma-rg/Anulitha-V/trainedmodels/rfcv/,rfcv/,0,0
