## DS 5110 Final Project - Hospital Mortality
### By: Elena Tsvetskova, Brian Chae, Ryan Viti (rrv7eb)

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col

spark = SparkSession.builder \
        .master("local") \
        .appName("Team 3 Final Project") \
        .getOrCreate()

filename = "hospital_mortality.csv"

data = spark.read.csv(filename, inferSchema=True, header=True)
data.show(2)

+------------+----------+-----------+---+-----+----------------+---------+------+------+----------------+------+-------------+------------+----------------+------+------------------+-------------------+---------------------+----------+---------------+----------------+-----------------+-----------------+-----------------+----------------+----------+---------------+-----------+-----------------+-------------+-------------+-------------------------+-------------------------+----------------+----------------+----------+----------+----------------------+----------------------+---------------+---------------+-----------+-----------+------------+------------+------------------------+------------------------+-----------+-----------+-------------+-------------+-------------------------+-------------------------+----------------+----------------+----------+----------+----------------------+----------------------+---------------+---------------+-----------+-----------+------------+------------+--

In [2]:
data.printSchema()

root
 |-- encounter_id: integer (nullable = true)
 |-- patient_id: integer (nullable = true)
 |-- hospital_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- bmi: double (nullable = true)
 |-- elective_surgery: integer (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- height: double (nullable = true)
 |-- icu_admit_source: string (nullable = true)
 |-- icu_id: integer (nullable = true)
 |-- icu_stay_type: string (nullable = true)
 |-- icu_type: string (nullable = true)
 |-- pre_icu_los_days: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- apache_2_diagnosis: integer (nullable = true)
 |-- apache_3j_diagnosis: double (nullable = true)
 |-- apache_post_operative: integer (nullable = true)
 |-- arf_apache: integer (nullable = true)
 |-- gcs_eyes_apache: integer (nullable = true)
 |-- gcs_motor_apache: integer (nullable = true)
 |-- gcs_unable_apache: integer (nullable = true)
 |-- gcs_verbal_apache

## Data Cleaning and EDA

In [3]:
# empty columns and unique identifiers unnecessary for analysis

cleaned_data = data.drop("_c83", "encounter_id", "patient_id", "hospital_id", "icu_id", "icu_stay_type")
cleaned_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- bmi: double (nullable = true)
 |-- elective_surgery: integer (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- height: double (nullable = true)
 |-- icu_admit_source: string (nullable = true)
 |-- icu_type: string (nullable = true)
 |-- pre_icu_los_days: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- apache_2_diagnosis: integer (nullable = true)
 |-- apache_3j_diagnosis: double (nullable = true)
 |-- apache_post_operative: integer (nullable = true)
 |-- arf_apache: integer (nullable = true)
 |-- gcs_eyes_apache: integer (nullable = true)
 |-- gcs_motor_apache: integer (nullable = true)
 |-- gcs_unable_apache: integer (nullable = true)
 |-- gcs_verbal_apache: integer (nullable = true)
 |-- heart_rate_apache: integer (nullable = true)
 |-- intubated_apache: integer (nullable = true)
 |-- map_apache: integer (nullable = true)
 |-- resprate_apache: double (nullable = true)

In [16]:
# load pyspark modules
from pyspark.ml import Pipeline  
from pyspark.ml.feature import OneHotEncoder, StringIndexer

#create a list of the columns that are string typed
categoricalColumns = [item[0] for item in cleaned_data.dtypes if item[1].startswith('string') ]

#define a list of stages in your pipeline. The string indexer will be one stage
stages = []

#iterate through all categorical values
for categoricalCol in categoricalColumns:
    #create a string indexer for those categorical values and assign a new name including the word 'Index'
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')

    #append the string Indexer to our list of stages
    stages += [stringIndexer]

#Create the pipeline. Assign the satges list to the pipeline key word stages
pipeline = Pipeline(stages = stages)
#fit the pipeline to our dataframe
pipelineModel = pipeline.fit(cleaned_data)
#transform the dataframe
df= pipelineModel.transform(cleaned_data)
df.take(2)


[Row(age=68, bmi=22.73, elective_surgery=0, ethnicity='Caucasian', gender='M', height=180.3, icu_admit_source='Floor', icu_type='CTICU', pre_icu_los_days=0.541666667, weight=73.9, apache_2_diagnosis=113, apache_3j_diagnosis=502.01, apache_post_operative=0, arf_apache=0, gcs_eyes_apache=3, gcs_motor_apache=6, gcs_unable_apache=0, gcs_verbal_apache=4, heart_rate_apache=118, intubated_apache=0, map_apache=40, resprate_apache=36.0, temp_apache=39.3, ventilated_apache=0, d1_diasbp_max=68, d1_diasbp_min=37, d1_diasbp_noninvasive_max=68, d1_diasbp_noninvasive_min=37, d1_heartrate_max=119, d1_heartrate_min=72, d1_mbp_max=89, d1_mbp_min=46, d1_mbp_noninvasive_max=89, d1_mbp_noninvasive_min=46, d1_resprate_max=34, d1_resprate_min=10, d1_spo2_max=100, d1_spo2_min=74, d1_sysbp_max=131, d1_sysbp_min=73, d1_sysbp_noninvasive_max=131, d1_sysbp_noninvasive_min=73.0, d1_temp_max=39.9, d1_temp_min=37.2, h1_diasbp_max=68, h1_diasbp_min=63, h1_diasbp_noninvasive_max=68, h1_diasbp_noninvasive_min=63, h1_he

In [48]:
df_2 = df.drop("ethnicity", "gender", "icu_admit_source", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem")
df_2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- bmi: double (nullable = true)
 |-- elective_surgery: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- pre_icu_los_days: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- apache_2_diagnosis: integer (nullable = true)
 |-- apache_3j_diagnosis: double (nullable = true)
 |-- apache_post_operative: integer (nullable = true)
 |-- arf_apache: integer (nullable = true)
 |-- gcs_eyes_apache: integer (nullable = true)
 |-- gcs_motor_apache: integer (nullable = true)
 |-- gcs_unable_apache: integer (nullable = true)
 |-- gcs_verbal_apache: integer (nullable = true)
 |-- heart_rate_apache: integer (nullable = true)
 |-- intubated_apache: integer (nullable = true)
 |-- map_apache: integer (nullable = true)
 |-- resprate_apache: double (nullable = true)
 |-- temp_apache: double (nullable = true)
 |-- ventilated_apache: integer (nullable = true)
 |-- d1_diasbp_max: integer (nullable = true)
 |-- d1_diasbp_min: integer

IllegalArgumentException: requirement failed: The number of columns doesn't match.
Old column names (79): age, bmi, elective_surgery, height, pre_icu_los_days, weight, apache_2_diagnosis, apache_3j_diagnosis, apache_post_operative, arf_apache, gcs_eyes_apache, gcs_motor_apache, gcs_unable_apache, gcs_verbal_apache, heart_rate_apache, intubated_apache, map_apache, resprate_apache, temp_apache, ventilated_apache, d1_diasbp_max, d1_diasbp_min, d1_diasbp_noninvasive_max, d1_diasbp_noninvasive_min, d1_heartrate_max, d1_heartrate_min, d1_mbp_max, d1_mbp_min, d1_mbp_noninvasive_max, d1_mbp_noninvasive_min, d1_resprate_max, d1_resprate_min, d1_spo2_max, d1_spo2_min, d1_sysbp_max, d1_sysbp_min, d1_sysbp_noninvasive_max, d1_sysbp_noninvasive_min, d1_temp_max, d1_temp_min, h1_diasbp_max, h1_diasbp_min, h1_diasbp_noninvasive_max, h1_diasbp_noninvasive_min, h1_heartrate_max, h1_heartrate_min, h1_mbp_max, h1_mbp_min, h1_mbp_noninvasive_max, h1_mbp_noninvasive_min, h1_resprate_max, h1_resprate_min, h1_spo2_max, h1_spo2_min, h1_sysbp_max, h1_sysbp_min, h1_sysbp_noninvasive_max, h1_sysbp_noninvasive_min, d1_glucose_max, d1_glucose_min, d1_potassium_max, d1_potassium_min, apache_4a_hospital_death_prob, apache_4a_icu_death_prob, aids, cirrhosis, diabetes_mellitus, hepatic_failure, immunosuppression, leukemia, lymphoma, solid_tumor_with_metastasis, hospital_death, ethnicityIndex, genderIndex, icu_admit_sourceIndex, icu_typeIndex, apache_3j_bodysystemIndex, apache_2_bodysystemIndex
New column names (0): 

In [54]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

#feature_columns = df_2.select([c for c in df_2.columns if c not in {'hospital_death'}])
#feature_columns.take(1)

va = VectorAssembler(inputCols=[c for c in df_2.columns if c not in {'hospital_death'}], outputCol="features")  
output = va.transform(df_2)
scaler = StandardScaler(
    inputCol = 'features', 
    outputCol = 'scaledFeatures',
    withMean = True,
    withStd = True
).fit(output)

# when we transform the dataframe, the old
# feature will still remain in it
df_scaled = scaler.transform(output)
df_scaled.take(5)

Py4JJavaError: An error occurred while calling o2000.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 88.0 failed 1 times, most recent failure: Lost task 0.0 in stage 88.0 (TID 87) (udc-ba34-13c0 executor driver): org.apache.spark.SparkException: Failed to execute user defined function(VectorAssembler$$Lambda$3174/0x000000084130b840: (struct<age_double_VectorAssembler_dec361fd3025:double,bmi:double,elective_surgery_double_VectorAssembler_dec361fd3025:double,height:double,pre_icu_los_days:double,weight:double,apache_2_diagnosis_double_VectorAssembler_dec361fd3025:double,apache_3j_diagnosis:double,apache_post_operative_double_VectorAssembler_dec361fd3025:double,arf_apache_double_VectorAssembler_dec361fd3025:double,gcs_eyes_apache_double_VectorAssembler_dec361fd3025:double,gcs_motor_apache_double_VectorAssembler_dec361fd3025:double,gcs_unable_apache_double_VectorAssembler_dec361fd3025:double,gcs_verbal_apache_double_VectorAssembler_dec361fd3025:double,heart_rate_apache_double_VectorAssembler_dec361fd3025:double,intubated_apache_double_VectorAssembler_dec361fd3025:double,map_apache_double_VectorAssembler_dec361fd3025:double,resprate_apache:double,temp_apache:double,ventilated_apache_double_VectorAssembler_dec361fd3025:double,d1_diasbp_max_double_VectorAssembler_dec361fd3025:double,d1_diasbp_min_double_VectorAssembler_dec361fd3025:double,d1_diasbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,d1_diasbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,d1_heartrate_max_double_VectorAssembler_dec361fd3025:double,d1_heartrate_min_double_VectorAssembler_dec361fd3025:double,d1_mbp_max_double_VectorAssembler_dec361fd3025:double,d1_mbp_min_double_VectorAssembler_dec361fd3025:double,d1_mbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,d1_mbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,d1_resprate_max_double_VectorAssembler_dec361fd3025:double,d1_resprate_min_double_VectorAssembler_dec361fd3025:double,d1_spo2_max_double_VectorAssembler_dec361fd3025:double,d1_spo2_min_double_VectorAssembler_dec361fd3025:double,d1_sysbp_max_double_VectorAssembler_dec361fd3025:double,d1_sysbp_min_double_VectorAssembler_dec361fd3025:double,d1_sysbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,d1_sysbp_noninvasive_min:double,d1_temp_max:double,d1_temp_min:double,h1_diasbp_max_double_VectorAssembler_dec361fd3025:double,h1_diasbp_min_double_VectorAssembler_dec361fd3025:double,h1_diasbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,h1_diasbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,h1_heartrate_max_double_VectorAssembler_dec361fd3025:double,h1_heartrate_min_double_VectorAssembler_dec361fd3025:double,h1_mbp_max_double_VectorAssembler_dec361fd3025:double,h1_mbp_min_double_VectorAssembler_dec361fd3025:double,h1_mbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,h1_mbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,h1_resprate_max_double_VectorAssembler_dec361fd3025:double,h1_resprate_min_double_VectorAssembler_dec361fd3025:double,h1_spo2_max_double_VectorAssembler_dec361fd3025:double,h1_spo2_min_double_VectorAssembler_dec361fd3025:double,h1_sysbp_max_double_VectorAssembler_dec361fd3025:double,h1_sysbp_min_double_VectorAssembler_dec361fd3025:double,h1_sysbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,h1_sysbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,d1_glucose_max_double_VectorAssembler_dec361fd3025:double,d1_glucose_min_double_VectorAssembler_dec361fd3025:double,d1_potassium_max:double,d1_potassium_min:double,apache_4a_hospital_death_prob:double,apache_4a_icu_death_prob:double,aids_double_VectorAssembler_dec361fd3025:double,cirrhosis_double_VectorAssembler_dec361fd3025:double,diabetes_mellitus_double_VectorAssembler_dec361fd3025:double,hepatic_failure_double_VectorAssembler_dec361fd3025:double,immunosuppression_double_VectorAssembler_dec361fd3025:double,leukemia_double_VectorAssembler_dec361fd3025:double,lymphoma_double_VectorAssembler_dec361fd3025:double,solid_tumor_with_metastasis_double_VectorAssembler_dec361fd3025:double,ethnicityIndex:double,genderIndex:double,icu_admit_sourceIndex:double,icu_typeIndex:double,apache_3j_bodysystemIndex:double,apache_2_bodysystemIndex:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:150)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:77)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2(ObjectHashAggregateExec.scala:107)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2$adapted(ObjectHashAggregateExec.scala:85)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:885)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:885)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 25 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2729)
	at org.apache.spark.sql.Dataset.first(Dataset.scala:2736)
	at org.apache.spark.ml.feature.StandardScaler.fit(StandardScaler.scala:113)
	at org.apache.spark.ml.feature.StandardScaler.fit(StandardScaler.scala:84)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(VectorAssembler$$Lambda$3174/0x000000084130b840: (struct<age_double_VectorAssembler_dec361fd3025:double,bmi:double,elective_surgery_double_VectorAssembler_dec361fd3025:double,height:double,pre_icu_los_days:double,weight:double,apache_2_diagnosis_double_VectorAssembler_dec361fd3025:double,apache_3j_diagnosis:double,apache_post_operative_double_VectorAssembler_dec361fd3025:double,arf_apache_double_VectorAssembler_dec361fd3025:double,gcs_eyes_apache_double_VectorAssembler_dec361fd3025:double,gcs_motor_apache_double_VectorAssembler_dec361fd3025:double,gcs_unable_apache_double_VectorAssembler_dec361fd3025:double,gcs_verbal_apache_double_VectorAssembler_dec361fd3025:double,heart_rate_apache_double_VectorAssembler_dec361fd3025:double,intubated_apache_double_VectorAssembler_dec361fd3025:double,map_apache_double_VectorAssembler_dec361fd3025:double,resprate_apache:double,temp_apache:double,ventilated_apache_double_VectorAssembler_dec361fd3025:double,d1_diasbp_max_double_VectorAssembler_dec361fd3025:double,d1_diasbp_min_double_VectorAssembler_dec361fd3025:double,d1_diasbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,d1_diasbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,d1_heartrate_max_double_VectorAssembler_dec361fd3025:double,d1_heartrate_min_double_VectorAssembler_dec361fd3025:double,d1_mbp_max_double_VectorAssembler_dec361fd3025:double,d1_mbp_min_double_VectorAssembler_dec361fd3025:double,d1_mbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,d1_mbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,d1_resprate_max_double_VectorAssembler_dec361fd3025:double,d1_resprate_min_double_VectorAssembler_dec361fd3025:double,d1_spo2_max_double_VectorAssembler_dec361fd3025:double,d1_spo2_min_double_VectorAssembler_dec361fd3025:double,d1_sysbp_max_double_VectorAssembler_dec361fd3025:double,d1_sysbp_min_double_VectorAssembler_dec361fd3025:double,d1_sysbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,d1_sysbp_noninvasive_min:double,d1_temp_max:double,d1_temp_min:double,h1_diasbp_max_double_VectorAssembler_dec361fd3025:double,h1_diasbp_min_double_VectorAssembler_dec361fd3025:double,h1_diasbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,h1_diasbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,h1_heartrate_max_double_VectorAssembler_dec361fd3025:double,h1_heartrate_min_double_VectorAssembler_dec361fd3025:double,h1_mbp_max_double_VectorAssembler_dec361fd3025:double,h1_mbp_min_double_VectorAssembler_dec361fd3025:double,h1_mbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,h1_mbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,h1_resprate_max_double_VectorAssembler_dec361fd3025:double,h1_resprate_min_double_VectorAssembler_dec361fd3025:double,h1_spo2_max_double_VectorAssembler_dec361fd3025:double,h1_spo2_min_double_VectorAssembler_dec361fd3025:double,h1_sysbp_max_double_VectorAssembler_dec361fd3025:double,h1_sysbp_min_double_VectorAssembler_dec361fd3025:double,h1_sysbp_noninvasive_max_double_VectorAssembler_dec361fd3025:double,h1_sysbp_noninvasive_min_double_VectorAssembler_dec361fd3025:double,d1_glucose_max_double_VectorAssembler_dec361fd3025:double,d1_glucose_min_double_VectorAssembler_dec361fd3025:double,d1_potassium_max:double,d1_potassium_min:double,apache_4a_hospital_death_prob:double,apache_4a_icu_death_prob:double,aids_double_VectorAssembler_dec361fd3025:double,cirrhosis_double_VectorAssembler_dec361fd3025:double,diabetes_mellitus_double_VectorAssembler_dec361fd3025:double,hepatic_failure_double_VectorAssembler_dec361fd3025:double,immunosuppression_double_VectorAssembler_dec361fd3025:double,leukemia_double_VectorAssembler_dec361fd3025:double,lymphoma_double_VectorAssembler_dec361fd3025:double,solid_tumor_with_metastasis_double_VectorAssembler_dec361fd3025:double,ethnicityIndex:double,genderIndex:double,icu_admit_sourceIndex:double,icu_typeIndex:double,apache_3j_bodysystemIndex:double,apache_2_bodysystemIndex:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:150)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:77)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2(ObjectHashAggregateExec.scala:107)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2$adapted(ObjectHashAggregateExec.scala:85)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:885)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:885)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 25 more


In [12]:
# Subsetting data by type of data for EDA purposes
#integer_data = data.select([data.dtypes[i][0] for i in range(len(data.dtypes)) if (data.dtypes[i][1] == 'int') and (data.select(data[i]).distinct().count() >= 10)])

#for i in integer_data.columns:
#    integer_data.describe(i).show()
#    integer_data.select(F.percentile_approx(i, [0.25, 0.75], 100000).alias("quantiles")).show()

In [10]:
# hospital_deaths = data.select("hospital_death").filter(col("hospital_death")==1 | col("hospital_death")).groupBy("hospital_death").count()
# hospital_deaths = hospital_deaths.withColumn('percent', )
# hospital_deaths.show()

In [None]:
# data.hospital_deaths.show()

In [None]:
# data.select("hospital_death").show()

### Principal Component Analysis (PCA)

In [39]:
from pyspark.ml.feature import PCA

# PCA using 4 components
pca = PCA(k=4, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(output)

# extract the transformed features
result = model.transform(output)

result.take(2)

Py4JJavaError: An error occurred while calling o1715.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 79.0 failed 1 times, most recent failure: Lost task 0.0 in stage 79.0 (TID 79) (udc-ba34-13c0 executor driver): org.apache.spark.SparkException: Failed to execute user defined function(VectorAssembler$$Lambda$3174/0x000000084130b840: (struct<age_double_VectorAssembler_6631368487ec:double,bmi:double,elective_surgery_double_VectorAssembler_6631368487ec:double,height:double,pre_icu_los_days:double,weight:double,apache_2_diagnosis_double_VectorAssembler_6631368487ec:double,apache_3j_diagnosis:double,apache_post_operative_double_VectorAssembler_6631368487ec:double,arf_apache_double_VectorAssembler_6631368487ec:double,gcs_eyes_apache_double_VectorAssembler_6631368487ec:double,gcs_motor_apache_double_VectorAssembler_6631368487ec:double,gcs_unable_apache_double_VectorAssembler_6631368487ec:double,gcs_verbal_apache_double_VectorAssembler_6631368487ec:double,heart_rate_apache_double_VectorAssembler_6631368487ec:double,intubated_apache_double_VectorAssembler_6631368487ec:double,map_apache_double_VectorAssembler_6631368487ec:double,resprate_apache:double,temp_apache:double,ventilated_apache_double_VectorAssembler_6631368487ec:double,d1_diasbp_max_double_VectorAssembler_6631368487ec:double,d1_diasbp_min_double_VectorAssembler_6631368487ec:double,d1_diasbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,d1_diasbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,d1_heartrate_max_double_VectorAssembler_6631368487ec:double,d1_heartrate_min_double_VectorAssembler_6631368487ec:double,d1_mbp_max_double_VectorAssembler_6631368487ec:double,d1_mbp_min_double_VectorAssembler_6631368487ec:double,d1_mbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,d1_mbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,d1_resprate_max_double_VectorAssembler_6631368487ec:double,d1_resprate_min_double_VectorAssembler_6631368487ec:double,d1_spo2_max_double_VectorAssembler_6631368487ec:double,d1_spo2_min_double_VectorAssembler_6631368487ec:double,d1_sysbp_max_double_VectorAssembler_6631368487ec:double,d1_sysbp_min_double_VectorAssembler_6631368487ec:double,d1_sysbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,d1_sysbp_noninvasive_min:double,d1_temp_max:double,d1_temp_min:double,h1_diasbp_max_double_VectorAssembler_6631368487ec:double,h1_diasbp_min_double_VectorAssembler_6631368487ec:double,h1_diasbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,h1_diasbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,h1_heartrate_max_double_VectorAssembler_6631368487ec:double,h1_heartrate_min_double_VectorAssembler_6631368487ec:double,h1_mbp_max_double_VectorAssembler_6631368487ec:double,h1_mbp_min_double_VectorAssembler_6631368487ec:double,h1_mbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,h1_mbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,h1_resprate_max_double_VectorAssembler_6631368487ec:double,h1_resprate_min_double_VectorAssembler_6631368487ec:double,h1_spo2_max_double_VectorAssembler_6631368487ec:double,h1_spo2_min_double_VectorAssembler_6631368487ec:double,h1_sysbp_max_double_VectorAssembler_6631368487ec:double,h1_sysbp_min_double_VectorAssembler_6631368487ec:double,h1_sysbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,h1_sysbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,d1_glucose_max_double_VectorAssembler_6631368487ec:double,d1_glucose_min_double_VectorAssembler_6631368487ec:double,d1_potassium_max:double,d1_potassium_min:double,apache_4a_hospital_death_prob:double,apache_4a_icu_death_prob:double,aids_double_VectorAssembler_6631368487ec:double,cirrhosis_double_VectorAssembler_6631368487ec:double,diabetes_mellitus_double_VectorAssembler_6631368487ec:double,hepatic_failure_double_VectorAssembler_6631368487ec:double,immunosuppression_double_VectorAssembler_6631368487ec:double,leukemia_double_VectorAssembler_6631368487ec:double,lymphoma_double_VectorAssembler_6631368487ec:double,solid_tumor_with_metastasis_double_VectorAssembler_6631368487ec:double,ethnicityIndex:double,genderIndex:double,icu_admit_sourceIndex:double,icu_typeIndex:double,apache_3j_bodysystemIndex:double,apache_2_bodysystemIndex:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:219)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:219)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1429)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 31 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2291)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.mllib.stat.Statistics$.colStats(Statistics.scala:58)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeCovariance(RowMatrix.scala:436)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computePrincipalComponentsAndExplainedVariance(RowMatrix.scala:479)
	at org.apache.spark.mllib.feature.PCA.fit(PCA.scala:65)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:93)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:64)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(VectorAssembler$$Lambda$3174/0x000000084130b840: (struct<age_double_VectorAssembler_6631368487ec:double,bmi:double,elective_surgery_double_VectorAssembler_6631368487ec:double,height:double,pre_icu_los_days:double,weight:double,apache_2_diagnosis_double_VectorAssembler_6631368487ec:double,apache_3j_diagnosis:double,apache_post_operative_double_VectorAssembler_6631368487ec:double,arf_apache_double_VectorAssembler_6631368487ec:double,gcs_eyes_apache_double_VectorAssembler_6631368487ec:double,gcs_motor_apache_double_VectorAssembler_6631368487ec:double,gcs_unable_apache_double_VectorAssembler_6631368487ec:double,gcs_verbal_apache_double_VectorAssembler_6631368487ec:double,heart_rate_apache_double_VectorAssembler_6631368487ec:double,intubated_apache_double_VectorAssembler_6631368487ec:double,map_apache_double_VectorAssembler_6631368487ec:double,resprate_apache:double,temp_apache:double,ventilated_apache_double_VectorAssembler_6631368487ec:double,d1_diasbp_max_double_VectorAssembler_6631368487ec:double,d1_diasbp_min_double_VectorAssembler_6631368487ec:double,d1_diasbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,d1_diasbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,d1_heartrate_max_double_VectorAssembler_6631368487ec:double,d1_heartrate_min_double_VectorAssembler_6631368487ec:double,d1_mbp_max_double_VectorAssembler_6631368487ec:double,d1_mbp_min_double_VectorAssembler_6631368487ec:double,d1_mbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,d1_mbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,d1_resprate_max_double_VectorAssembler_6631368487ec:double,d1_resprate_min_double_VectorAssembler_6631368487ec:double,d1_spo2_max_double_VectorAssembler_6631368487ec:double,d1_spo2_min_double_VectorAssembler_6631368487ec:double,d1_sysbp_max_double_VectorAssembler_6631368487ec:double,d1_sysbp_min_double_VectorAssembler_6631368487ec:double,d1_sysbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,d1_sysbp_noninvasive_min:double,d1_temp_max:double,d1_temp_min:double,h1_diasbp_max_double_VectorAssembler_6631368487ec:double,h1_diasbp_min_double_VectorAssembler_6631368487ec:double,h1_diasbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,h1_diasbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,h1_heartrate_max_double_VectorAssembler_6631368487ec:double,h1_heartrate_min_double_VectorAssembler_6631368487ec:double,h1_mbp_max_double_VectorAssembler_6631368487ec:double,h1_mbp_min_double_VectorAssembler_6631368487ec:double,h1_mbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,h1_mbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,h1_resprate_max_double_VectorAssembler_6631368487ec:double,h1_resprate_min_double_VectorAssembler_6631368487ec:double,h1_spo2_max_double_VectorAssembler_6631368487ec:double,h1_spo2_min_double_VectorAssembler_6631368487ec:double,h1_sysbp_max_double_VectorAssembler_6631368487ec:double,h1_sysbp_min_double_VectorAssembler_6631368487ec:double,h1_sysbp_noninvasive_max_double_VectorAssembler_6631368487ec:double,h1_sysbp_noninvasive_min_double_VectorAssembler_6631368487ec:double,d1_glucose_max_double_VectorAssembler_6631368487ec:double,d1_glucose_min_double_VectorAssembler_6631368487ec:double,d1_potassium_max:double,d1_potassium_min:double,apache_4a_hospital_death_prob:double,apache_4a_icu_death_prob:double,aids_double_VectorAssembler_6631368487ec:double,cirrhosis_double_VectorAssembler_6631368487ec:double,diabetes_mellitus_double_VectorAssembler_6631368487ec:double,hepatic_failure_double_VectorAssembler_6631368487ec:double,immunosuppression_double_VectorAssembler_6631368487ec:double,leukemia_double_VectorAssembler_6631368487ec:double,lymphoma_double_VectorAssembler_6631368487ec:double,solid_tumor_with_metastasis_double_VectorAssembler_6631368487ec:double,ethnicityIndex:double,genderIndex:double,icu_admit_sourceIndex:double,icu_typeIndex:double,apache_3j_bodysystemIndex:double,apache_2_bodysystemIndex:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:219)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:219)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1429)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 31 more
