In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, lag, when, row_number
from pyspark.sql.window import Window

In [2]:
# Initialize Spark session
spark = SparkSession.builder.appName("ReadmissionRiskPrediction").getOrCreate()

In [3]:
# Load the ADMISSIONS.csv file into a DataFrame
df = spark.read.format("csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load("hdfs://namenode:9000/mimic-iii/ADMISSIONS.csv")

# Part I. Readmission Risk Prediction

## Step 1: Prepare the Data

In [4]:
# Convert ADMITTIME and DISCHTIME to timestamps
df = df.withColumn("ADMITTIME", col("ADMITTIME").cast("timestamp")) \
       .withColumn("DISCHTIME", col("DISCHTIME").cast("timestamp"))

# Calculate length of stay in days
df = df.withColumn("LENGTH_OF_STAY", datediff(col("DISCHTIME"), col("ADMITTIME")))

# Calculate number of previous admissions
window_spec = Window.partitionBy("SUBJECT_ID").orderBy("ADMITTIME")
df = df.withColumn("NUM_PREV_ADMISSIONS", row_number().over(window_spec) - 1)

# Create the READMISSION label (1 if readmitted within 30 days, 0 otherwise)
df = df.withColumn("READMISSION", when(datediff(lag("ADMITTIME").over(window_spec), col("DISCHTIME")) <= 30, 1).otherwise(0))

# Drop rows with null values (e.g., first admission for each patient)
df = df.na.drop()

# Show the prepared data
df.select("SUBJECT_ID", "HADM_ID", "LENGTH_OF_STAY", "NUM_PREV_ADMISSIONS", "READMISSION").show()

+----------+-------+--------------+-------------------+-----------+
|SUBJECT_ID|HADM_ID|LENGTH_OF_STAY|NUM_PREV_ADMISSIONS|READMISSION|
+----------+-------+--------------+-------------------+-----------+
|      2322| 181695|            15|                  2|          1|
|      3242| 175206|             7|                  2|          1|
|      3490| 135086|             0|                  1|          1|
|      3792| 132278|             0|                  3|          1|
|      4155| 124456|             2|                  3|          1|
|      5768| 164070|             1|                  1|          1|
|      5897| 137321|            10|                  1|          1|
|      7184| 134761|             1|                  1|          1|
|      7666| 159952|            46|                  8|          1|
|      8389| 122962|             7|                  1|          1|
|      8426| 142053|             9|                  2|          1|
|      8452| 175505|             1|             

In [5]:
from pyspark.sql.functions import col

# Count the number of readmissions and non-readmissions
readmission_counts = df.groupBy("READMISSION").count()

# Show the result
readmission_counts.show()

+-----------+-----+
|READMISSION|count|
+-----------+-----+
|          1|  693|
|          0| 1360|
+-----------+-----+



## Step 2: Feature Engineering

In [6]:
from pyspark.ml.feature import VectorAssembler

# Define the feature columns
feature_columns = ["LENGTH_OF_STAY", "NUM_PREV_ADMISSIONS"]

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# Show the DataFrame with features
df.select("features", "READMISSION").show(truncate=False)

+----------+-----------+
|features  |READMISSION|
+----------+-----------+
|[15.0,2.0]|1          |
|[7.0,2.0] |1          |
|[0.0,1.0] |1          |
|[0.0,3.0] |1          |
|[2.0,3.0] |1          |
|[1.0,1.0] |1          |
|[10.0,1.0]|1          |
|[1.0,1.0] |1          |
|[46.0,8.0]|1          |
|[7.0,1.0] |1          |
|[9.0,2.0] |1          |
|[1.0,4.0] |1          |
|[9.0,3.0] |1          |
|[19.0,6.0]|1          |
|[12.0,2.0]|1          |
|[4.0,0.0] |0          |
|[16.0,4.0]|1          |
|[2.0,1.0] |1          |
|[5.0,2.0] |1          |
|[36.0,2.0]|1          |
+----------+-----------+
only showing top 20 rows



## Step 3: Train and Evaluate the Model

In [7]:
# Split the data into training and test sets (70% training, 30% test)
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

In [8]:
from pyspark.ml.classification import LogisticRegression

# Initialize the Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol="READMISSION")

# Train the model
lr_model = lr.fit(train_data)

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions on the test set
predictions = lr_model.transform(test_data)

# Initialize the evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="READMISSION", predictionCol="prediction", metricName="accuracy")

# Calculate accuracy
accuracy = evaluator.evaluate(predictions)

# Print the accuracy
print("Accuracy:", accuracy)

Accuracy: 1.0


In [10]:
# Calculate precision
precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
print("Precision:", precision)

# Calculate recall
recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
print("Recall:", recall)

# Calculate F1-score
f1_score = evaluator.setMetricName("f1").evaluate(predictions)
print("F1-Score:", f1_score)

Precision: 1.0
Recall: 1.0
F1-Score: 1.0


In [11]:
pred_stats = predictions.groupBy("READMISSION").count()
pred_stats.show()

+-----------+-----+
|READMISSION|count|
+-----------+-----+
|          1|  214|
|          0|  397|
+-----------+-----+



In [12]:
pred_stats2 = predictions.groupBy("prediction").count()
pred_stats2.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  397|
|       1.0|  214|
+----------+-----+



In [13]:
# Save the trained Linear Regression model to a file
model_save_path = "hdfs://namenode:9000/models/readmission_risk_prediction_lr_model"
lr_model.save(model_save_path)

# Part II. Length-of-Stay (LOS) regression analysis

In [14]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

## Step 1: Prepare the Data

In [15]:
# Load the ADMISSIONS.csv file into a DataFrame
df = spark.read.format("csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load("hdfs://namenode:9000/mimic-iii/ADMISSIONS.csv")

# Load PATIENTS.csv and rename ROW_ID to PATIENT_ROW_ID
patients_df = spark.read.format("csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load("hdfs://namenode:9000/mimic-iii/PATIENTS.csv") \
    .withColumnRenamed("ROW_ID", "PATIENT_ROW_ID")

In [16]:
# Convert ADMITTIME and DISCHTIME to timestamps
df = df.withColumn("ADMITTIME", col("ADMITTIME").cast("timestamp")) \
       .withColumn("DISCHTIME", col("DISCHTIME").cast("timestamp"))

# Calculate Length of Stay (LOS) in days
df = df.withColumn("LENGTH_OF_STAY", datediff(col("DISCHTIME"), col("ADMITTIME")))

# Drop rows with null values
df = df.na.drop()

# Show the prepared data
df.select("SUBJECT_ID", "ADMITTIME", "DISCHTIME", "LENGTH_OF_STAY").show()

+----------+-------------------+-------------------+--------------+
|SUBJECT_ID|          ADMITTIME|          DISCHTIME|LENGTH_OF_STAY|
+----------+-------------------+-------------------+--------------+
|       109|2142-08-28 19:48:00|2142-08-30 15:20:00|             2|
|       111|2144-07-01 04:12:00|2144-07-01 14:55:00|             0|
|       304|2141-05-18 17:21:00|2141-05-19 01:45:00|             1|
|       353|2153-06-27 20:15:00|2153-07-07 10:30:00|            10|
|       502|2143-10-23 21:05:00|2143-11-04 17:28:00|            12|
|       505|2154-08-23 14:01:00|2154-08-29 11:10:00|             6|
|       188|2161-11-01 17:48:00|2162-01-17 05:50:00|            77|
|       250|2188-11-12 09:22:00|2188-11-22 12:00:00|            10|
|       275|2170-10-06 03:09:00|2170-10-19 15:35:00|            13|
|       907|2163-10-01 22:51:00|2163-10-02 15:53:00|             1|
|       546|2127-04-01 16:33:00|2127-04-07 17:37:00|             6|
|       433|2164-08-13 17:22:00|2164-08-17 12:00

In [17]:
# Join ADMISSIONS with PATIENTS on SUBJECT_ID
df = df.join(patients_df, on="SUBJECT_ID", how="left")

# Convert ADMITTIME and DOB to timestamps
df = df.withColumn("ADMITTIME", col("ADMITTIME").cast("timestamp")) \
       .withColumn("DOB", col("DOB").cast("timestamp"))

# Calculate Length of Stay (LOS) in days
df = df.withColumn("LENGTH_OF_STAY", datediff(col("DISCHTIME"), col("ADMITTIME")))

# Calculate age at admission
df = df.withColumn("AGE", datediff(col("ADMITTIME"), col("DOB")) / 365)

# Drop rows with null values
df = df.na.drop()

## Step 2: Feature Engineering

In [18]:
categorical_columns = ["INSURANCE", "LANGUAGE", "RELIGION", "MARITAL_STATUS", "ETHNICITY", "GENDER"]

# Check and drop columns with only one distinct value
for col_name in categorical_columns:
    distinct_count = df.select(col_name).distinct().count()
    if distinct_count < 2:
        print(f"Dropping column '{col_name}' because it has only {distinct_count} distinct value(s).")
        categorical_columns.remove(col_name)

# Step 1: StringIndexer
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index").fit(df) for col in categorical_columns]
for indexer in indexers:
    df = indexer.transform(df)

# Step 2: OneHotEncoder
encoders = [OneHotEncoder(inputCol=col + "_index", outputCol=col + "_encoded") for col in categorical_columns]
for encoder in encoders:
    df = encoder.fit(df).transform(df)

# Step 3: Assemble features
feature_columns = ["AGE", "HOSPITAL_EXPIRE_FLAG"] + [col + "_encoded" for col in categorical_columns]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

## Step 3: Train and Evaluate the Model

In [19]:
# Split the data
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Train the model
lr = LinearRegression(featuresCol="features", labelCol="LENGTH_OF_STAY")
lr_model = lr.fit(train_data)

In [21]:
# Evaluate the model
predictions = lr_model.transform(test_data)

In [22]:
evaluator = RegressionEvaluator(labelCol="LENGTH_OF_STAY", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 10.497199129596577


In [23]:
evaluator.setMetricName("mse")
mse = evaluator.evaluate(predictions)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 110.19118956640315


In [24]:
evaluator.setMetricName("r2")
r2 = evaluator.evaluate(predictions)
print("R-squared (R²):", r2)

R-squared (R²): -0.13062202346993468


In [27]:
# Save the trained Linear Regression model to a file
model_save_path = "hdfs://namenode:9000/models/length_of_stay_lr_model"
lr_model.save(model_save_path)

Py4JJavaError: An error occurred while calling o1219.save.
: java.io.IOException: Path hdfs://namenode:9000/models/length_of_stay_lr_model already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
