# Tentativa #1

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier  # or any other algorithm
from pyspark.ml.evaluation import BinaryClassificationEvaluator  # or MulticlassClassificationEvaluator

## 0. Initialize Spark Session

In [7]:
spark = SparkSession.builder \
    .appName("BigDataMLProject") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

## 1. Load Data

In [8]:
#df = spark.read.csv("CHARTEVENTS.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniCHARTEVENTS.csv", mode="overwrite", header=True)

In [9]:
chartevents = spark.read.csv("miniCHARTEVENTS.csv", header=True, inferSchema=True)
chartevents.head()



In [10]:
print("Schema:")
chartevents.printSchema()
print("Sample data:")
chartevents.show(5)

Schema:
root
 |-- ROW_ID: integer (nullable = true)
 |-- SUBJECT_ID: integer (nullable = true)
 |-- HADM_ID: integer (nullable = true)
 |-- ICUSTAY_ID: integer (nullable = true)
 |-- ITEMID: integer (nullable = true)
 |-- CHARTTIME: timestamp (nullable = true)
 |-- STORETIME: timestamp (nullable = true)
 |-- CGID: integer (nullable = true)
 |-- VALUE: double (nullable = true)
 |-- VALUENUM: double (nullable = true)
 |-- VALUEUOM: string (nullable = true)
 |-- ERROR: integer (nullable = true)
 |-- RESULTSTATUS: string (nullable = true)
 |-- STOPPED: string (nullable = true)

Sample data:
+------+----------+-------+----------+------+-------------------+-------------------+-----+-----+--------+--------+-------+-----+------------+-------+
+------+----------+-------+----------+------+-------------------+-------------------+-----+-----+--------+--------+-------+-----+------------+-------+
|   795|        36| 165660|    241249|224663|2134-05-12 12:00:00|2134-05-12 14:44:00|17525|  8.0|     8.

In [11]:
#df = spark.read.csv("ICUSTAYS.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniICUSTAYS.csv", mode="overwrite", header=True)

In [12]:
icustays = spark.read.csv("miniICUSTAYS.csv", header=True, inferSchema=True)
icustays.head()

Row(ROW_ID=372, SUBJECT_ID=275, HADM_ID=129886, ICUSTAY_ID=219649, DBSOURCE='carevue', FIRST_CAREUNIT='CCU', LAST_CAREUNIT='CCU', FIRST_WARDID=7, LAST_WARDID=7, INTIME=datetime.datetime(2170, 10, 7, 11, 28, 53), OUTTIME=datetime.datetime(2170, 10, 14, 14, 38, 7), LOS=7.1314)

In [13]:
print("Schema:")
icustays.printSchema()
print("Sample data:")
icustays.show(5)

Schema:
root
 |-- ROW_ID: integer (nullable = true)
 |-- SUBJECT_ID: integer (nullable = true)
 |-- HADM_ID: integer (nullable = true)
 |-- ICUSTAY_ID: integer (nullable = true)
 |-- DBSOURCE: string (nullable = true)
 |-- FIRST_CAREUNIT: string (nullable = true)
 |-- LAST_CAREUNIT: string (nullable = true)
 |-- FIRST_WARDID: integer (nullable = true)
 |-- LAST_WARDID: integer (nullable = true)
 |-- INTIME: timestamp (nullable = true)
 |-- OUTTIME: timestamp (nullable = true)
 |-- LOS: double (nullable = true)

Sample data:
+------+----------+-------+----------+--------+--------------+-------------+------------+-----------+-------------------+-------------------+-------+
|ROW_ID|SUBJECT_ID|HADM_ID|ICUSTAY_ID|DBSOURCE|FIRST_CAREUNIT|LAST_CAREUNIT|FIRST_WARDID|LAST_WARDID|             INTIME|            OUTTIME|    LOS|
+------+----------+-------+----------+--------+--------------+-------------+------------+-----------+-------------------+-------------------+-------+
|   372|       275| 

In [14]:
#df = spark.read.csv("ADMISSIONS.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniADMISSIONS.csv", mode="overwrite", header=True)

In [15]:
admissions = spark.read.csv("miniADMISSIONS.csv", header=True, inferSchema=True)
admissions.head()

Row(ROW_ID=28, SUBJECT_ID=28, HADM_ID=162569, ADMITTIME=datetime.datetime(2177, 9, 1, 7, 15), DISCHTIME=datetime.datetime(2177, 9, 6, 16, 0), DEATHTIME=None, ADMISSION_TYPE='ELECTIVE', ADMISSION_LOCATION='PHYS REFERRAL/NORMAL DELI', DISCHARGE_LOCATION='HOME HEALTH CARE', INSURANCE='Medicare', LANGUAGE=None, RELIGION='CATHOLIC', MARITAL_STATUS='MARRIED', ETHNICITY='WHITE', EDREGTIME=None, EDOUTTIME=None, DIAGNOSIS='CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS GRAFT/SDA', HOSPITAL_EXPIRE_FLAG=0, HAS_CHARTEVENTS_DATA=1)

In [16]:
print("Schema:")
admissions.printSchema()
print("Sample data:")
admissions.show(5)

Schema:
root
 |-- ROW_ID: integer (nullable = true)
 |-- SUBJECT_ID: integer (nullable = true)
 |-- HADM_ID: integer (nullable = true)
 |-- ADMITTIME: timestamp (nullable = true)
 |-- DISCHTIME: timestamp (nullable = true)
 |-- DEATHTIME: timestamp (nullable = true)
 |-- ADMISSION_TYPE: string (nullable = true)
 |-- ADMISSION_LOCATION: string (nullable = true)
 |-- DISCHARGE_LOCATION: string (nullable = true)
 |-- INSURANCE: string (nullable = true)
 |-- LANGUAGE: string (nullable = true)
 |-- RELIGION: string (nullable = true)
 |-- MARITAL_STATUS: string (nullable = true)
 |-- ETHNICITY: string (nullable = true)
 |-- EDREGTIME: timestamp (nullable = true)
 |-- EDOUTTIME: timestamp (nullable = true)
 |-- DIAGNOSIS: string (nullable = true)
 |-- HOSPITAL_EXPIRE_FLAG: integer (nullable = true)
 |-- HAS_CHARTEVENTS_DATA: integer (nullable = true)

Sample data:
+------+----------+-------+-------------------+-------------------+---------+--------------+--------------------+-----------------

In [17]:
#df = spark.read.csv("LABEVENTS.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniLABEVENTS.csv", mode="overwrite", header=True)

In [18]:
labevents = spark.read.csv("miniADMISSIONS.csv", header=True, inferSchema=True)
labevents.head()
print("Schema:")
labevents.printSchema()
print("Sample data:")
labevents.show(5)

Schema:
root
 |-- ROW_ID: integer (nullable = true)
 |-- SUBJECT_ID: integer (nullable = true)
 |-- HADM_ID: integer (nullable = true)
 |-- ADMITTIME: timestamp (nullable = true)
 |-- DISCHTIME: timestamp (nullable = true)
 |-- DEATHTIME: timestamp (nullable = true)
 |-- ADMISSION_TYPE: string (nullable = true)
 |-- ADMISSION_LOCATION: string (nullable = true)
 |-- DISCHARGE_LOCATION: string (nullable = true)
 |-- INSURANCE: string (nullable = true)
 |-- LANGUAGE: string (nullable = true)
 |-- RELIGION: string (nullable = true)
 |-- MARITAL_STATUS: string (nullable = true)
 |-- ETHNICITY: string (nullable = true)
 |-- EDREGTIME: timestamp (nullable = true)
 |-- EDOUTTIME: timestamp (nullable = true)
 |-- DIAGNOSIS: string (nullable = true)
 |-- HOSPITAL_EXPIRE_FLAG: integer (nullable = true)
 |-- HAS_CHARTEVENTS_DATA: integer (nullable = true)

Sample data:
+------+----------+-------+-------------------+-------------------+---------+--------------+--------------------+-----------------

In [19]:
#df = spark.read.csv("DIAGNOSES_ICD.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniDIAGNOSES_ICD.csv", mode="overwrite", header=True)

In [20]:
diagnoses = spark.read.csv("miniDIAGNOSES_ICD.csv", header=True, inferSchema=True)
diagnoses.head()
print("Schema:")
diagnoses.printSchema()
print("Sample data:")
diagnoses.show(5)

Schema:
root
 |-- row_id: integer (nullable = true)
 |-- subject_id: integer (nullable = true)
 |-- hadm_id: integer (nullable = true)
 |-- seq_num: integer (nullable = true)
 |-- icd9_code: string (nullable = true)

Sample data:
+------+----------+-------+-------+---------+
|row_id|subject_id|hadm_id|seq_num|icd9_code|
+------+----------+-------+-------+---------+
|112351|     10006| 142345|      8|     4240|
|112360|     10006| 142345|     17|    V5867|
|112362|     10006| 142345|     19|    41401|
|112458|     10019| 177759|      1|     0389|
|112466|     10019| 177759|      9|    78559|
+------+----------+-------+-------+---------+
only showing top 5 rows



In [21]:
#df = spark.read.csv("PROCEDURES_ICD.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniPROCEDURES_ICD.csv", mode="overwrite", header=True)

In [22]:
procedures = spark.read.csv("miniPROCEDURES_ICD.csv", header=True, inferSchema=True)
procedures.head()
print("Schema:")
procedures.printSchema()
print("Sample data:")
procedures.show(5)

Schema:
root
 |-- row_id: integer (nullable = true)
 |-- subject_id: integer (nullable = true)
 |-- hadm_id: integer (nullable = true)
 |-- seq_num: integer (nullable = true)
 |-- icd9_code: integer (nullable = true)

Sample data:
+------+----------+-------+-------+---------+
|row_id|subject_id|hadm_id|seq_num|icd9_code|
+------+----------+-------+-------+---------+
|  7354|     10130| 156668|      2|     4623|
|  9845|     10069| 146672|      5|     9604|
|  9847|     10069| 146672|      7|      331|
| 47340|     10006| 142345|      6|     9907|
| 51600|     10074| 170119|      4|     8847|
+------+----------+-------+-------+---------+
only showing top 5 rows



In [23]:
#df = spark.read.csv("INPUTEVENTS_MV.csv", header=True, inferSchema=True)

#small_df = df.sample(fraction=0.1, seed=42).limit(400)

#small_df.write.csv("miniINPUTEVENTS_MV.csv", mode="overwrite", header=True)

In [24]:
inputevents = spark.read.csv("miniINPUTEVENTS_MV.csv", header=True, inferSchema=True)
inputevents.head()
print("Schema:")
inputevents.printSchema()
print("Sample data:")
inputevents.show(5)

Schema:
root
 |-- ROW_ID: integer (nullable = true)
 |-- SUBJECT_ID: integer (nullable = true)
 |-- HADM_ID: integer (nullable = true)
 |-- ICUSTAY_ID: integer (nullable = true)
 |-- STARTTIME: timestamp (nullable = true)
 |-- ENDTIME: timestamp (nullable = true)
 |-- ITEMID: integer (nullable = true)
 |-- AMOUNT: double (nullable = true)
 |-- AMOUNTUOM: string (nullable = true)
 |-- RATE: double (nullable = true)
 |-- RATEUOM: string (nullable = true)
 |-- STORETIME: timestamp (nullable = true)
 |-- CGID: integer (nullable = true)
 |-- ORDERID: integer (nullable = true)
 |-- LINKORDERID: integer (nullable = true)
 |-- ORDERCATEGORYNAME: string (nullable = true)
 |-- SECONDARYORDERCATEGORYNAME: string (nullable = true)
 |-- ORDERCOMPONENTTYPEDESCRIPTION: string (nullable = true)
 |-- ORDERCATEGORYDESCRIPTION: string (nullable = true)
 |-- PATIENTWEIGHT: double (nullable = true)
 |-- TOTALAMOUNT: double (nullable = true)
 |-- TOTALAMOUNTUOM: string (nullable = true)
 |-- ISOPENBAG: inte

## 2. Data Preprocessing

### a. Choose columns to use

### b. Handle categorical columns (example)

In [25]:

categorical_cols = [col for col in df.columns if df.schema[col].dataType == StringType() and col != target_col]
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in categorical_cols]

NameError: name 'StringType' is not defined

### c. Assemble features (numeric + indexed categorical)

In [None]:
numeric_cols = [col for col in df.columns if col != target_col and col not in categorical_cols]
assembler = VectorAssembler(
    inputCols=numeric_cols + [col+"_index" for col in categorical_cols],
    outputCol="features_raw"
)

### d. Scale features (optional but recommended)

In [None]:
scaler = StandardScaler(inputCol="features_raw", outputCol="features")

## 3. Target variable preparation (if categorical)

In [None]:
label_indexer = StringIndexer(inputCol=target_col, outputCol="label")

## 4. Create Pipeline

In [None]:
pipeline_stages = indexers + [assembler, scaler, label_indexer]
preprocessing_pipeline = Pipeline(stages=pipeline_stages)

In [None]:
processed_data = preprocessing_pipeline.fit(df).transform(df)

## 5. Train-Test Split

In [None]:
train_data, test_data = processed_data.randomSplit([0.7, 0.3], seed=42)
print(f"Training count: {train_data.count()}")
print(f"Test count: {test_data.count()}")

## 6. Model Training

In [None]:
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=100,
    maxDepth=10,
    seed=42
)

# Train model
model = rf.fit(train_data)

## 7. Predictions

In [None]:
predictions = model.transform(test_data)

## 8. Evaluation

In [None]:
evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions)
print(f"Test AUC = {auc}")

In [None]:
# For multiclass classification
# evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print(f"Test Accuracy = {accuracy}")

## 9. Close Session

In [None]:
spark.stop()