# Tentativa #1

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier  # or any other algorithm
from pyspark.ml.evaluation import BinaryClassificationEvaluator  # or MulticlassClassificationEvaluator

## 0. Initialize Spark Session

In [56]:
spark = SparkSession.builder \
    .appName("BigDataMLProject") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

## 1. Load Data

## 2. Data Preprocessing

The tables were chose based on its properties regarding a persons' illness, bodily atributes or gravity of the situation. The tables were:
* ChartEVents
* Admissions
* Diagnoses
* ICUStays
* InputEvents_MV
* Procedures

### a. Feature Engineering

The majoriry of the atributes are not useful to predict the duration of a ICU stay, therefore, for a initial analysis, we will choose the ones we consider relevant.

First of all, we create temporary views of all tables into a SQL-like table.

In [74]:
chartevents.createOrReplaceTempView("chartevents")
admissions.createOrReplaceTempView("admissions")
diagnoses.createOrReplaceTempView("diagnoses")
icustays.createOrReplaceTempView("icustays")
inputevents.createOrReplaceTempView("inputevents")
labevents.createOrReplaceTempView("labevents")
procedures.createOrReplaceTempView("procedures")

SO QUEREMOS PESSOAS QUE TENHAM ICUSTAY_ID PQ: given a person admitted int he icu i have to predict for how long they are gonna be there, therefore in pre processing i will ignore the data from people that were never in the icu. The most efficient approach is to create temporary views that pre-filter the data before joining tables.

In [75]:
spark.sql("CREATE OR REPLACE TEMPORARY VIEW icu_filtered AS SELECT * FROM icustays WHERE icustay_id IS NOT NULL")
spark.sql("CREATE OR REPLACE TEMPORARY VIEW chart_filtered AS SELECT * FROM chartevents WHERE icustay_id IS NOT NULL")
spark.sql("CREATE OR REPLACE TEMPORARY VIEW input_filtered AS SELECT * FROM inputevents WHERE icustay_id IS NOT NULL")

DataFrame[]

To get procedures done to icu patients

In [None]:
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW proc_filtered AS
SELECT 
    p.icd9_code,
    icu.icustay_id
FROM procedures p
JOIN icustays icu ON p.hadm_id = icu.hadm_id
""")

++
||
++
++



To handle labevents time to be only during the icustay

In [77]:
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW lab_filtered AS
SELECT 
    lab.subject_id,
    lab.hadm_id,
    lab.itemid,
    lab.valuenum,
    lab.valueuom,
    icu.icustay_id 
FROM labevents lab
JOIN admissions adm ON lab.hadm_id = adm.hadm_id
JOIN icustays icu ON adm.hadm_id = icu.hadm_id
WHERE 
    lab.charttime BETWEEN icu.intime AND icu.outtime
    AND lab.hadm_id IS NOT NULL
""")

DataFrame[]

Then, we create a table, using a SQL query, with the relevant features for predicting the icu stay duration, and its own column (LOS from ICUStays).

In [78]:
df = spark.sql("""
    SELECT 
        icu.icustay_id,
        adm.admission_type,
        adm.admission_location,
        chart.itemid AS chart_itemid,
        chart.valuenum AS chart_value,
        chart.error,
        lab.itemid AS lab_itemid,
        lab.valuenum AS lab_value,
        diag.seq_num,
        diag.icd9_code as diagnose_code,
        input.itemid AS input_itemid,
        input.amount,
        input.rate,
        input.patientweight,
        proc.icd9_code as procedure_code,
        icu.first_careunit,
        icu.LOS
    FROM icu_filtered icu
    JOIN admissions adm ON icu.hadm_id = adm.hadm_id
    LEFT JOIN chart_filtered chart ON icu.icustay_id = chart.icustay_id
    LEFT JOIN lab_filtered lab ON icu.icustay_id = lab.icustay_id
    LEFT JOIN diagnoses diag ON icu.hadm_id = diag.hadm_id
    LEFT JOIN input_filtered input ON icu.icustay_id = input.icustay_id
    LEFT JOIN proc_filtered proc ON proc.icustay_id=icu.icustay_id
""")
df.show()

+----------+--------------+--------------------+------------+-----------+-----+----------+---------+-------+-------------+------------+------+----+-------------+--------------+--------------+-------+
|icustay_id|admission_type|  admission_location|chart_itemid|chart_value|error|lab_itemid|lab_value|seq_num|diagnose_code|input_itemid|amount|rate|patientweight|procedure_code|first_careunit|    LOS|
+----------+--------------+--------------------+------------+-----------+-----+----------+---------+-------+-------------+------------+------+----+-------------+--------------+--------------+-------+
|    285977|        URGENT|TRANSFER FROM HOS...|        NULL|       NULL| NULL|      NULL|     NULL|   NULL|         NULL|        NULL|  NULL|NULL|         NULL|          NULL|          CSRU| 0.8181|
|    279205|     EMERGENCY|EMERGENCY ROOM ADMIT|        NULL|       NULL| NULL|      NULL|     NULL|   NULL|         NULL|        NULL|  NULL|NULL|         NULL|          NULL|         TSICU| 6.6602|


merdas para justificar no relatorio:

Juntei as tabelas icu e admissions por admissao ao hospital e nao por pessoa, pq pode haver mais doq uma admissao por pessoa. Fiz outter join pelo msm motivo, mas dps tenho de testar isso com os dados todos e nao so com as tabelas parciais

na chartevents, juntei por icu stay, Each row associated with one ITEMID (e.g. 212) corresponds to an instantiation of the same measurement (e.g. heart rate) entao nao vale a pena por a coluna VALUEUOM is the unit of measurement, pq cada teste ja e feito numa unidade de medida, n vale a pena tar a dar mais do msm a ml. Vou por o erro pq se houve erro ent conta menos para a ml mas ig que e melhor q nada??? problema para a ml e nao para nos.

aqui cada chartevent, ou seja, analie/teste/raiox etc corresponde as uma linha, mais tarde temos de resolver este problema pq o objetivo e prever o tempo de internacao POR ICUSTAY_ID, ent deviamos ter apenas uma linha por admissao//icustay

na tabela dos diagnosticos SEQ_NUM provides the order in which the ICD diagnoses relate to the patient, o quao importante a doenca e no caso da pessoa, e ICD9_CODE contains the actual code corresponding to the diagnosis, ou seja a doenca id, por isso para dar join dou por paciente ou por admissao? aceito opinioes, por agr pus admissoes pq e oq pus no resto

input events dei join pela icustay ja que so queremos analisar pacientes que ja tenham estado. Each row associated with one ITEMID which corresponds to an instantiation of the same measurement (e.g. norepinephrine) AMOUNT - the amount of a drug or substance administered to the patient (ignorando se esta em ml dl ou l pelo motivo referido acima)


When predicting ICU length of stay (LOS), incorporating lab results can significantly improve your model's performance, but you're right that joining requires careful handling since LABEVENTS doesn't directly contain ICU stay IDs. juntar pelo ham id ou subject e dps escolher com base no tempo


the id is the icustays

justificar o left join:
For your use case, keep the LEFT JOINs but understand why:

    LEFT JOIN (icu → others) is correct because:

        You want all ICU stays (base table)

        You want to keep ICU stays even if they're missing some diagnoses/procedures

    Don't use FULL OUTER JOIN because:

        It would include diagnoses/procedures for non-ICU patients (if any exist)

        Could create NULL ICU stay records which you don't want

    Your current approach is good for:

        One row per combination of ICU stay + diagnosis + procedure

        Preserving all relationships

Example of What You'll Get

For a patient with:

    1 ICU stay

    4 diagnoses

    5 procedures

    10 lab results

Your query will produce 4 × 5 × 10 = 200 rows for this patient (all combinations).


FALTA:
altura / bmi do paciente
genero do paciente

Attribute/column documentation for the table created:

 1. Patient Identification
- **icustay_id**: Unique ICU stay identifier (primary key for ICU stays)

 2. Admission Information
- **admission_type**: Type of admission (ELECTIVE, EMERGENCY, URGENT, etc.)
- **admission_location**: Source of admission (TRANSFER FROM HOSPITAL, CLINIC REFERRAL, etc.)
- **first_careunit**: Initial ICU care unit (MICU, SICU, CSRU, etc.)
- **LOS**: Length of stay in ICU (in hours or days)

 3. Clinical Measurements
- **chart_itemid**: Identifier for charted measurement (foreign key to D_ITEMS)
- **chart_value**: Numeric value of the clinical measurement
- **chart_error**: Error flag for the measurement (if exists)
- **lab_itemid**: Identifier for laboratory test (foreign key to D_ITEMS)
- **lab_value**: Result value of the laboratory test

 4. Diagnostic Information
- **diagnose_code**: ICD-9 diagnosis code
- **seq_num**: Priority/sequence number of the diagnosis (1=primary)

 5. Treatment Information
- **input_itemid**: Identifier for input event (medications/fluids)
- **input_amount**: Quantity administered
- **input_rate**: Rate of administration
- **input_patientweight**: Patient weight at time of input (if recorded)

 6. Procedural Information
- **procedure_code**: ICD-9 procedure code performed during stay

transform your multiple rows per ICU stay into a single row format suitable for ML modeling while preserving both continuous values and procedure/diagnosis information

### a partir daqui preciso de dados bons para nao estar a fazer ghostcode

Handle LabEvents

In [None]:
df.cache()

df.createOrReplaceTempView("icu_data")
lab_features = spark.sql("""
    SELECT 
        icd9_code,
        AVG(lab_value) as lab_value
    FROM icu_data
    WHERE lab_itemid IS NOT NULL
    GROUP BY icustay_id, lab_itemid
""")
lab_features.show()

25/05/16 19:06:03 WARN CacheManager: Asked to cache already cached data.


+----------+----------+---------+
|icustay_id|lab_itemid|lab_value|
+----------+----------+---------+
+----------+----------+---------+



Handle Procedures

In [None]:
common_procs = spark.sql("""
SELECT 
    procedure_code,
    COUNT(*) as freq
FROM icu_data
WHERE procedure_code IS NOT NULL
GROUP BY procedure_code
ORDER BY freq DESC
LIMIT 100  
""")

common_procs = spark.sql("""
SELECT 
    procedure_code,
    COUNT(*) as freq
FROM icu_data
WHERE procedure_code IS NOT NULL
GROUP BY procedure_code
ORDER BY freq DESC
LIMIT 100  
""")

common_procs.show()

+--------------+----+
|procedure_code|freq|
+--------------+----+
+--------------+----+



In [None]:
proc_features = df.select("icustay_id").distinct()
for proc in common_procs:
    proc_code = proc['procedure_code']
    has_proc = df.filter(F.col("procedure_code") == proc_code) \
                .select("icustay_id").distinct() \
                .withColumn(f"proc_{proc_code}", F.lit(1))
    
    proc_features = proc_features.join(
        has_proc, 
        on="icustay_id", 
        how="left"
    ).fillna(0, subset=[f"proc_{proc_code}"])

proc_features.show()

### b. Handle categorical columns (example)

In [None]:

categorical_cols = [col for col in df.columns if df.schema[col].dataType == StringType() and col != target_col]
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in categorical_cols]

NameError: name 'StringType' is not defined

### c. Assemble features (numeric + indexed categorical)

In [None]:
numeric_cols = [col for col in df.columns if col != target_col and col not in categorical_cols]
assembler = VectorAssembler(
    inputCols=numeric_cols + [col+"_index" for col in categorical_cols],
    outputCol="features_raw"
)

### d. Scale features (optional but recommended)

In [None]:
scaler = StandardScaler(inputCol="features_raw", outputCol="features")

## 3. Target variable preparation (if categorical)

In [None]:
label_indexer = StringIndexer(inputCol=target_col, outputCol="label")

## 4. Create Pipeline

In [None]:
pipeline_stages = indexers + [assembler, scaler, label_indexer]
preprocessing_pipeline = Pipeline(stages=pipeline_stages)

In [None]:
processed_data = preprocessing_pipeline.fit(df).transform(df)

## 5. Train-Test Split

In [None]:
train_data, test_data = processed_data.randomSplit([0.7, 0.3], seed=42)
print(f"Training count: {train_data.count()}")
print(f"Test count: {test_data.count()}")

## 6. Model Training

In [None]:
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=100,
    maxDepth=10,
    seed=42
)

# Train model
model = rf.fit(train_data)

## 7. Predictions

In [None]:
predictions = model.transform(test_data)

## 8. Evaluation

In [None]:
evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions)
print(f"Test AUC = {auc}")

In [None]:
# For multiclass classification
# evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print(f"Test Accuracy = {accuracy}")

## 9. Close Session

In [None]:
spark.stop()