In [24]:
import pandas as pd
import os


from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf


from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, DoubleType

from IPython.display import display, HTML

from util import spark_helpers

In [21]:
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Spark configuration: run against a local master.
conf = SparkConf().setMaster("local")
# Optional settings, currently disabled:
# conf.set("spark.jars", "/Users/yixiangzhang/Desktop/postgresql-42.4.1.jar")
# conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Create (or reuse) the session for this notebook and grab its context.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("test")
    .getOrCreate()
)
sc = SparkContext.getOrCreate()

In [22]:
# Bare expression: shows the active SparkSession as this cell's output.
spark

In [23]:
# Collect the parquet files found in ./silver_data/.
# Matching on the ".parquet" suffix (instead of "anything that is not .csv")
# keeps stray entries such as hidden files or sub-directories out of the list,
# which matters because these paths are later handed to pd.read_parquet.
# NOTE: os.listdir returns entries in arbitrary order and other cells index
# into this list by position, so the order is deliberately left untouched.
directory = "./silver_data/"
filepathes = [
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.endswith(".parquet")
]

filepathes
    

['./silver_data/DIAGNOSES_ICD.parquet',
 './silver_data/CHARTEVENTS.parquet',
 './silver_data/D_LABITEMS.parquet',
 './silver_data/PROCEDUREEVENTS_MV.parquet',
 './silver_data/ADMISSIONS.parquet',
 './silver_data/TRANSFERS.parquet',
 './silver_data/DATETIMEEVENTS.parquet',
 './silver_data/PATIENTS.parquet',
 './silver_data/PROCEDURES_ICD.parquet',
 './silver_data/SERVICES.parquet',
 './silver_data/INPUTEVENTS_CV.parquet',
 './silver_data/MICROBIOLOGYEVENTS.parquet',
 './silver_data/D_ITEMS.parquet',
 './silver_data/LABEVENTS.parquet',
 './silver_data/DRGCODES.parquet',
 './silver_data/INPUTEVENTS_MV.parquet',
 './silver_data/ICUSTAYS.parquet',
 './silver_data/PRESCRIPTIONS.parquet',
 './silver_data/D_ICD_PROCEDURES.parquet',
 './silver_data/OUTPUTEVENTS.parquet',
 './silver_data/D_ICD_DIAGNOSES.parquet']

In [27]:
# File name of the fourth-from-last path, without directory or extension.
table_stem, _extension = os.path.splitext(os.path.basename(filepathes[-4]))
table_stem

'PRESCRIPTIONS'

In [28]:
# Lookup from the dtype labels used in the mapping files to Spark SQL types.
mappers_pandas_types_to_spark_types = dict(
    [
        ("int", IntegerType()),
        ("string", StringType()),
        ("datetime", TimestampType()),
        ("float", DoubleType()),
    ]
)

In [29]:
# Load the column mapping stored in the first listed parquet file.
df = pd.read_parquet(filepathes[0])

# Source CSV locations are taken from the first row of the mapping.
mimic3_path, mimic4_path = (
    df[path_column].iloc[0]
    for path_column in ("mimic3_filepath", "mimic4_filepath")
)

# Column names to use on each side, plus the dtype labels of the MIMIC-III columns.
mimic3_selected = df["column_mimic3"].tolist()
mimic4_selected = df["column_mimic4_candidate_1"].tolist()
dtypes = df["dtype_mimic3"].tolist()

print(mimic3_path)

../basic_filtered_data/mimic-iii-demo/DIAGNOSES_ICD.csv


In [30]:
# Read the MIMIC-III CSV with a header row; every column stays a string
# because schema inference is switched off.
df_m3 = (
    spark.read
    .options(header=True, inferSchema=False)
    .csv(mimic3_path)
)
df_m3.show()

+---+------+----------+-------+-------+---------+
|_c0|row_id|subject_id|hadm_id|seq_num|icd9_code|
+---+------+----------+-------+-------+---------+
|  0|112344|     10006| 142345|      1|    99591|
|  1|112345|     10006| 142345|      2|    99662|
|  2|112346|     10006| 142345|      3|     5672|
|  3|112347|     10006| 142345|      4|    40391|
|  4|112348|     10006| 142345|      5|    42731|
|  5|112349|     10006| 142345|      6|     4280|
|  6|112350|     10006| 142345|      7|     4241|
|  7|112351|     10006| 142345|      8|     4240|
|  8|112352|     10006| 142345|      9|     2874|
|  9|112353|     10006| 142345|     10|    03819|
| 10|112354|     10006| 142345|     11|     7850|
| 11|112355|     10006| 142345|     12|    E8791|
| 12|112356|     10006| 142345|     13|     V090|
| 13|112357|     10006| 142345|     14|    56211|
| 14|112358|     10006| 142345|     15|    28529|
| 15|112359|     10006| 142345|     16|    25000|
| 16|112360|     10006| 142345|     17|    V5867|


23/06/01 09:29:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , row_id, subject_id, hadm_id, seq_num, icd9_code
 Schema: _c0, row_id, subject_id, hadm_id, seq_num, icd9_code
Expected: _c0 but found: 
CSV file: file:///Users/yixiangzhang/Documents/AWS_SAA_C03/basic_filtered_data/mimic-iii-demo/DIAGNOSES_ICD.csv


In [9]:
# Rename the MIMIC-III columns to their MIMIC-IV counterparts, then keep only
# the mapped columns.
df_m3 = spark_helpers.rename_columns(
    data=df_m3, from_cols=mimic3_selected, to_cols=mimic4_selected
)
df_m3_selected = df_m3.select(*mimic4_selected)

df_m3_selected.show()

+----------+-------+-------+--------+
|subject_id|hadm_id|seq_num|icd_code|
+----------+-------+-------+--------+
|     10006| 142345|      1|   99591|
|     10006| 142345|      2|   99662|
|     10006| 142345|      3|    5672|
|     10006| 142345|      4|   40391|
|     10006| 142345|      5|   42731|
|     10006| 142345|      6|    4280|
|     10006| 142345|      7|    4241|
|     10006| 142345|      8|    4240|
|     10006| 142345|      9|    2874|
|     10006| 142345|     10|   03819|
|     10006| 142345|     11|    7850|
|     10006| 142345|     12|   E8791|
|     10006| 142345|     13|    V090|
|     10006| 142345|     14|   56211|
|     10006| 142345|     15|   28529|
|     10006| 142345|     16|   25000|
|     10006| 142345|     17|   V5867|
|     10006| 142345|     18|   E9342|
|     10006| 142345|     19|   41401|
|     10006| 142345|     20|    2749|
+----------+-------+-------+--------+
only showing top 20 rows



In [31]:
# Build the target schema from the mapping's dtype labels and the MIMIC-IV
# column names, then list its fields one per line.
transfers_schema = spark_helpers.create_schema(
    column_names=mimic4_selected,
    pandas_types=df["dtype_mimic3"].tolist(),
    mappers=mappers_pandas_types_to_spark_types,
)

for schema_field in transfers_schema:
    print(schema_field)

StructField('subject_id', IntegerType(), True)
StructField('hadm_id', IntegerType(), True)
StructField('seq_num', IntegerType(), True)
StructField('icd_code', StringType(), True)


In [32]:
# Apply the target schema to the selected MIMIC-III columns and confirm the result.
df_m3_casted = spark_helpers.cast_schema(schema=transfers_schema, df=df_m3_selected)
df_m3_casted.printSchema()

root
 |-- subject_id: integer (nullable = true)
 |-- hadm_id: integer (nullable = true)
 |-- seq_num: integer (nullable = true)
 |-- icd_code: string (nullable = true)



In [33]:
# Read the MIMIC-IV CSV the same way as the MIMIC-III one: header row, no
# schema inference.
df_m4 = (
    spark.read
    .options(header=True, inferSchema=False)
    .csv(mimic4_path)
)

# Keep the mapped columns and cast them with the shared target schema.
df_m4_selected = df_m4.select(*mimic4_selected)
df_m4_casted = spark_helpers.cast_schema(schema=transfers_schema, df=df_m4_selected)
df_m4_casted.printSchema()

root
 |-- subject_id: integer (nullable = true)
 |-- hadm_id: integer (nullable = true)
 |-- seq_num: integer (nullable = true)
 |-- icd_code: string (nullable = true)



In [34]:
# Collapse each side to a single partition, then append the MIMIC-III rows
# after the MIMIC-IV rows (positional union; both sides share the same columns).
m4_single_partition = df_m4_casted.coalesce(1)
m3_single_partition = df_m3_casted.coalesce(1)
df_transfer_merged = m4_single_partition.union(m3_single_partition)


In [35]:
# The target table is named after the mapping file the merged frame was built
# from. That file is filepathes[0] (the one read with pd.read_parquet), so the
# same index must be used here; any other index would store these rows under
# an unrelated table's name.
target_table = os.path.splitext(os.path.basename(filepathes[0]))[0]

# Credentials can be overridden through the environment so real secrets never
# have to be committed with the notebook; the defaults match the local setup.
spark_helpers.write_to_db(
    data_frame=df_transfer_merged,
    table_name=target_table,
    db_name="mimic",
    db_usrname=os.environ.get("MIMIC_DB_USER", "mimic"),
    db_pssword=os.environ.get("MIMIC_DB_PASSWORD", "mimic"),
    port=6432,
)