In [1]:
import pandas as pd
import os


from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf


from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, DoubleType

from IPython.display import display, HTML

from util import spark_helpers


In [6]:
def main():
    # start-up spark
    
    conf = SparkConf()  # create the configuration
    conf.setMaster("local")
    # conf.set("spark.jars", "/Users/yixiangzhang/Desktop/postgresql-42.4.1.jar")
    # conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    spark = SparkSession.builder\
                        .config(conf = conf)\
                        .appName('test').getOrCreate()
    sc = SparkContext.getOrCreate()

    directory = "./silver_data/"
    filepathes = []
    for filename in os.listdir(directory):
        if filename[-4:] == ".csv": continue

        filepathes.append(os.path.join(directory,filename))

    mappers_pandas_types_to_spark_types = {
        "int": IntegerType(),
        "string": StringType(),
        "datetime": TimestampType(),
        "float": DoubleType()
    }
    
    for i,filepath in enumerate(filepathes):
        
        print(f"{filepath}: start")
        if i == 3: continue
        if filepath == "./silver_data/PRESCRIPTIONS.parquet": continue
        
        df = pd.read_parquet(filepath)
        
        mimic3_path = df["mimic3_filepath"].iloc[0]
        mimic3_selected = df["column_mimic3"].to_list()
        mimic4_path = df["mimic4_filepath"].iloc[0]
        mimic4_selected = df["column_mimic4_candidate_1"].to_list()
        dtypes = df["dtype_mimic3"].to_list()

        df_m3 = spark.read.csv(mimic3_path, 
                       header=True,
                       inferSchema=False)

        df_m3 = spark_helpers.rename_columns(data = df_m3,
                                from_cols=mimic3_selected,
                                to_cols=mimic4_selected)

        df_m3_selected = df_m3.select(mimic4_selected)

        transfers_schema = spark_helpers.create_schema(pandas_types =df["dtype_mimic3"].to_list(),
                                                        mappers = mappers_pandas_types_to_spark_types,
                                                        column_names=mimic4_selected)
        
        df_m3_casted = spark_helpers.cast_schema(df = df_m3_selected,
                                schema = transfers_schema
                                )
        
        df_m4 = spark.read.csv(mimic4_path, 
                       header=True,
                       inferSchema=False)

        df_m4_selected = df_m4.select(mimic4_selected)
        df_m4_casted = spark_helpers.cast_schema(df = df_m4_selected,
                                schema = transfers_schema
                                )

        df_transfer_merged = df_m4_casted.coalesce(1).union(df_m3_casted.coalesce(1))

        spark_helpers.write_to_db(data_frame=df_transfer_merged,
                        table_name=os.path.splitext(os.path.basename(filepath))[0],
                        db_name="mimic",
                        db_usrname="mimic",
                        db_pssword="mimic",
                        port=6432)
        print(f"{filepath}: end")
if __name__ == "__main__":
    main()


./silver_data/DIAGNOSES_ICD.parquet: start
./silver_data/DIAGNOSES_ICD.parquet: end
./silver_data/CHARTEVENTS.parquet: start


                                                                                

./silver_data/CHARTEVENTS.parquet: end
./silver_data/D_LABITEMS.parquet: start
./silver_data/D_LABITEMS.parquet: end
./silver_data/PROCEDUREEVENTS_MV.parquet: start
./silver_data/ADMISSIONS.parquet: start
./silver_data/ADMISSIONS.parquet: end
./silver_data/TRANSFERS.parquet: start
./silver_data/TRANSFERS.parquet: end
./silver_data/DATETIMEEVENTS.parquet: start
./silver_data/DATETIMEEVENTS.parquet: end
./silver_data/PATIENTS.parquet: start
./silver_data/PATIENTS.parquet: end
./silver_data/PROCEDURES_ICD.parquet: start
./silver_data/PROCEDURES_ICD.parquet: end
./silver_data/SERVICES.parquet: start
./silver_data/SERVICES.parquet: end
./silver_data/INPUTEVENTS_CV.parquet: start


                                                                                

./silver_data/INPUTEVENTS_CV.parquet: end
./silver_data/MICROBIOLOGYEVENTS.parquet: start
./silver_data/MICROBIOLOGYEVENTS.parquet: end
./silver_data/D_ITEMS.parquet: start
./silver_data/D_ITEMS.parquet: end
./silver_data/LABEVENTS.parquet: start


                                                                                

./silver_data/LABEVENTS.parquet: end
./silver_data/DRGCODES.parquet: start
./silver_data/DRGCODES.parquet: end
./silver_data/INPUTEVENTS_MV.parquet: start


                                                                                

./silver_data/INPUTEVENTS_MV.parquet: end
./silver_data/ICUSTAYS.parquet: start
./silver_data/ICUSTAYS.parquet: end
./silver_data/PRESCRIPTIONS.parquet: start
./silver_data/D_ICD_PROCEDURES.parquet: start


                                                                                

./silver_data/D_ICD_PROCEDURES.parquet: end
./silver_data/OUTPUTEVENTS.parquet: start
./silver_data/OUTPUTEVENTS.parquet: end
./silver_data/D_ICD_DIAGNOSES.parquet: start


[Stage 199:>                                                        (0 + 1) / 2]

./silver_data/D_ICD_DIAGNOSES.parquet: end


                                                                                