In [26]:
import pandas as pd

In [27]:
from pyspark.sql import SparkSession
from pathlib import Path

In [28]:
# getOrCreate() returns the already-running session when there is one, so
# re-running this cell does not start a second Spark application.
spark = (
    SparkSession.builder
    .appName("HealthcareAnalysis")
    .getOrCreate()
)


In [29]:
def read_csv_folders(folder_path, file_type):
    """Load every ``*.csv`` file in a folder into one Spark DataFrame.

    Each file is read with pandas, columns whose name contains ``'Unnamed'``
    (the label pandas gives to header-less columns such as a saved index) are
    dropped, and two columns are added before the per-file frames are
    concatenated:

    * ``year`` -- the file name without its extension. Note this is the whole
      stem (e.g. ``'IN_WL 2018'``), not just the four-digit year.
    * ``patient_type`` -- the ``file_type`` argument, repeated on every row.

    Args:
        folder_path: Directory to search (non-recursively) for CSV files.
        file_type: Label stored in the ``patient_type`` column.

    Returns:
        The result of ``spark.createDataFrame`` on the combined pandas frame.
        Relies on the notebook-level ``spark`` session.

    Raises:
        ValueError: If ``folder_path`` contains no ``*.csv`` files.
    """
    # glob() yields files in arbitrary, filesystem-dependent order; sorting
    # keeps the row order of the result reproducible between runs.
    csv_files = sorted(Path(folder_path).glob("*.csv"))
    if not csv_files:
        # Fail with the folder name instead of pd.concat's generic
        # "No objects to concatenate" so a wrong path is easy to spot.
        raise ValueError(f"No CSV files found in '{folder_path}'")

    frames = []
    for csv_file in csv_files:
        pdf = pd.read_csv(csv_file)
        keep = ~pdf.columns.str.contains('Unnamed')
        # assign() builds a new frame, avoiding the SettingWithCopyWarning
        # that column assignment on a .loc slice can raise.
        frames.append(
            pdf.loc[:, keep].assign(year=csv_file.stem, patient_type=file_type)
        )

    combined = pd.concat(frames, ignore_index=True)
    return spark.createDataFrame(combined)

In [None]:
# Build one Spark DataFrame per waiting-list folder, tagging rows with the patient type.
inpatient_df = read_csv_folders(folder_path='../datasets/Inpatient', file_type='inpatient')
outpatient_df = read_csv_folders(folder_path='../datasets/Outpatient', file_type='outpatient')

# The specialty mapping is a single CSV, so it is read directly and handed to Spark.
mapping_pdf = pd.read_csv('../datasets/Mapping_Specialty.csv')
mapping_df = spark.createDataFrame(mapping_pdf)




In [None]:
# Print a tabular preview of the inpatient rows; long strings are cut off in
# this view (e.g. "Small Volume Spec...").
inpatient_df.show()

+------------+--------------+--------------------+---------+-----------+-----------+-------------+-----+----------+------------+
|Archive_Date|Specialty_HIPE|      Specialty_Name|Case_Type|Adult_Child|Age_Profile|   Time_Bands|Total|      year|patient_type|
+------------+--------------+--------------------+---------+-----------+-----------+-------------+-----+----------+------------+
|  31-01-2018|             0|Small Volume Spec...|Inpatient|      Child|       0-15|   6-9 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|             0|Small Volume Spec...|Inpatient|      Child|      16-64|  9-12 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|           400|       Endocrinology| Day Case|      Child|       0-15|   3-6 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|           400|       Endocrinology| Day Case|      Child|       0-15| 12-15 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|           600|Otolaryngology (ENT)| Day Case|      Child|       0-15|   0-3 Months

In [None]:
# Fetch the first three rows as Row objects. Values are shown in full here,
# which reveals leading spaces in Time_Bands (e.g. '  6-9 Months').
inpatient_df.head(3)

[Row(Archive_Date='31-01-2018', Specialty_HIPE=0, Specialty_Name='Small Volume Specialities', Case_Type='Inpatient', Adult_Child='Child', Age_Profile='0-15', Time_Bands='  6-9 Months', Total=1, year='IN_WL 2018', patient_type='inpatient'),
 Row(Archive_Date='31-01-2018', Specialty_HIPE=0, Specialty_Name='Small Volume Specialities', Case_Type='Inpatient', Adult_Child='Child', Age_Profile='16-64', Time_Bands='  9-12 Months', Total=1, year='IN_WL 2018', patient_type='inpatient'),
 Row(Archive_Date='31-01-2018', Specialty_HIPE=400, Specialty_Name='Endocrinology', Case_Type='Day Case', Adult_Child='Child', Age_Profile='0-15', Time_Bands='  3-6 Months', Total=1, year='IN_WL 2018', patient_type='inpatient')]

In [None]:
# Inspect the column names and types of the inpatient DataFrame.
inpatient_df.printSchema()

root
 |-- Archive_Date: string (nullable = true)
 |-- Specialty_HIPE: long (nullable = true)
 |-- Specialty_Name: string (nullable = true)
 |-- Case_Type: string (nullable = true)
 |-- Adult_Child: string (nullable = true)
 |-- Age_Profile: string (nullable = true)
 |-- Time_Bands: string (nullable = true)
 |-- Total: long (nullable = true)
 |-- year: string (nullable = true)
 |-- patient_type: string (nullable = true)



In [None]:
# Same check for outpatient. Compared with inpatient: Specialty_HIPE is double
# rather than long, the name column is 'Speciality', and Case_Type is absent.
outpatient_df.printSchema()

root
 |-- Archive_Date: string (nullable = true)
 |-- Specialty_HIPE: double (nullable = true)
 |-- Speciality: string (nullable = true)
 |-- Adult_Child: string (nullable = true)
 |-- Age_Profile: string (nullable = true)
 |-- Time_Bands: string (nullable = true)
 |-- Total: long (nullable = true)
 |-- year: string (nullable = true)
 |-- patient_type: string (nullable = true)



In [None]:
# Things to remember: the outpatient and inpatient tables differ in their number of columns.
# The 'Case_Type' column is missing from the outpatient table.
# That is an issue only if I want to union these tables, because I'll need to drop this column
# and also unify the column names, e.g. 'Speciality' and 'Specialty_Name'.