In [26]:
import pandas as pd

In [27]:
from pyspark.sql import SparkSession
from pathlib import Path

In [28]:
# Reuse the active Spark session if there is one; otherwise create it for this notebook.
spark = (
    SparkSession.builder
    .appName("HealthcareAnalysis")
    .getOrCreate()
)


In [29]:
def read_csv_folders(folder_path, file_type):
    """Load every CSV file in a folder into one Spark DataFrame.

    Each file is read with pandas, columns whose name contains ``Unnamed``
    are dropped, and two columns are added before the files are stacked:
    ``year`` (the file name without its extension, e.g. ``"IN_WL 2018"``)
    and ``patient_type``.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv`` files.
        file_type: Label written to the ``patient_type`` column of every row.

    Returns:
        A Spark DataFrame created from the concatenated pandas frames via the
        module-level ``spark`` session.

    Raises:
        ValueError: If ``folder_path`` contains no ``*.csv`` files.
    """
    # Sort so the row order does not depend on the file system's listing order.
    csv_files = sorted(Path(folder_path).glob("*.csv"))
    if not csv_files:
        # pd.concat([]) would raise an unhelpful "No objects to concatenate".
        raise ValueError(f"No CSV files found in {folder_path!r}")

    frames = []
    for csv_file in csv_files:
        frame = pd.read_csv(csv_file)
        keep = ~frame.columns.str.contains('Unnamed')
        # .assign returns a new frame, so nothing is written into a .loc slice.
        frames.append(
            frame.loc[:, keep].assign(year=csv_file.stem, patient_type=file_type)
        )

    combined = pd.concat(frames, ignore_index=True)
    return spark.createDataFrame(combined)

In [30]:
# Load both patient-type folders and the specialty mapping file into Spark DataFrames.
inpatient_df = read_csv_folders(folder_path='../datasets/Inpatient', file_type='inpatient')
outpatient_df = read_csv_folders(folder_path='../datasets/Outpatient', file_type='outpatient')
mapping_df = spark.createDataFrame(
    pd.read_csv('../datasets/Mapping_Specialty.csv')
)




In [31]:
# Preview the first rows of the combined inpatient data.
inpatient_df.show()

+------------+--------------+--------------------+---------+-----------+-----------+-------------+-----+----------+------------+
|Archive_Date|Specialty_HIPE|      Specialty_Name|Case_Type|Adult_Child|Age_Profile|   Time_Bands|Total|      year|patient_type|
+------------+--------------+--------------------+---------+-----------+-----------+-------------+-----+----------+------------+
|  31-01-2018|             0|Small Volume Spec...|Inpatient|      Child|       0-15|   6-9 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|             0|Small Volume Spec...|Inpatient|      Child|      16-64|  9-12 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|           400|       Endocrinology| Day Case|      Child|       0-15|   3-6 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|           400|       Endocrinology| Day Case|      Child|       0-15| 12-15 Months|    1|IN_WL 2018|   inpatient|
|  31-01-2018|           600|Otolaryngology (ENT)| Day Case|      Child|       0-15|   0-3 Months

In [32]:
# Fetch the first three rows as Row objects so untruncated values are visible.
inpatient_df.take(3)

[Row(Archive_Date='31-01-2018', Specialty_HIPE=0, Specialty_Name='Small Volume Specialities', Case_Type='Inpatient', Adult_Child='Child', Age_Profile='0-15', Time_Bands='  6-9 Months', Total=1, year='IN_WL 2018', patient_type='inpatient'),
 Row(Archive_Date='31-01-2018', Specialty_HIPE=0, Specialty_Name='Small Volume Specialities', Case_Type='Inpatient', Adult_Child='Child', Age_Profile='16-64', Time_Bands='  9-12 Months', Total=1, year='IN_WL 2018', patient_type='inpatient'),
 Row(Archive_Date='31-01-2018', Specialty_HIPE=400, Specialty_Name='Endocrinology', Case_Type='Day Case', Adult_Child='Child', Age_Profile='0-15', Time_Bands='  3-6 Months', Total=1, year='IN_WL 2018', patient_type='inpatient')]

In [33]:
# Print the column names and inferred types of the inpatient frame.
inpatient_df.printSchema()

root
 |-- Archive_Date: string (nullable = true)
 |-- Specialty_HIPE: long (nullable = true)
 |-- Specialty_Name: string (nullable = true)
 |-- Case_Type: string (nullable = true)
 |-- Adult_Child: string (nullable = true)
 |-- Age_Profile: string (nullable = true)
 |-- Time_Bands: string (nullable = true)
 |-- Total: long (nullable = true)
 |-- year: string (nullable = true)
 |-- patient_type: string (nullable = true)



In [34]:
# Print the column names and inferred types of the outpatient frame.
outpatient_df.printSchema()

root
 |-- Archive_Date: string (nullable = true)
 |-- Specialty_HIPE: double (nullable = true)
 |-- Speciality: string (nullable = true)
 |-- Adult_Child: string (nullable = true)
 |-- Age_Profile: string (nullable = true)
 |-- Time_Bands: string (nullable = true)
 |-- Total: long (nullable = true)
 |-- year: string (nullable = true)
 |-- patient_type: string (nullable = true)



In [35]:
# Things to remember: the outpatient and inpatient tables differ in their number of columns.
# The column 'Case_Type' is missing from the outpatient table.
# That is an issue only if I want to union these tables, because I'll need to drop this column
# and also unify the column names 'Speciality' (outpatient) and 'Specialty_Name' (inpatient).

In [41]:
# Show which specialties appear under each source-file label.
outpatient_df.select('year', 'Speciality').show()

+----------+-----------+
|      year| Speciality|
+----------+-----------+
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018| Cardiology|
|Op_WL 2018|Dermatology|
|Op_WL 2018|Dermatology|
|Op_WL 2018|Dermatology|
|Op_WL 2018|Dermatology|
|Op_WL 2018|Dermatology|
|Op_WL 2018|Dermatology|
+----------+-----------+
only showing top 20 rows


In [44]:
# Use the column's exact name ('Case_Type') rather than relying on Spark's
# default case-insensitive column resolution.
inpatient_df.select('Case_Type', 'Total').show()

+---------+-----+
|case_type|Total|
+---------+-----+
|Inpatient|    1|
|Inpatient|    1|
| Day Case|    1|
| Day Case|    1|
| Day Case|   14|
| Day Case|    2|
| Day Case|    1|
| Day Case|    2|
| Day Case|    2|
|Inpatient|   44|
|Inpatient|   12|
|Inpatient|    5|
|Inpatient|    2|
|Inpatient|    1|
| Day Case|    1|
|Inpatient|    1|
|Inpatient|    1|
|Inpatient|    1|
| Day Case|    1|
|Inpatient|    2|
+---------+-----+
only showing top 20 rows


In [48]:
# Summary statistics (count, mean, stddev, min, max) for the two numeric columns.
outpatient_df.describe('Total', 'Specialty_HIPE').show()

+-------+------------------+--------------+
|summary|             Total|Specialty_HIPE|
+-------+------------------+--------------+
|  count|            270983|        270983|
|   mean| 80.21071063498448|           NaN|
| stddev|148.61537343757712|           NaN|
|    min|                 1|           0.0|
|    max|              4239|           NaN|
+-------+------------------+--------------+



In [53]:
# Keep only rows whose specialty code is above 500.
# NOTE(review): Spark orders NaN above every other number, so rows with a NaN
# Specialty_HIPE presumably pass this filter too — confirm before relying on it.
outpatient_df.filter(outpatient_df['Specialty_HIPE'] > 500).show()

+------------+--------------+--------------------+-----------+-----------+------------+-----+----------+------------+
|Archive_Date|Specialty_HIPE|          Speciality|Adult_Child|Age_Profile|  Time_Bands|Total|      year|patient_type|
+------------+--------------+--------------------+-----------+-----------+------------+-----+----------+------------+
|  31-01-2018|         600.0|Otolaryngology (ENT)|      Child|       0-15|  0-3 Months|  467|Op_WL 2018|  outpatient|
|  31-01-2018|         600.0|Otolaryngology (ENT)|      Child|       0-15|  3-6 Months|  365|Op_WL 2018|  outpatient|
|  31-01-2018|         600.0|Otolaryngology (ENT)|      Child|       0-15|  6-9 Months|  443|Op_WL 2018|  outpatient|
|  31-01-2018|         600.0|Otolaryngology (ENT)|      Child|       0-15| 9-12 Months|  486|Op_WL 2018|  outpatient|
|  31-01-2018|         600.0|Otolaryngology (ENT)|      Child|       0-15|12-15 Months|  364|Op_WL 2018|  outpatient|
|  31-01-2018|         600.0|Otolaryngology (ENT)|      

In [54]:
from pyspark.sql.functions import col, isnan, when, count

In [56]:
# Per column, count the rows whose value is either null or NaN.
outpatient_df.select(*[
    count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
    for c in ('Total', 'Specialty_HIPE')
]).show()

+-----+--------------+
|Total|Specialty_HIPE|
+-----+--------------+
|    0|           191|
+-----+--------------+



In [61]:
# Debug check: iterate over the two-column selection and print the Python type of each item.
# The recorded output shows every item is a pyspark Column object, not a row or a value.
# NOTE(review): presumably this works through DataFrame.__getitem__ integer indexing — confirm;
# if the goal is the columns' data types, `.dtypes` or `.printSchema()` shows them directly.
for c in outpatient_df['Total','Specialty_HIPE']:
    print(type(c))

<class 'pyspark.sql.classic.column.Column'>
<class 'pyspark.sql.classic.column.Column'>
