In [49]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#  Change 'root' to the name of your user; '12345' to the password of your connection, 'seriousmd' to the name of your schema
engine = create_engine('mysql://root:12345@localhost/seriousmd')

# Data Cleaning
Please put each data cleaning method you used below for each table



## Appointments




In [50]:
appointment_df = pd.read_csv('../dataset/appointments.csv', encoding='latin1')

In [51]:
# firstly, check the data type of the columns
print(appointment_df.dtypes)
print(appointment_df.size)

pxid          object
clinicid      object
doctorid      object
apptid        object
status        object
TimeQueued    object
QueueDate     object
StartTime     object
EndTime       object
type          object
Virtual       object
dtype: object
107282252


In [52]:
# convert columns to their respective data types
appointment_df['pxid'] = appointment_df['pxid'].astype(str)
appointment_df['clinicid'] = appointment_df['clinicid'].astype(str)
appointment_df['doctorid'] = appointment_df['doctorid'].astype(str)
appointment_df['apptid'] = appointment_df['apptid'].astype(str)
appointment_df['status'] = appointment_df['status'].astype(str)
appointment_df['TimeQueued'] = pd.to_datetime(appointment_df['TimeQueued'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['QueueDate'] = pd.to_datetime(appointment_df['QueueDate'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['StartTime'] = pd.to_datetime(appointment_df['StartTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['EndTime'] = pd.to_datetime(appointment_df['EndTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['type'] = appointment_df['type'].astype(str)
appointment_df['Virtual'] = pd.to_numeric(appointment_df['Virtual'], errors='coerce').astype('boolean')

In [53]:
# rename column virtual to isVirtual
appointment_df.rename(columns={'Virtual': 'isVirtual'}, inplace=True)

In [54]:
# turn status complete to completed
pd.unique(appointment_df['status'])

array(['Complete', 'Queued', 'NoShow', 'Serving', 'Cancel', 'Skip',
       'Completed', 'Admitted'], dtype=object)

In [55]:
# Replace 'complete' with 'completed' in the 'Status' column
appointment_df['status'] = appointment_df['status'].replace(
    'Complete', 'Completed')

In [56]:
print(pd.unique(appointment_df['status']))

['Completed' 'Queued' 'NoShow' 'Serving' 'Cancel' 'Skip' 'Admitted']


In [57]:
# remove rows with apptid as null
appointment_df = appointment_df[appointment_df['apptid'].notnull()]

In [58]:
print(appointment_df.size)

107282252


## Patient (px)

In [59]:
px_df = pd.read_csv('../dataset/px.csv', encoding='latin1', low_memory=False)

In [60]:
print(px_df.dtypes)
print(px_df.size)

pxid      object
age       object
gender    object
dtype: object
19523439


In [61]:
px_df['pxid'] = px_df['pxid'].astype(str)
px_df['age'] = pd.to_numeric(px_df['age'], errors='coerce').astype('Int64')
px_df['gender'] = px_df['gender'].astype(str)

In [62]:
# 1. delete row where data is pxid, age, gender
result_df = px_df[px_df['pxid'] == 'pxid']

# Display the result

print(result_df)

        pxid   age  gender
995328  pxid  <NA>  gender


In [63]:
# Create a boolean mask for the rows to be deleted
mask = (px_df['pxid'] == 'pxid')

# Use the boolean mask to drop the rows that meet the conditions
px_df.drop(px_df[mask].index, inplace=True)

# Optionally, you can reset the index after dropping rows
px_df.reset_index(drop=True, inplace=True)

In [64]:
result_df = px_df[px_df['pxid'] == 'pxid']

# Display the result
print(result_df)

Empty DataFrame
Columns: [pxid, age, gender]
Index: []


In [65]:
# 2. turn all ages below 0 into null 
print(px_df[px_df['age'] < 0])

                                     pxid   age  gender
3224     EC7168F4DF42E718CA4A70F52E57A99B  -182    MALE
5883     8CC44C76FDAAC6C6F63BCFFA7D6D035B   -24  FEMALE
12133    C1CA856AD536A5271D627B1C2D3035E5  -962  FEMALE
13506    4C6D650B3DF986431FB3E8E73B25E71B    -9    MALE
18165    D896D4A82EFD3D4B3DCFCFA68F1B4178   -20  FEMALE
...                                   ...   ...     ...
6446113  B55530B08612FCE658AB4FFE4FD339B4    -2  FEMALE
6483470  AB033EC5325213D763D23F08DFBCAE2D  -996  FEMALE
6497082  E4A2C6A059BD27024843AEA8924ACA01    -3    MALE
6498675  91A193DBF9891D001A11C6ED9093F2F6    -1  FEMALE
6502709  C53E03E6794972AB45656853357AC65A    -5    MALE

[1143 rows x 3 columns]


In [66]:
# Replace ages below 0 with NaN
px_df.loc[px_df['age'] < 0, 'age'] = None
print(px_df[px_df['age'] < 0])

Empty DataFrame
Columns: [pxid, age, gender]
Index: []


In [84]:
# remove rows with pxid as null
px_df = px_df[px_df['pxid'].notnull()]

In [85]:
print(px_df.size)

19523436


## Doctors


In [68]:
doctors_df = pd.read_csv('../dataset/doctors.csv', encoding='latin1')

In [69]:
doctors_df.head(10)

Unnamed: 0,doctorid,mainspecialty,age
0,AD61AB143223EFBC24C7D2583BE69251,General Medicine,41.0
1,D09BF41544A3365A46C9077EBB5E35C3,Family Medicine,43.0
2,FBD7939D674997CDB4692D34DE8633C4,Vascular Medicine,26.0
3,28DD2C7955CE926456240B2FF0100BDE,Otolaryngologists,34.0
4,35F4A8D465E6E1EDC05F3D8AB658C551,General Dentistry,50.0
5,D1FE173D08E959397ADF34B1D77E88D7,Orthopedic,62.0
6,F033AB37C30201F73F142449D037028D,Family Medicine,41.0
7,43EC517D68B6EDD3015B3EDC9A11367B,Acupunturist1,38.0
8,9778D5D219C5080B9A6A17BEF029331C,Orthopaedic Sports Medicine,57.0
9,FE9FC289C3FF0AF142B6D3BEAD98A923,Masters of Science in Preventive & Regenerativ...,44.0


In [70]:
print(doctors_df.dtypes)

doctorid          object
mainspecialty     object
age              float64
dtype: object


In [71]:
doctors_df['age'] = pd.to_numeric(doctors_df['age'], errors='coerce')

# Convert 'age' column to integer, treating NaN as null
doctors_df['age'] = doctors_df['age'].replace({np.nan: None}).astype('Int64')

In [72]:
# remove rows with doctorid as null
doctors_df_df = doctors_df[doctors_df['doctorid'].notnull()]
print(px_df.size)

19523436


## Clinics

In [73]:
clinics_df = pd.read_csv('../dataset/clinics.csv', encoding='latin1')

In [74]:
print(clinics_df.dtypes)

clinicid        object
hospitalname    object
IsHospital        bool
City            object
Province        object
RegionName      object
dtype: object


In [75]:
# remove rows with clinicid as null
clinics_df = clinics_df[clinics_df['clinicid'].notnull()]


In [76]:
print(clinics_df.size)

323772


# Validating


In [77]:
# cross validation with other tables
#  Filter based on patient ID
valid_patient_appointments = appointment_df[appointment_df['pxid'].isin(px_df['pxid'])]

# Filter based on clinic ID
valid_clinic_appointments = valid_patient_appointments[valid_patient_appointments['clinicid'].isin(clinics_df['clinicid'])]

# Filter based on doctor ID
valid_appointments_df = valid_clinic_appointments[valid_clinic_appointments['doctorid'].isin(doctors_df['doctorid'])]

# Resetting the index is optional
valid_appointments_df.reset_index(drop=True, inplace=True)

appointment_df = valid_appointments_df

In [78]:
appointment_df.size

3521540

# Importing

In [79]:
appointment_df.to_sql('appointments', engine, if_exists='replace', index=False)

320140

In [80]:
px_df.to_sql('px', engine, if_exists='replace', index=False)

6507812

In [81]:
doctors_df.to_sql('doctors', engine, if_exists='replace', index=False)

60024

In [82]:
clinics_df.to_sql('clinics', engine, if_exists='replace', index=False)

53962