In [5]:
import pandas as pd
from sqlalchemy import create_engine

#  Change 'root' to the name of your user; '12345' to the password of your connection, 'seriousmd' to the name of your schema
engine = create_engine('mysql://root:12345@localhost/seriousmd')

# Data Cleaning
Please put each data cleaning method you used below for each table



## Appointments




In [6]:
appointment_df = pd.read_csv('../dataset/appointments.csv', encoding='latin1')

In [7]:
# firstly, check the data type of the columns
print(appointment_df.dtypes)

pxid          object
clinicid      object
doctorid      object
apptid        object
status        object
TimeQueued    object
QueueDate     object
StartTime     object
EndTime       object
type          object
Virtual       object
dtype: object


In [8]:
# convert columns to their respective data types
appointment_df['pxid'] = appointment_df['pxid'].astype(str)
appointment_df['clinicid'] = appointment_df['clinicid'].astype(str)
appointment_df['doctorid'] = appointment_df['doctorid'].astype(str)
appointment_df['apptid'] = appointment_df['apptid'].astype(str)
appointment_df['status'] = appointment_df['status'].astype(str)
appointment_df['TimeQueued'] = pd.to_datetime(appointment_df['TimeQueued'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['QueueDate'] = pd.to_datetime(appointment_df['QueueDate'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['StartTime'] = pd.to_datetime(appointment_df['StartTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['EndTime'] = pd.to_datetime(appointment_df['EndTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
appointment_df['type'] = appointment_df['type'].astype(str)
appointment_df['Virtual'] = pd.to_numeric(appointment_df['Virtual'], errors='coerce').astype('boolean')

In [9]:
# rename column virtual to isVirtual
appointment_df.rename(columns={'Virtual': 'isVirtual'}, inplace=True)

In [10]:
print(appointment_df)

                                     pxid                          clinicid  \
0        EF196B348A49FB32DABC9834DC4FAAD9  ADF7EE2DCF142B0E11888E72B43FCB75   
1        EAE3C87D0B33351272F2E9B9B1B56217  1E0F65EB20ACBFB27EE05DDC000B50EC   
2        7C5C93809D626CC702D08F33985B2B58  1E0F65EB20ACBFB27EE05DDC000B50EC   
3        C300C2B9E0E5D4C46E8093BCDBFA05CA  98C39996BF1543E974747A2549B3107C   
4        B3DBE7F9E4DC33CBC5660E0A923CF8E8  77EE3BC58CE560B86C2B59363281E914   
...                                   ...                               ...   
9752927  2583E761CF4CAB4813AAEAFDAA883CC6  CCFC2D538DDFF519D893A6B966A1C4F1   
9752928  F51E6BF96EA5028AE5F5C01EBF08E3BD  CCFC2D538DDFF519D893A6B966A1C4F1   
9752929  84E3EB4A060096C3702D33F5A52E8B43  CCFC2D538DDFF519D893A6B966A1C4F1   
9752930  2B9F701BED6F68800637ADB7EF4CACE2  CCFC2D538DDFF519D893A6B966A1C4F1   
9752931  2ED01D09EF929AE3CA7564A2CB09DC2C  40F4775D64533EE66E3E20AE64228661   

                                 doctorid          

In [None]:
appointment_df.to_sql('appointments', engine, if_exists='replace', index=False)

9752932

## Patient (px)

In [11]:
px_df = pd.read_csv('../dataset/px.csv', encoding='latin1', low_memory=False)

In [12]:
print(px_df.dtypes)

pxid      object
age       object
gender    object
dtype: object


In [13]:
px_df['pxid'] = px_df['pxid'].astype(str)
px_df['age'] = pd.to_numeric(px_df['age'], errors='coerce').astype('Int64')
px_df['gender'] = px_df['gender'].astype(str)

In [14]:
# 1. delete row where data is pxid, age, gender
result_df = px_df[px_df['pxid'] == 'pxid']

# Display the result

print(result_df)

        pxid   age  gender
995328  pxid  <NA>  gender


In [15]:
# Create a boolean mask for the rows to be deleted
mask = (px_df['pxid'] == 'pxid')

# Use the boolean mask to drop the rows that meet the conditions
px_df.drop(px_df[mask].index, inplace=True)

# Optionally, you can reset the index after dropping rows
px_df.reset_index(drop=True, inplace=True)

In [16]:
result_df = px_df[px_df['pxid'] == 'pxid']

# Display the result
print(result_df)

Empty DataFrame
Columns: [pxid, age, gender]
Index: []


In [17]:
# 2. turn all ages below 0 into null 
print(px_df[px_df['age'] < 0])

                                     pxid   age  gender
3224     EC7168F4DF42E718CA4A70F52E57A99B  -182    MALE
5883     8CC44C76FDAAC6C6F63BCFFA7D6D035B   -24  FEMALE
12133    C1CA856AD536A5271D627B1C2D3035E5  -962  FEMALE
13506    4C6D650B3DF986431FB3E8E73B25E71B    -9    MALE
18165    D896D4A82EFD3D4B3DCFCFA68F1B4178   -20  FEMALE
...                                   ...   ...     ...
6446113  B55530B08612FCE658AB4FFE4FD339B4    -2  FEMALE
6483470  AB033EC5325213D763D23F08DFBCAE2D  -996  FEMALE
6497082  E4A2C6A059BD27024843AEA8924ACA01    -3    MALE
6498675  91A193DBF9891D001A11C6ED9093F2F6    -1  FEMALE
6502709  C53E03E6794972AB45656853357AC65A    -5    MALE

[1143 rows x 3 columns]


In [19]:
# Replace ages below 0 with NaN
px_df.loc[px_df['age'] < 0, 'age'] = None
print(px_df[px_df['age'] < 0])

Empty DataFrame
Columns: [pxid, age, gender]
Index: []


In [None]:
px_df.to_sql('px', engine, if_exists='replace', index=False)

6507812

## Doctors


In [None]:
doctors_df = pd.read_csv('../dataset/doctors.csv', encoding='latin1')
print(px_df.dtypes)

pxid      object
age        Int64
gender    object
dtype: object


In [None]:
doctors_df.to_sql('doctors', engine, if_exists='replace', index=False)

## Clinics

In [None]:
clinics_df = pd.read_csv('../dataset/clinics.csv', encoding='latin1')
print(clinics_df.dtypes)

clinicid        object
hospitalname    object
IsHospital        bool
City            object
Province        object
RegionName      object
dtype: object


In [None]:
clinics_df.to_sql('clinics', engine, if_exists='replace', index=False)