In [136]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pytz
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


# sets the theme of the charts
plt.style.use('seaborn-v0_8-darkgrid')

%matplotlib inline

In [35]:
# imports the csv files
px_df = pd.read_csv('px.csv', low_memory=False)
doctors_df = pd.read_csv('doctors.csv', low_memory=False, encoding='unicode_escape')
clinics_df = pd.read_csv('clinics.csv', low_memory=False, encoding='unicode_escape')
appointments_df = pd.read_csv('appointments.csv', low_memory=False, encoding='unicode_escape')

### `px dataset`
1. `pxid`: unique identifier assigned to each patient.
2. `age`: age of each patient at the time of record collection.
3. `gender`: gender of each patient.

In [36]:
# check info for px dataset
px_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6507813 entries, 0 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     object
 2   gender  object
dtypes: object(3)
memory usage: 149.0+ MB


`1. pxid data cleaning`

In [37]:
# check for duplicate values
print("duplicate rows:", px_df['pxid'].duplicated().sum())

duplicate rows: 995330


In [38]:
duplicated_rows = px_df[px_df.duplicated()]
print(duplicated_rows)

                                     pxid  age  gender
995329   5A1718EC380AFE6BE24D63EE78CDA043  NaN  FEMALE
995330   C0FC9D6384C7F579F1048A461C298B8C    8  FEMALE
995331   FB0C4098E2F7FD0BC3865382242E7034    2  FEMALE
995332   EB3C1CBD5A2AC52C69BE0B90C5E149A7   54    MALE
995333   92C92C2EB4B51FBB3CD354165BA5F027   67  FEMALE
...                                   ...  ...     ...
1990653  FE9B107442671F180DFA265DFD316F46   64  FEMALE
1990654  1D1B685AF51983D89060BBE064BFF30E   50  FEMALE
1990655  B0A068EAD831F4E3D125D9693FA4EF70   34  FEMALE
1990656  4EDAEEA20EA10C4DB4876E737CEAD0DA    7  FEMALE
3959872  D8E7A835F6418A9A8ABA99A87390446D   38    MALE

[995329 rows x 3 columns]


In [39]:
# drop duplicate rows
px_df = px_df.drop_duplicates()

In [40]:
px_df['pxid']

0          5A1718EC380AFE6BE24D63EE78CDA043
1          C0FC9D6384C7F579F1048A461C298B8C
2          FB0C4098E2F7FD0BC3865382242E7034
3          EB3C1CBD5A2AC52C69BE0B90C5E149A7
4          92C92C2EB4B51FBB3CD354165BA5F027
                         ...               
6507808    9D8045364678C651016538B78784720E
6507809    BCC86927CB7F687624859D5B7C9AB8B6
6507810    2890460EE1FD505905A0A55834EBDC06
6507811    3989D63FA4DD7C54911AA085CB7CF38A
6507812    80E396F35668298FED90AD2278A6D28D
Name: pxid, Length: 5512484, dtype: object

In [41]:
# check for missing values
print("duplicate rows:", px_df.duplicated().sum())

# check for missing values
print("no. of missing values: ", px_df['pxid'].isnull().sum())

# check for non-string values
print("no. of non-string values: ", px_df['pxid'].apply(type).ne(str).sum())

duplicate rows: 0
no. of missing values:  0
no. of non-string values:  0


`2. age data cleaning`

In [42]:
# check values for age
px_df['age']

0          NaN
1            8
2            2
3           54
4           67
          ... 
6507808     56
6507809     60
6507810     17
6507811     62
6507812     46
Name: age, Length: 5512484, dtype: object

In [43]:
# check for missing values
print("no. of missing values: ", px_df['age'].isnull().sum())

# check for non-string values
print("no. of non-string values: ", px_df['age'].apply(type).ne(str).sum())

# check for values with special characters
print("no. of values with special characters: ", px_df['age'].str.contains(r'[^0-9]').sum())

no. of missing values:  9999
no. of non-string values:  9999
no. of values with special characters:  1004


In [44]:
# filter out rows with empty values
mask = px_df['age'].notna()

# find rows with special characters
rows_with_special_chars = px_df[mask & px_df['age'].str.contains(r'[^0-9]')]

print("Values with special characters in 'age' column:")
print(rows_with_special_chars['age'])

Values with special characters in 'age' column:
3224       -182
5883        -24
12133      -962
13506        -9
18165       -20
           ... 
6446114      -2
6483471    -996
6497083      -3
6498676      -1
6502710      -5
Name: age, Length: 1004, dtype: object


In [45]:
# converting negative values to null values
px_df.loc[rows_with_special_chars.index, 'age'] = np.nan
print("Number of null values:", px_df['age'].isnull().sum())

Number of null values: 11003


In [46]:
# converting dtype to float64
px_df['age'] = pd.to_numeric(px_df['age'], errors='coerce')

# Convert the 'age' column to float64
px_df['age'] = px_df['age'].astype(float)

# Display the data type of the 'age' column after conversion
print("Data type after conversion:", px_df['age'].dtype)

Data type after conversion: float64


In [47]:
px_df['age']

0           NaN
1           8.0
2           2.0
3          54.0
4          67.0
           ... 
6507808    56.0
6507809    60.0
6507810    17.0
6507811    62.0
6507812    46.0
Name: age, Length: 5512484, dtype: float64

In [48]:
# check for non-float values
print("no. of non-float values: ", px_df['age'].apply(type).ne(float).sum())

no. of non-float values:  0


In [49]:
# verify that negative values are converted to empty strings
print("Number of negative values after conversion:", (px_df['age'] < 0).sum())

Number of negative values after conversion: 0


In [50]:
# remove outliers (age > 100)
px_df = px_df[px_df['age'] <= 100]
px_df['age']

1           8.0
2           2.0
3          54.0
4          67.0
5          89.0
           ... 
6507808    56.0
6507809    60.0
6507810    17.0
6507811    62.0
6507812    46.0
Name: age, Length: 5487414, dtype: float64

In [51]:
# convert data type to integer
px_df['age'] = px_df['age'].astype(int)
px_df['age']

1           8
2           2
3          54
4          67
5          89
           ..
6507808    56
6507809    60
6507810    17
6507811    62
6507812    46
Name: age, Length: 5487414, dtype: int32

`3. gender data cleaning`

In [52]:
# check values for gender
px_df['gender']

1          FEMALE
2          FEMALE
3            MALE
4          FEMALE
5            MALE
            ...  
6507808      MALE
6507809      MALE
6507810    FEMALE
6507811      MALE
6507812    FEMALE
Name: gender, Length: 5487414, dtype: object

In [53]:
# check for missing values
print("no. of missing values: ", px_df['gender'].isnull().sum())

# check for non-string values
print("no. of non-string values: ", px_df['gender'].apply(type).ne(str).sum())

no. of missing values:  0
no. of non-string values:  0


In [54]:
# check for unique values
print(px_df['gender'].unique())

['FEMALE' 'MALE']


In [55]:
print(px_df[px_df['gender'] == 'gender'])

Empty DataFrame
Columns: [pxid, age, gender]
Index: []


In [56]:
# remove the duplicate title row from the dataframe
px_df = px_df[px_df['gender'] != 'gender']

# check for unique values
print(px_df['gender'].unique())

['FEMALE' 'MALE']


In [57]:
px_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5487414 entries, 1 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     int32 
 2   gender  object
dtypes: int32(1), object(2)
memory usage: 146.5+ MB


In [58]:
px_df

Unnamed: 0,pxid,age,gender
1,C0FC9D6384C7F579F1048A461C298B8C,8,FEMALE
2,FB0C4098E2F7FD0BC3865382242E7034,2,FEMALE
3,EB3C1CBD5A2AC52C69BE0B90C5E149A7,54,MALE
4,92C92C2EB4B51FBB3CD354165BA5F027,67,FEMALE
5,F75E54013B29DF41766E2B1125EFC409,89,MALE
...,...,...,...
6507808,9D8045364678C651016538B78784720E,56,MALE
6507809,BCC86927CB7F687624859D5B7C9AB8B6,60,MALE
6507810,2890460EE1FD505905A0A55834EBDC06,17,FEMALE
6507811,3989D63FA4DD7C54911AA085CB7CF38A,62,MALE


In [59]:
# Save cleaned DataFrame to a CSV file
px_df.to_csv('px_cleaned.csv', index=False)

### `doctors dataset`
1. `doctorid`: unique identifiers for doctors
2. `mainspecialty`: main specialty of the doctors
3. `age`: age of the doctors

In [60]:
doctors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


`1. doctorid data cleaning`

In [61]:
doctors_df['doctorid']

0        AD61AB143223EFBC24C7D2583BE69251
1        D09BF41544A3365A46C9077EBB5E35C3
2        FBD7939D674997CDB4692D34DE8633C4
3        28DD2C7955CE926456240B2FF0100BDE
4        35F4A8D465E6E1EDC05F3D8AB658C551
                       ...               
60019    CD532DBEF6547A66D2138FAB49AA3B94
60020    4473D870B5E31FAA40D2C45E1FF6DC27
60021    A4F554EB2C0934E7FDE2511E8C1573BA
60022    E540A361D93D37A33BB2F55D43DA79D9
60023    23BA85862DD19C3550E7C0F0AF84C7ED
Name: doctorid, Length: 60024, dtype: object

In [62]:
# check for duplicates
unique_doctorid_count = doctors_df['doctorid'].nunique()

# check for missing values
missing_doctorid = doctors_df['doctorid'].isnull().sum()

# check if values are in capital letters
lowercase_doctorid = doctors_df['doctorid'].str.islower().sum()

# check if datatype is consistent
nonstring_doctorid = doctors_df['doctorid'].apply(type).ne(str).sum()

print("no. of unique doctor ids:", unique_doctorid_count)
print("no. of missing values:", missing_doctorid)
print("no. of lowercase values:",  lowercase_doctorid)
print("no. of non-string values:", nonstring_doctorid)


no. of unique doctor ids: 60024
no. of missing values: 0
no. of lowercase values: 0
no. of non-string values: 0


`2. mainspecialty data cleaning`

In [63]:
doctors_df['mainspecialty']

0            General Medicine
1             Family Medicine
2           Vascular Medicine
3           Otolaryngologists
4           General Dentistry
                 ...         
60019    General Practitioner
60020                     NaN
60021                     NaN
60022    General Practitioner
60023       Internal Medicine
Name: mainspecialty, Length: 60024, dtype: object

In [64]:
# check for unique values
unique_specialties = doctors_df['mainspecialty'].unique()
for specialty in unique_specialties:
    print(specialty)

General Medicine
Family Medicine
Vascular Medicine
Otolaryngologists
General Dentistry
Orthopedic
Acupunturist1
Orthopaedic Sports Medicine
Masters of Science in Preventive & Regenerative Medicine
Anti Aging and Regenerative Medicine
Surgery
Internal Medicine
Otolaryngology
Pediatrics
Ophthalmology
Orthopedic surgery
Orthopedic Surgeon
Internal medicine
General Surgery
Radiology
nan
Anesthesiology
Gastroenterology and Hepatology
General Physician
Obstetrics &Gynecology
HPB Surgery
PEDIATRICS
Orthopedics
Dermatology
Orthodontics
Family medicine
docmark_coli@yahoo.com
s
a
Pedia
Neurosurgery
Otolaryngology Head and Neck Surgery
Software Developer
Family and general dentistry
Otolaryngology-Head & Neck Surgery
Ob gyn
Obstetrics and Gynecology
Internal Medicine - Endocrinology
general pediatrics
internal medicine
Family dentistry
Pediatrician
Urology, Endoscopic Urology, Laparoscopic and Minimally Invasive Urology
Pediatric Dentistry
orthodontist 
Endocrinology
Orthodontist
Neurosurgery 
En

In [65]:
# duplicate the original dataframe
doctors_df_copy = doctors_df

# since fuzzywuzzy is slower the higher the number of rows, we split the dataset into three
split = np.array_split(doctors_df_copy, 3)
split_1, split_2, split_3, = split[0], split[1], split[2]

  return bound(*args, **kwds)


In [66]:
# step 1: define pecialties (sorted according to NowServing filters)
specialties = [
    "Primary Care", "General Medicine", "Emergency Medicine", "Family Medicine", "Urgent Care Medicine", "Internal Medicine",
    "Public Health", "Addiction Medicine", "Vascular Medicine", "Nursing", "Obstetrics And Gynecology (OBGYN)",
    "Reproductive Endocrinology and Infertility", "Ob Sonology", "Transgender Health And Wellness", "Pediatrics",
    "Diabetology", "Endocrinology", "Ophthalmology", "Optometry", "Cardiology", "Aesthetic, Reconstructive & Burn",
    "Dermatology", "Dermatopathology", "Pulmonology", "Respiratory Therapy", "Gastroenterology", "Bariatric Endoscopy",
    "Otolaryngology (ENT)", "Neurotology", "Nephrology", "Urology", "Neurology", "Neurosurgery", "Neuropsychology",
    "General Surgery", "Bariatric Surgery", "Cardiothoracic Surgery", "Colorectal Surgery", "Cosmetic Surgery",
    "Oral Surgery", "Hepato-Pancreatico-Biliary (HPB) Surgery", "Hepatobiliary Surgery", "Maxillofacial Surgery",
    "Oncology - Surgical", "Orthopedic Surgery", "Orthopedic Surgery - Foot & Ankle", "Orthopedics - Interventional",
    "Pancreatic Surgery", "Plastic Surgery", "Transplant Surgery", "Vascular Surgery", "Orthopedic Surgery - Spine",
    "Thoracic Cardiovascular Surgery", "Colorectal Surgery", "Veterinary Surgery",
    "Oncology", "Oncology - Medical", "Oncology - Radiation", "Oncology - Surgical", "Oncology - Trophoblastic Diseases",
    "Oncology - Gynecology", "Psychiatry", "Psychiatry - Child & Adolescent", "Psychiatry - Interventional",
    "Psychology", "Psychotherapy", "Guidance & Counseling", "Allergology & Immunology", "Infectious Disease",
    "Sleep Medicine", "Pain Medicine", "Physiatry", "Physical Medicine & Rehabilitation", "Sports Medicine",
    "Physical Therapy", "Physiotherapy", "Occupational Therapy", "Neonatology", "Perinatology", "Lactation Consultant",
    "Midwifery", "Hematology", "Hepatology", "Foot, Ankle, and Lower Leg", "Podiatry", "Joints, Muscles, and Bones",
    "Rheumatology", "Geriatric Medicine", "Gerontology", "Proctology", "Hearing", "Audiology", "Otology",
    "Speech-Language Pathology", "Hospice And Palliative Care", "Anesthesiology", "Imaging and Radiology", "Poisons and Toxins",
    "Toxicology", "General Dentistry", "Dentistry - Endodontics", "Dentistry - Oral Surgery", "Dentistry - Orthodontics",
    "Dentistry - Pediatrics", "Dentistry - Periodontics", "Dentistry - Prosthodontics", "Alternative Medicine", "Acupuncture",
    "Chiropractic", "Integrative Medicine", "Naturopathy", "Veterinary"
]

# step 2: perform matching via fuzzywuzzy
def match_specialty(value, choices, score_cutoff=80):
    match, score = process.extractOne(str(value), choices)
    if score > score_cutoff:
        return match
    return "NA"

# step 3: copy changes to new column
split_1['mainspecialty_cleaned'] = split_1['mainspecialty'].apply(lambda x: match_specialty(x, specialties))



In [67]:
# step 1: define pecialties (sorted according to NowServing filters)
specialties = [
    "Primary Care", "General Medicine", "Emergency Medicine", "Family Medicine", "Urgent Care Medicine", "Internal Medicine",
    "Public Health", "Addiction Medicine", "Vascular Medicine", "Nursing", "Obstetrics And Gynecology (OBGYN)",
    "Reproductive Endocrinology and Infertility", "Ob Sonology", "Transgender Health And Wellness", "Pediatrics",
    "Diabetology", "Endocrinology", "Ophthalmology", "Optometry", "Cardiology", "Aesthetic, Reconstructive & Burn",
    "Dermatology", "Dermatopathology", "Pulmonology", "Respiratory Therapy", "Gastroenterology", "Bariatric Endoscopy",
    "Otolaryngology (ENT)", "Neurotology", "Nephrology", "Urology", "Neurology", "Neurosurgery", "Neuropsychology",
    "General Surgery", "Bariatric Surgery", "Cardiothoracic Surgery", "Colorectal Surgery", "Cosmetic Surgery",
    "Oral Surgery", "Hepato-Pancreatico-Biliary (HPB) Surgery", "Hepatobiliary Surgery", "Maxillofacial Surgery",
    "Oncology - Surgical", "Orthopedic Surgery", "Orthopedic Surgery - Foot & Ankle", "Orthopedics - Interventional",
    "Pancreatic Surgery", "Plastic Surgery", "Transplant Surgery", "Vascular Surgery", "Orthopedic Surgery - Spine",
    "Thoracic Cardiovascular Surgery", "Colorectal Surgery", "Veterinary Surgery",
    "Oncology", "Oncology - Medical", "Oncology - Radiation", "Oncology - Surgical", "Oncology - Trophoblastic Diseases",
    "Oncology - Gynecology", "Psychiatry", "Psychiatry - Child & Adolescent", "Psychiatry - Interventional",
    "Psychology", "Psychotherapy", "Guidance & Counseling", "Allergology & Immunology", "Infectious Disease",
    "Sleep Medicine", "Pain Medicine", "Physiatry", "Physical Medicine & Rehabilitation", "Sports Medicine",
    "Physical Therapy", "Physiotherapy", "Occupational Therapy", "Neonatology", "Perinatology", "Lactation Consultant",
    "Midwifery", "Hematology", "Hepatology", "Foot, Ankle, and Lower Leg", "Podiatry", "Joints, Muscles, and Bones",
    "Rheumatology", "Geriatric Medicine", "Gerontology", "Proctology", "Hearing", "Audiology", "Otology",
    "Speech-Language Pathology", "Hospice And Palliative Care", "Anesthesiology", "Imaging and Radiology", "Poisons and Toxins",
    "Toxicology", "General Dentistry", "Dentistry - Endodontics", "Dentistry - Oral Surgery", "Dentistry - Orthodontics",
    "Dentistry - Pediatrics", "Dentistry - Periodontics", "Dentistry - Prosthodontics", "Alternative Medicine", "Acupuncture",
    "Chiropractic", "Integrative Medicine", "Naturopathy", "Veterinary"
]

# step 2: perform matching via fuzzywuzzy
def match_specialty(value, choices, score_cutoff=80):
    match, score = process.extractOne(str(value), choices)
    if score > score_cutoff:
        return match
    return "NA"

# step 3: copy changes to new column
split_2['mainspecialty_cleaned'] = split_2['mainspecialty'].apply(lambda x: match_specialty(x, specialties))



In [68]:
# step 1: define pecialties (sorted according to NowServing filters)
specialties = [
    "Primary Care", "General Medicine", "Emergency Medicine", "Family Medicine", "Urgent Care Medicine", "Internal Medicine",
    "Public Health", "Addiction Medicine", "Vascular Medicine", "Nursing", "Obstetrics And Gynecology (OBGYN)",
    "Reproductive Endocrinology and Infertility", "Ob Sonology", "Transgender Health And Wellness", "Pediatrics",
    "Diabetology", "Endocrinology", "Ophthalmology", "Optometry", "Cardiology", "Aesthetic, Reconstructive & Burn",
    "Dermatology", "Dermatopathology", "Pulmonology", "Respiratory Therapy", "Gastroenterology", "Bariatric Endoscopy",
    "Otolaryngology (ENT)", "Neurotology", "Nephrology", "Urology", "Neurology", "Neurosurgery", "Neuropsychology",
    "General Surgery", "Bariatric Surgery", "Cardiothoracic Surgery", "Colorectal Surgery", "Cosmetic Surgery",
    "Oral Surgery", "Hepato-Pancreatico-Biliary (HPB) Surgery", "Hepatobiliary Surgery", "Maxillofacial Surgery",
    "Oncology - Surgical", "Orthopedic Surgery", "Orthopedic Surgery - Foot & Ankle", "Orthopedics - Interventional",
    "Pancreatic Surgery", "Plastic Surgery", "Transplant Surgery", "Vascular Surgery", "Orthopedic Surgery - Spine",
    "Thoracic Cardiovascular Surgery", "Colorectal Surgery", "Veterinary Surgery",
    "Oncology", "Oncology - Medical", "Oncology - Radiation", "Oncology - Surgical", "Oncology - Trophoblastic Diseases",
    "Oncology - Gynecology", "Psychiatry", "Psychiatry - Child & Adolescent", "Psychiatry - Interventional",
    "Psychology", "Psychotherapy", "Guidance & Counseling", "Allergology & Immunology", "Infectious Disease",
    "Sleep Medicine", "Pain Medicine", "Physiatry", "Physical Medicine & Rehabilitation", "Sports Medicine",
    "Physical Therapy", "Physiotherapy", "Occupational Therapy", "Neonatology", "Perinatology", "Lactation Consultant",
    "Midwifery", "Hematology", "Hepatology", "Foot, Ankle, and Lower Leg", "Podiatry", "Joints, Muscles, and Bones",
    "Rheumatology", "Geriatric Medicine", "Gerontology", "Proctology", "Hearing", "Audiology", "Otology",
    "Speech-Language Pathology", "Hospice And Palliative Care", "Anesthesiology", "Imaging and Radiology", "Poisons and Toxins",
    "Toxicology", "General Dentistry", "Dentistry - Endodontics", "Dentistry - Oral Surgery", "Dentistry - Orthodontics",
    "Dentistry - Pediatrics", "Dentistry - Periodontics", "Dentistry - Prosthodontics", "Alternative Medicine", "Acupuncture",
    "Chiropractic", "Integrative Medicine", "Naturopathy", "Veterinary"
]

# step 2: perform matching via fuzzywuzzy
def match_specialty(value, choices, score_cutoff=80):
    match, score = process.extractOne(str(value), choices)
    if score > score_cutoff:
        return match
    return "NA"

# step 3: copy changes to new column
split_3['mainspecialty_cleaned'] = split_3['mainspecialty'].apply(lambda x: match_specialty(x, specialties))



In [69]:
# after cleaning the data per batch, concatenate the splits to a new dataframe
merge_df = pd.concat([split_1, split_2, split_3])

# copy the new row to the mainspecialty row in the dataframe copy
doctors_df_copy['mainspecialty'] = merge_df['mainspecialty_cleaned']

# drop the rows with NA mainspecialty
doctors_df_copy = doctors_df_copy[doctors_df_copy['mainspecialty'] != 'NA']

# check if rows are successfully dropped
print("Before cleaning:", len(doctors_df))
print("After cleaning:", len(doctors_df_copy))

# replace the copy to the original
doctors_df = doctors_df_copy

Before cleaning: 60024
After cleaning: 22508


`3. age data cleaning`

In [70]:
doctors_df['age']

0        41.0
1        43.0
2        26.0
4        50.0
5        62.0
         ... 
60011    34.0
60012    37.0
60017     NaN
60018    29.0
60023    38.0
Name: age, Length: 22508, dtype: float64

In [71]:
# check for missing values
missing_age = doctors_df['age'].isnull().sum()

# check for age range
min_age = min(doctors_df['age'])
max_age = max(doctors_df['age'])


print("no. of missing values:", missing_age)
print("minimum age:", min_age)
print('maximum age:', max_age)
print("no. of rows:", len(doctors_df))

no. of missing values: 5312
minimum age: 4.0
maximum age: 1048.0
no. of rows: 22508


In [72]:
doctors_df_age = doctors_df

# step 4: drop missing values
doctors_df_age.dropna(subset=['age'], inplace=True)
missing_age = doctors_df_age['age'].isnull().sum()

# step 5: convert data type to integer
doctors_df_age['age'] = doctors_df['age'].astype(int)   

# step 6: handle outliers (greater than 18, less than 100)
doctors_df_age = doctors_df_age[(doctors_df_age['age'] >= 18) & (doctors_df_age['age'] <= 100)]
min_age = min(doctors_df_age['age'])
max_age = max(doctors_df_age['age'])

print("no. of missing values:", missing_age)
print("minimum age:", min_age)
print('maximum age:', max_age)
print("no. of rows:", len(doctors_df_age))

no. of missing values: 0
minimum age: 21
maximum age: 92
no. of rows: 17171


In [73]:
# replace the copy to the original
doctors_df = doctors_df_age

# reset the index
doctors_df.reset_index(drop=True, inplace=True)

# export the cleaned dataframe to a new csv file
doctors_df.to_csv('doctors_cleaned.csv', index=False)

### `clinics dataset`
1. `clinicid`: unique indentifiers for the clinics
2. `hospitalname`: names of hospitals
3. `isHospital`: indicator if entity is hospital or not
4. `City`: names of the cities
5. `Province`: names of the provinces
6. `RegionName` names of the regions

In [74]:
clinics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   Province      53962 non-null  object
 5   RegionName    53962 non-null  object
dtypes: bool(1), object(5)
memory usage: 2.1+ MB


`1. clinicid data cleaning`

In [75]:
clinics_df['clinicid']

0        77EE3BC58CE560B86C2B59363281E914
1        98C39996BF1543E974747A2549B3107C
2        9AEADE7BEADA35C83D3B344FBAFE43B0
3        FDBD31F2027F20378B1A80125FC862DB
4        205C3608ECB984C1F5F5D2F52C934428
                       ...               
53957    8DE279A56DBCECE9F9FFC514A7D5A378
53958    55A64961C9AA4134016786AE7202682E
53959    075E464A7D15E6E5B9D8F8F5B5B16BB9
53960    01063BCF7624297FBB408495BCB62904
53961    5DA48026B54B6EEB6062817CAA7C30EA
Name: clinicid, Length: 53962, dtype: object

In [76]:
# check for duplicates
unique_clinicid_count = clinics_df['clinicid'].nunique()

# check for missing values
missing_clinicid = clinics_df['clinicid'].isnull().sum()

# check if values are in capital letters
lowercase_clinicid = clinics_df['clinicid'].str.islower().sum()

# check if datatype is consistent
nonstring_clinicid = clinics_df['clinicid'].apply(type).ne(str).sum()

print("no. of unique clinic ids:", unique_clinicid_count)
print("no. of missing values:", missing_clinicid)
print("no. of lowercase values:",  lowercase_clinicid)
print("no. of non-string values:", nonstring_clinicid)

no. of unique clinic ids: 53962
no. of missing values: 0
no. of lowercase values: 0
no. of non-string values: 0


`2. hospitalname data cleaning`

In [77]:
clinics_df['hospitalname']

0        St. Luke's Medical Center-Global City
1                 Our Lady of Lourdes Hospital
2                        Makati Medical Center
3               Cardinal Santos Medical Center
4                                          NaN
                         ...                  
53957                                      NaN
53958                                      NaN
53959                                      NaN
53960      Malabon Hospital and Medical Center
53961                                      NaN
Name: hospitalname, Length: 53962, dtype: object

In [78]:
# check for missing values
missing_hospitalname = clinics_df['hospitalname'].isnull().sum()

# check if datatype is consistent
nonstring_hospitalname = clinics_df['clinicid'].apply(type).ne(str).sum()

print("no. of missing values:", missing_hospitalname)
print("no. of non-string values:", nonstring_hospitalname)

no. of missing values: 36424
no. of non-string values: 0


In [79]:
# duplicate df
clinics_df_copy = clinics_df

In [80]:
# remove trailing whitespace
clinics_df_copy['hospitalname'] = clinics_df_copy['hospitalname'].str.strip()

# check total rows with special characters
special_characters = clinics_df_copy[clinics_df_copy['hospitalname'].str.contains(r'[^\x00-\x7F]+', na=False)]

# identify the special characters
special_characters['hospitalname'].unique()

array(['Unihealth-Parañaque Hospital and Medical Center',
       'MCU\x96Filemon Dionisio Tanchoco Medical Foundation Hospital',
       "Las Piñas Doctor's Hospital", "Parañaque Doctor's Hospital",
       'Las Piñas City Medical Center', 'Perpetual Help Hospital Biñan',
       'Ospital ng Biñan', 'Medical Center Parañaque, Inc.',
       'Ospital ng Parañaque', 'Biñan Doctors Hospital, Inc.',
       'Healthserv Los Baños Medical Center',
       'Los Baños Doctors Hospital, Inc.',
       'Las Piñas General Hospital and Satellite Trauma Center',
       'University Health Service-UPLB College, Los Baños',
       'Dasmariñas City Medical Center',
       'Alfredo E. Marañon, Sr. Memorial District Hospital',
       'Parañaque Community Hospital', 'Señor Sto. Niño Hospital',
       'M. Napeñas Multi-Specialty Hospital',
       'La Viña General Hospital, Inc.'], dtype=object)

In [81]:
# replace special characters with "n"
clinics_df_copy['hospitalname'] = clinics_df_copy['hospitalname'].replace({r'[^\x00-\x7F]+':'n'}, regex=True)

# identify the special characters
special_characters = clinics_df_copy[clinics_df_copy['hospitalname'].str.contains(r'[^\x00-\x7F]+', na=False)]
special_characters['hospitalname'].unique()

# remove , from values
clinics_df_copy['hospitalname'] = clinics_df_copy['hospitalname'].replace({r',':''}, regex=True)

# remove " from values
clinics_df_copy['hospitalname'] = clinics_df_copy['hospitalname'].replace({r'"':''}, regex=True)

In [82]:
clinics_df = clinics_df_copy

`3. ishospital data cleaning`

In [83]:
clinics_df['IsHospital']

0         True
1         True
2         True
3         True
4        False
         ...  
53957    False
53958    False
53959    False
53960     True
53961    False
Name: IsHospital, Length: 53962, dtype: bool

In [84]:
# check for missing values
missing_isHospital = clinics_df['IsHospital'].isnull().sum()

# check for data consistency
valid_values = [True, False]
invalid_values = clinics_df[~clinics_df['IsHospital'].isin(valid_values)]['IsHospital'].sum()

print("no. of missing values:", missing_isHospital)
print("no. of invalid values:", invalid_values)

no. of missing values: 0
no. of invalid values: 0


`4. city data cleaning`

In [85]:
clinics_df['City']

0          Taguig
1          Manila
2          Makati
3        San Juan
4          Burgos
           ...   
53957      Manila
53958      Manila
53959      Manila
53960     Malabon
53961     Malabon
Name: City, Length: 53962, dtype: object

In [86]:
clinics_df['City'].unique()

array(['Taguig', 'Manila', 'Makati', 'San Juan', 'Burgos', 'Butuan City',
       'Basco', 'Quezon City', 'Mandaue City', 'Santa Rosa City',
       'San Fernando City', 'Santiago City', 'Batangas City',
       'Iloilo City', 'Botolan', 'Las Piñas', 'Balabac', 'Pasig', 'Bilar',
       'Malabon', 'Caloocan', 'Muntinlupa', 'Candijay', 'San Nicolas',
       'Calape', 'Cebu City', 'Manito', 'Pasay', 'Cabanatuan City',
       'Antipolo City', 'Taytay', 'Cainta', 'Mandaluyong', 'Marikina',
       'San Leonardo', 'Dasmariñas City', 'Bacoor City', 'Malolos City',
       'Parañaque', 'Imus City', 'Silang', 'San Jose del Monte City',
       'San Mateo', 'Davao City', 'Kidapawan City', 'Tagbilaran City',
       'General Trias', 'Balamban', 'Bacolod City', 'Cagayan de Oro',
       'Guagua', 'Lubao', 'Zamboanga City', 'Rizal', 'Santa Cruz',
       'Asuncion', 'Tagum City', 'Cardona', 'Morong', 'Tanay', 'Teresa',
       'Meycauayan City', 'Santo Domingo', 'San Jose City',
       'Trece Martires City',

In [87]:
# check for missing values
missing_city = clinics_df['City'].isnull().sum()

# check number of values with special characters
special_char_count = clinics_df['City'].apply(lambda x: bool(re.search('[^a-zA-Z0-9\s]', x))).sum()

print("no. of missing values:", missing_city)
print("no. of values with special characters:", special_char_count)

no. of missing values: 0
no. of values with special characters: 2142


In [88]:
# to address values with special characters, we need to find out those special characters

# initialize a set to store unique city names with special characters
unique_cities_with_special_chars = set()

# iterate through each city and identify special characters
for city in clinics_df['City']:
    special_chars = re.findall(r'[^a-zA-Z0-9\s]', str(city))
    if special_chars:
        unique_cities_with_special_chars.add((city, ', '.join(special_chars)))

# print unique city names with their special characters
for city, special_chars in unique_cities_with_special_chars:
    print(f"City: {city}, Special Characters: {special_chars}")

City: Science City of Muñoz, Special Characters: ñ
City: Biñan City, Special Characters: ñ
City: Parañaque, Special Characters: ñ
City: Sanchez-Mira, Special Characters: -
City: Peñablanca, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: M'lang, Special Characters: '
City: Los Baños, Special Characters: ñ
City: Libjo (Albor), Special Characters: (, )
City: Anini-y, Special Characters: -
City: Enrique B. Magalona, Special Characters: .
City: Las Piñas, Special Characters: ñ
City: Lapu-Lapu City, Special Characters: -
City: Brooke's Point, Special Characters: '
City: Al-Barka, Special Characters: -
City: Lal-lo, Special Characters: -


In [89]:
# since most of the values contain ñ, convert this to n

# create a duplicate of the dataset
clinics_df_copy = clinics_df
clinics_df_copy['City'] = clinics_df_copy['City'].str.replace('ñ', 'n')

# initialize a set to store unique city names with special characters
unique_cities_with_special_chars = set()

# iterate through each city and identify special characters
for city in clinics_df_copy['City']:
    special_chars = re.findall(r'[^a-zA-Z0-9\s]', str(city))
    if special_chars:
        unique_cities_with_special_chars.add((city, ', '.join(special_chars)))

# print unique city names with their special characters
for city, special_chars in unique_cities_with_special_chars:
    print(f"City: {city}, Special Characters: {special_chars}")
    
# copy duplicate to original dataset
clinics_df = clinics_df_copy

City: Sanchez-Mira, Special Characters: -
City: Anini-y, Special Characters: -
City: M'lang, Special Characters: '
City: Libjo (Albor), Special Characters: (, )
City: Enrique B. Magalona, Special Characters: .
City: Lapu-Lapu City, Special Characters: -
City: Brooke's Point, Special Characters: '
City: Al-Barka, Special Characters: -
City: Lal-lo, Special Characters: -


`5. province data cleaning`

In [90]:
clinics_df['Province'].unique()

array(['Manila', 'Ilocos Sur', 'Agusan del Norte', 'Batanes', 'Cebu',
       'Laguna', 'La Union', 'Isabela', 'Batangas', 'Iloilo', 'Zambales',
       'Palawan', 'Bohol', 'Ilocos Norte', 'Albay', 'Nueva Ecija',
       'Rizal', 'Cavite', 'Bulacan', 'Davao del Sur', 'Cotabato',
       'Negros Occidental', 'Misamis Oriental', 'Pampanga',
       'Zamboanga del Sur', 'Davao del Norte', 'Misamis Occidental',
       'Quezon', 'South Cotabato', 'Negros Oriental', 'Leyte',
       'Camarines Norte', 'Bukidnon', 'Benguet', 'Pangasinan',
       'Camarines Sur', 'Lanao del Norte', 'Oriental Mindoro',
       'Occidental Mindoro', 'Dinagat Islands', 'Davao Occidental',
       'Marinduque', 'Tarlac', 'Angeles', 'Lanao del Sur', 'Samar',
       'Surigao del Norte', 'Agusan del Sur', 'Mountain Province',
       'Nueva Vizcaya', 'Bataan', 'Eastern Samar', 'Aklan', 'Aurora',
       'Cagayan', 'Masbate', 'Kalinga', 'Sorsogon', 'Abra',
       'Davao Oriental', 'Surigao del Sur', 'Zamboanga Sibugay',
       

In [91]:
# check for missing values
missing_province = clinics_df['Province'].isnull().sum()

# check for check number of values with special characters
special_char_count_province = clinics_df['Province'].apply(lambda x: bool(re.search('[^a-zA-Z0-9\s]', x))).sum()

print("no. of missing values:", missing_province)
print("no. of values with special characters:", special_char_count_province)

no. of missing values: 0
no. of values with special characters: 0


`6. regionname data cleaning`

In [92]:
clinics_df['RegionName'].unique()

array(['National Capital Region (NCR)', 'Ilocos Region (I)',
       'Caraga (XIII)', 'Cagayan Valley (II)', 'Central Visayas (VII)',
       'CALABARZON (IV-A)', 'Western Visayas (VI)', 'Central Luzon (III)',
       'MIMAROPA (IV-B)', 'Bicol Region (V)', 'Davao Region (XI)',
       'SOCCSKSARGEN (Cotabato Region) (XII)', 'Northern Mindanao (X)',
       'Zamboanga Peninsula (IX)', 'Eastern Visayas (VIII)',
       'Cordillera Administrative Region (CAR)',
       'Bangsamoro Autonomous Region in Muslim Mindanao (BARMM)'],
      dtype=object)

In [93]:
# check for missing values
missing_region = clinics_df['RegionName'].isnull().sum()

# check for check number of values with special characters
special_char_count_region = clinics_df['Province'].apply(lambda x: bool(re.search('[^a-zA-Z0-9\s]', x))).sum()

print("no. of missing values:", missing_region)
print("no. of values with special characters:", special_char_count_region)

no. of missing values: 0
no. of values with special characters: 0


In [94]:
# check province per region
province_per_region = clinics_df.groupby('RegionName')['Province'].unique()

for region, provinces in province_per_region.items():
    print(f"Region: {region}")
    for province in provinces:
        print(f"- {province}")
    print()  # Add a newline between regions

Region: Bangsamoro Autonomous Region in Muslim Mindanao (BARMM)
- Lanao del Sur
- Maguindanao
- Sulu
- Basilan

Region: Bicol Region (V)
- Albay
- Camarines Norte
- Camarines Sur
- Masbate
- Sorsogon
- Catanduanes

Region: CALABARZON (IV-A)
- Laguna
- Batangas
- Rizal
- Cavite
- Quezon
- Angeles

Region: Cagayan Valley (II)
- Batanes
- Isabela
- Nueva Vizcaya
- Cagayan
- Albay
- Quirino

Region: Caraga (XIII)
- Agusan del Norte
- Dinagat Islands
- Surigao del Norte
- Agusan del Sur
- Surigao del Sur

Region: Central Luzon (III)
- Zambales
- Nueva Ecija
- Bulacan
- Pampanga
- Tarlac
- Bataan
- Aurora

Region: Central Visayas (VII)
- Cebu
- Bohol
- Negros Oriental
- Siquijor

Region: Cordillera Administrative Region (CAR)
- Benguet
- Mountain Province
- Kalinga
- Abra
- Apayao
- Ifugao

Region: Davao Region (XI)
- Davao del Sur
- Davao del Norte
- Davao Occidental
- Davao Oriental
- Compostela Valley
- Manila

Region: Eastern Visayas (VIII)
- Leyte
- Samar
- Eastern Samar
- Southern Leyt

In [95]:
# duplicate dataframe
clinics_df_copy_2 = clinics_df

In [96]:
# outliers
# CALABARZON - Angeles
# REGION II - Albay
# REGION III - insert Angeles
# REGION XI - Composela Valley, Manila
# NCR - Abra, Camarines Sur, Cavite

# Remove Angeles from CALABARZON and insert Angeles in REGION III
province_to_transfer = ['Angeles']
clinics_df_copy_2.loc[clinics_df_copy_2['Province'].isin(province_to_transfer), 'RegionName'] = 'Central Luzon (III)'

transfer_Albay = ['Albay']
clinics_df_copy_2.loc[clinics_df_copy_2['Province'].isin(transfer_Albay), 'RegionName'] = 'Bicol Region (V)'

transfer_Manila = ['Manila']
clinics_df_copy_2.loc[clinics_df_copy_2['Province'].isin(transfer_Manila), 'RegionName'] = 'National Capital Region (NCR)'

transfer_Abra = ['Abra']
clinics_df_copy_2.loc[clinics_df_copy_2['Province'].isin(transfer_Abra), 'RegionName'] = 'Cordillera Administrative Region (CAR)'

transfer_Camarines_Sur = ['Camarines Sur']
clinics_df_copy_2.loc[clinics_df_copy_2['Province'].isin(transfer_Camarines_Sur), 'RegionName'] = 'Bicol Region (V)'

transfer_Cavite = ['Cavite']
clinics_df_copy_2.loc[clinics_df_copy_2['Province'].isin(transfer_Cavite), 'RegionName'] = 'CALABARZON (IV-A)'
    
# change Compostela Valley to Davao de Oro
clinics_df_copy_2.loc[clinics_df_copy_2['Province'] == 'Compostela Valley'] = 'Davao de Oro'
clinics_df_copy_2.loc[clinics_df_copy_2['Province'] == 'Davao de Oro', 'RegionName'] = 'Davao Region (XI)'


# check changes
province_per_region_copy = clinics_df_copy_2.groupby('RegionName')['Province'].unique()

for region, provinces in province_per_region_copy.items():
    print(f"Region: {region}")
    for province in provinces:
        print(f"- {province}")
    print()  # Add a newline between regions

Region: Bangsamoro Autonomous Region in Muslim Mindanao (BARMM)
- Lanao del Sur
- Maguindanao
- Sulu
- Basilan

Region: Bicol Region (V)
- Albay
- Camarines Norte
- Camarines Sur
- Masbate
- Sorsogon
- Catanduanes

Region: CALABARZON (IV-A)
- Laguna
- Batangas
- Rizal
- Cavite
- Quezon

Region: Cagayan Valley (II)
- Batanes
- Isabela
- Nueva Vizcaya
- Cagayan
- Quirino

Region: Caraga (XIII)
- Agusan del Norte
- Dinagat Islands
- Surigao del Norte
- Agusan del Sur
- Surigao del Sur

Region: Central Luzon (III)
- Zambales
- Nueva Ecija
- Bulacan
- Pampanga
- Tarlac
- Angeles
- Bataan
- Aurora

Region: Central Visayas (VII)
- Cebu
- Bohol
- Negros Oriental
- Siquijor

Region: Cordillera Administrative Region (CAR)
- Benguet
- Mountain Province
- Kalinga
- Abra
- Apayao
- Ifugao

Region: Davao Region (XI)
- Davao del Sur
- Davao del Norte
- Davao Occidental
- Davao Oriental
- Davao de Oro

Region: Eastern Visayas (VIII)
- Leyte
- Samar
- Eastern Samar
- Southern Leyte
- Northern Samar
- B

  clinics_df_copy_2.loc[clinics_df_copy_2['Province'] == 'Compostela Valley'] = 'Davao de Oro'


In [97]:
clinics_df = clinics_df_copy_2

In [98]:
# export the cleaned dataset
clinics_df.to_csv('clinics_cleaned.csv', index=False)

### `appointments dataset`

1. `apptid`: unique identifier for each appointment
2. `pxid`: unique identifier for the patient associated with the appointment
3. `clinicid`: identifier for the clinic
4. `doctorid`: identifier for the doctor
5. `status`: status of appointment
6. `timequeued`: time when the appointment was added to the queue
7. `queuedate`: date when the appointment was added to the queue
8. `starttime`: scheduled start time for the appointment
9. `endtime`: scheduled end time for the appointment
10. `type`: type of appointment
11. `virtual`: indicates if appointment is virtual or in-person

In [99]:
final_appointments_df = appointments_df
final_appointments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9752932 entries, 0 to 9752931
Data columns (total 11 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   pxid        object
 1   clinicid    object
 2   doctorid    object
 3   apptid      object
 4   status      object
 5   TimeQueued  object
 6   QueueDate   object
 7   StartTime   object
 8   EndTime     object
 9   type        object
 10  Virtual     object
dtypes: object(11)
memory usage: 818.5+ MB


`1. apptid data cleaning`

In [101]:
# check values of apptid
appointments_df['apptid']

0          C1CC0949B93D00A559F7A0BD38361E80
1          6585A31C60A1886FBA1433C50012B504
2          7250DCFF615E6580295C7E6ED4322371
3          F5BBDCC08E39332F0AC27BB95CF1396A
4          55783FB42A02DABC3B2ED239B924DC87
                         ...               
9752927    215A26E2E5B831D87C41A5C1CC382B1C
9752928    E9EB60E3DA311FAE308A2DB53B6CCBAC
9752929    EFE2DDFD855E2B4E1C67956CE1CE8A6E
9752930    434410208F17F50143E1CE66E4CEF23F
9752931    62D462EFFD157E70C4301E555962D44B
Name: apptid, Length: 9752932, dtype: object

In [102]:
# check unique values of apptid
appointments_df['apptid'].unique()

array(['C1CC0949B93D00A559F7A0BD38361E80',
       '6585A31C60A1886FBA1433C50012B504',
       '7250DCFF615E6580295C7E6ED4322371', ...,
       'EFE2DDFD855E2B4E1C67956CE1CE8A6E',
       '434410208F17F50143E1CE66E4CEF23F',
       '62D462EFFD157E70C4301E555962D44B'], dtype=object)

In [103]:
# check value counts of apptid
appointments_df['apptid'].value_counts()

apptid
C1CC0949B93D00A559F7A0BD38361E80    1
F7C605ADA76D7F1CC8C6F74741B0C582    1
4E698A1E6BF55AF28BFC1BFD60F8A1EB    1
6D3C94EB38F1784D71350CED302D1F22    1
EF431FD37AEF6661C3B1412E2D9A13C8    1
                                   ..
B50B4D1703A101F81F86DAEFC0A3C784    1
C4AADFD59C168610BD7A465FA54F492B    1
7C87DAD8B177D7C0ABAC3573BC627301    1
F8AA2DDF109DB830D01A49808B18F7B6    1
62D462EFFD157E70C4301E555962D44B    1
Name: count, Length: 9752932, dtype: int64

In [104]:
# check null values of apptid
appointments_df['apptid'].isnull().sum()

0

`2. pxid data cleaning`

In [105]:
# check values of pxid
appointments_df['pxid']

0          EF196B348A49FB32DABC9834DC4FAAD9
1          EAE3C87D0B33351272F2E9B9B1B56217
2          7C5C93809D626CC702D08F33985B2B58
3          C300C2B9E0E5D4C46E8093BCDBFA05CA
4          B3DBE7F9E4DC33CBC5660E0A923CF8E8
                         ...               
9752927    2583E761CF4CAB4813AAEAFDAA883CC6
9752928    F51E6BF96EA5028AE5F5C01EBF08E3BD
9752929    84E3EB4A060096C3702D33F5A52E8B43
9752930    2B9F701BED6F68800637ADB7EF4CACE2
9752931    2ED01D09EF929AE3CA7564A2CB09DC2C
Name: pxid, Length: 9752932, dtype: object

In [106]:
# check unique values of pxid
appointments_df['pxid'].unique()

array(['EF196B348A49FB32DABC9834DC4FAAD9',
       'EAE3C87D0B33351272F2E9B9B1B56217',
       '7C5C93809D626CC702D08F33985B2B58', ...,
       '84E3EB4A060096C3702D33F5A52E8B43',
       '2B9F701BED6F68800637ADB7EF4CACE2',
       '2ED01D09EF929AE3CA7564A2CB09DC2C'], dtype=object)

In [107]:
# check value counts of pxid
appointments_df['pxid'].value_counts()

pxid
7CB50A8783C05F2788D7AE1EE79A4CF7    1249
0761C17912CDEF7AEF25D118E1936D9A     590
B4A945E181DE1FFB82D1D18CA9A51687     583
B2B657D3C301FFCBFDA0B2ED17E7A5EB     516
A2536B80CAD05E14355410332513D550     374
                                    ... 
8AFAF290232A5A2738924D93DEE9DCC7       1
31E474891F3CB25A7E5A386E04549375       1
CF29CECFA79A3A9C93DF553B7223F4B7       1
ABD0DCF0DD4D2581718EF6C945053FBE       1
2ED01D09EF929AE3CA7564A2CB09DC2C       1
Name: count, Length: 3946064, dtype: int64

In [108]:
# check for null values of pxid
pxid_count = appointments_df['pxid'].isnull().sum()

`3. clinicid data cleaning`

In [109]:
# check values of clinicid
appointments_df['clinicid']

0          ADF7EE2DCF142B0E11888E72B43FCB75
1          1E0F65EB20ACBFB27EE05DDC000B50EC
2          1E0F65EB20ACBFB27EE05DDC000B50EC
3          98C39996BF1543E974747A2549B3107C
4          77EE3BC58CE560B86C2B59363281E914
                         ...               
9752927    CCFC2D538DDFF519D893A6B966A1C4F1
9752928    CCFC2D538DDFF519D893A6B966A1C4F1
9752929    CCFC2D538DDFF519D893A6B966A1C4F1
9752930    CCFC2D538DDFF519D893A6B966A1C4F1
9752931    40F4775D64533EE66E3E20AE64228661
Name: clinicid, Length: 9752932, dtype: object

In [110]:
# check unique values of clinicid
appointments_df['clinicid'].unique()

array(['ADF7EE2DCF142B0E11888E72B43FCB75',
       '1E0F65EB20ACBFB27EE05DDC000B50EC',
       '98C39996BF1543E974747A2549B3107C', ...,
       '772C1C73D4FFE88046A3CDB8772E53EB',
       'CCFC2D538DDFF519D893A6B966A1C4F1',
       '40F4775D64533EE66E3E20AE64228661'], dtype=object)

In [111]:
# check value counts of clinicid
appointments_df['clinicid'].value_counts()

clinicid
7522A10DDF6916ABCCF0163B58CA0543    109517
6467C327EAF8940B4DD07A08C63C5E85    102465
D7FD83EAF0A5593A190E8E9C8D7ECE84     72096
F2B4053221961416D47D497814A8064F     70546
A7BF3F5462CC82062E41B3A2262E1A21     57308
                                     ...  
5B92E4945EB3A04990671A4DA604FF17         1
933BD65AC164FA84EBEF0C17AE189B43         1
966BC24F56AB8397AB2303E8E4CDB4C7         1
86A2587A85BE5C0CC7E8D02F82DF6CC1         1
40F4775D64533EE66E3E20AE64228661         1
Name: count, Length: 25262, dtype: int64

In [112]:
# check for null values of clinicid
appointments_df['clinicid'].isnull().sum()

0

`4. doctorid data cleaning`

In [113]:
# check values of doctorid
appointments_df['doctorid']

0          BB04AF0F7ECAEE4AAE62035497DA1387
1          82AA4B0AF34C2313A562076992E50AA3
2          82AA4B0AF34C2313A562076992E50AA3
3          AD61AB143223EFBC24C7D2583BE69251
4          AD61AB143223EFBC24C7D2583BE69251
                         ...               
9752927    B706835DE79A2B4E80506F582AF3676A
9752928    B706835DE79A2B4E80506F582AF3676A
9752929    B706835DE79A2B4E80506F582AF3676A
9752930    B706835DE79A2B4E80506F582AF3676A
9752931    B706835DE79A2B4E80506F582AF3676A
Name: doctorid, Length: 9752932, dtype: object

In [114]:
# check unique values of doctorid
appointments_df['doctorid'].unique()

array(['BB04AF0F7ECAEE4AAE62035497DA1387',
       '82AA4B0AF34C2313A562076992E50AA3',
       'AD61AB143223EFBC24C7D2583BE69251', ...,
       '69B4FA3BE19BDF400DF34E41B93636A4',
       '4FC8ED929E539525E3590F1607718F97',
       'B706835DE79A2B4E80506F582AF3676A'], dtype=object)

In [115]:
# check value counts of doctorid
appointments_df['doctorid'].value_counts()

doctorid
3349958A3E56580D4E415DA345703886    238997
4F05D4821FE9967817DEA5A20C4E7B35    133513
95177E528F8D6C7C28A5473FD5A471B6    106222
9F93557D309F655FF06F109A08DCF7C4     72596
852C296DFA59522F563AEF29D8D0ADF6     72453
                                     ...  
ADF7E293599134777339FDC40DDFA818         1
32B127307A606EFFDCC8E51F60A45922         1
9903F53EB5F12F84F496530B0DC2526C         1
6FB993285D56E6927284FF9B11AC6851         1
A381C2C35C9157F6B67FD07D5A200AE1         1
Name: count, Length: 10832, dtype: int64

In [116]:
# check null values of doctorid
appointments_df['doctorid'].isnull().sum()

0

`5. status data cleaning`

In [117]:
# check values of status
appointments_df['status']

0          Complete
1            Queued
2            Queued
3            Queued
4            Queued
             ...   
9752927      Queued
9752928      Queued
9752929      Queued
9752930      Queued
9752931      Queued
Name: status, Length: 9752932, dtype: object

In [118]:
# check unique values of status
appointments_df['status'].unique()

array(['Complete', 'Queued', 'NoShow', 'Serving', 'Cancel', 'Skip',
       'Completed', 'Admitted'], dtype=object)

In [119]:
# check count for status since there are two values for 'Complete'
appointments_df['status'].value_counts()

status
Complete     6473675
Queued       2523855
Serving       284774
NoShow        230048
Cancel        213270
Skip           27298
Completed         10
Admitted           2
Name: count, dtype: int64

In [120]:
# replace Completed with Complete
appointments_df['status'] = appointments_df['status'].replace({'Completed': 'Complete'}, regex=True)
status_vc = appointments_df['status'].value_counts()
print(status_vc)

status
Complete    6473685
Queued      2523855
Serving      284774
NoShow       230048
Cancel       213270
Skip          27298
Admitted          2
Name: count, dtype: int64


In [121]:
# check null values of status
appointments_df['status'].isnull().sum()

0

`6. queuedate data cleaning`

In [122]:
# check values of QueueDate
appointments_df['QueueDate']

0          2018-04-10 16:00:00
1          2018-04-08 16:00:00
2          2018-03-30 16:00:00
3                          NaN
4                          NaN
                  ...         
9752927    2023-11-22 16:00:00
9752928    2023-12-04 16:00:00
9752929    2024-01-03 16:00:00
9752930    2024-01-12 16:00:00
9752931    2024-01-03 16:00:00
Name: QueueDate, Length: 9752932, dtype: object

In [123]:
# convert to datetime data type
appointments_df['QueueDate'] = pd.to_datetime(appointments_df['QueueDate'], format='mixed', errors='coerce')

In [124]:
appointments_df['QueueDate'] = appointments_df['QueueDate'].dt.floor('s')

In [125]:
# check null values of QueueDate
appointments_df['QueueDate'].isnull().sum()

99

In [126]:
# drop rows that have invalid Queuedate
appointments_df = appointments_df.dropna(subset=['QueueDate'])

In [137]:
# Set the timezone to GMT+8 (Asia/Singapore)
asian_tz = pytz.timezone('Asia/Singapore')
# Convert the 'QueueDate' column to the new timezone
appointments_df['QueueDate'] = appointments_df['QueueDate'].dt.tz_localize('UTC').dt.tz_convert(asian_tz)

`7. timequeued data cleaning`

In [138]:
# check values of TimeQueued
appointments_df['TimeQueued']

1                    2018-04-10 10:34:16
2                    2018-04-03 15:26:19
29         2021-02-03 04:57:56.760000000
32         2021-05-04 04:45:32.427000000
34         2021-05-19 04:10:19.667000000
                       ...              
9752921    2023-11-11 06:51:52.857000000
9752925    2023-11-14 02:32:27.507000000
9752929    2024-01-04 06:11:06.437000000
9752930    2024-01-13 06:12:54.737000000
9752931    2024-01-04 00:45:05.063000000
Name: TimeQueued, Length: 1888235, dtype: object

In [139]:
# convert to datetime data type
appointments_df['TimeQueued'] = pd.to_datetime(appointments_df['TimeQueued'], format='mixed', errors='coerce')

In [140]:
appointments_df['TimeQueued'] = appointments_df['TimeQueued'].dt.floor('s')

In [141]:
# check null values of TimeQueued
appointments_df['TimeQueued'].isnull().sum()

0

In [142]:
# drop rows that have invalid TimeQueued
appointments_df = appointments_df.dropna(subset=['TimeQueued'])

In [143]:
# Set the timezone to GMT+8 (Asia/Singapore)
asian_tz = pytz.timezone('Asia/Singapore')

# Convert the 'QueueDate' column to the new timezone
appointments_df['TimeQueued'] = appointments_df['TimeQueued'].dt.tz_localize('UTC').dt.tz_convert(asian_tz)

In [144]:
len(appointments_df.loc[appointments_df['TimeQueued'] < appointments_df['QueueDate'], 'TimeQueued'])

0

In [145]:
# drop rows whose timequeued is earlier than Queue Date
appointments_df.loc[appointments_df['TimeQueued'] < appointments_df['QueueDate'], 'TimeQueued'] = pd.NaT
appointments_df = appointments_df.dropna(subset=['TimeQueued'])

`8. starttime data cleaning`

In [146]:
# check values of StartTime
appointments_df['StartTime']

1         2018-04-09 10:33:00
2         2018-03-31 15:25:00
29        2021-02-03 05:30:00
32        2021-05-04 11:00:00
34        2021-05-19 05:15:00
                  ...        
9752921   2023-11-11 07:30:00
9752925   2023-11-14 08:15:00
9752929   2024-01-04 08:00:00
9752930   2024-01-13 08:00:00
9752931   2024-01-04 02:30:00
Name: StartTime, Length: 1888235, dtype: datetime64[ns]

In [147]:
# convert to datetime data type
appointments_df['StartTime'] = pd.to_datetime(appointments_df['StartTime'], format='mixed', errors='coerce')

In [148]:
appointments_df['StartTime'] = appointments_df['StartTime'].dt.floor('s')

In [149]:
# check null values of StartTime
appointments_df['StartTime'].isnull().sum()

0

In [150]:
# drop rows that have invalid TimeQueued
appointments_df = appointments_df.dropna(subset=['StartTime'])

In [151]:
# Set the timezone to GMT+8 (Asia/Singapore)
asian_tz = pytz.timezone('Asia/Singapore')

# Convert the 'QueueDate' column to the new timezone
appointments_df['StartTime'] = appointments_df['StartTime'].dt.tz_localize('UTC').dt.tz_convert(asian_tz)

In [152]:
# drop rows whose StartTime is earlier than TimeQueued
appointments_df.loc[appointments_df['StartTime'] < appointments_df['TimeQueued'], 'StartTime'] = pd.NaT
appointments_df = appointments_df.dropna(subset=['StartTime'])

`9. endtime data cleaning`

In [153]:
# check values of EndTime
appointments_df['EndTime']

29         2021-02-03 05:45:00
32         2021-05-04 11:15:00
34         2021-05-19 05:30:00
35         2021-05-19 05:15:00
36         2021-06-02 11:15:00
                  ...         
9752921    2023-11-11 07:45:00
9752925    2023-11-14 08:30:00
9752929    2024-01-04 08:15:00
9752930    2024-01-13 08:15:00
9752931    2024-01-04 02:45:00
Name: EndTime, Length: 1247946, dtype: object

In [154]:
# convert to datetime data type
appointments_df['EndTime'] = pd.to_datetime(appointments_df['EndTime'], format='mixed', errors='coerce')

In [155]:
appointments_df['EndTime'] = appointments_df['EndTime'].dt.floor('s')

In [156]:
# check null values of EndTime
appointments_df['EndTime'].isnull().sum()

149353

In [157]:
# drop rows that have invalid TimeQueued
appointments_df = appointments_df.dropna(subset=['EndTime'])

In [158]:
# Set the timezone to GMT+8 (Asia/Singapore)
asian_tz = pytz.timezone('Asia/Singapore')

# Convert the 'QueueDate' column to the new timezone
appointments_df['EndTime'] = appointments_df['EndTime'].dt.tz_localize('UTC').dt.tz_convert(asian_tz)

In [159]:
# drop rows whose EndTime is earlier than StartTime
appointments_df['InvalidDuration'] = np.where(appointments_df['EndTime'] < appointments_df['StartTime'], 'Invalid', 'ok')
appointments_df = appointments_df[appointments_df['InvalidDuration'] != 'Invalid']
appointments_df = appointments_df.drop(columns=['InvalidDuration'])

In [160]:
#drop rows where status is complete but there is no endtime
mask = (appointments_df['status'] == 'Complete') & (appointments_df['EndTime'].isna())
appointments_df = appointments_df[~mask]

In [161]:
# drop rows where type is consultation and there is no end time
mask = (appointments_df['type'] == 'Consultation') & (appointments_df['EndTime'].isna())
appointments_df = appointments_df[~mask]

`10. type data cleaning`

In [162]:
# check values of type
appointments_df['type']

29         Consultation
32         Consultation
34         Consultation
35         Consultation
36         Consultation
               ...     
9752921    Consultation
9752925    Consultation
9752929    Consultation
9752930    Consultation
9752931    Consultation
Name: type, Length: 1098153, dtype: object

In [163]:
# check unique values of type
appointments_df['type'].unique()

array(['Consultation', 'Inpatient'], dtype=object)

In [164]:
type_vc = appointments_df['type'].value_counts()
print(type_vc)

type
Consultation    1094548
Inpatient          3605
Name: count, dtype: int64


In [165]:
# check null values of type
appointments_df['type'].isnull().sum()

0

`11. virtual data cleaning`

In [166]:
# check values of Virtual
appointments_df['Virtual']

29          True
32          True
34         False
35         False
36          True
           ...  
9752921    False
9752925    False
9752929    False
9752930    False
9752931    False
Name: Virtual, Length: 1098153, dtype: object

In [167]:
# check unique values of Virtual
appointments_df['Virtual'].unique()

array([True, False, nan], dtype=object)

In [168]:
virtual_vc = appointments_df['Virtual'].value_counts()
print(virtual_vc)

Virtual
False    549344
True     459630
Name: count, dtype: int64


In [169]:
# check null values of Virtual
appointments_df['Virtual'].isnull().sum()

89179

In [170]:
# all inpatient appointments are set to False
appointments_df.loc[appointments_df['type'] == 'Inpatient', 'Virtual'] = False

In [171]:
# drop all rows where Virtual == NaN
appointments_df = appointments_df.dropna(subset=['Virtual'])