In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# sets the theme of the charts
plt.style.use('seaborn-v0_8-darkgrid')

%matplotlib inline

In [2]:
# imports the csv files
px_df = pd.read_csv('px.csv', low_memory=False)
doctors_df = pd.read_csv('doctors.csv', low_memory=False, encoding='unicode_escape')
clinics_df = pd.read_csv('clinics.csv', low_memory=False, encoding='unicode_escape')
appointments_df = pd.read_csv('appointments.csv', low_memory=False, encoding='unicode_escape')

In [3]:
# checks the content of the csv files
px_df.info()
doctors_df.info()
clinics_df.info()
appointments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6507813 entries, 0 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     object
 2   gender  object
dtypes: object(3)
memory usage: 149.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   P

In [4]:
clinics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   Province      53962 non-null  object
 5   RegionName    53962 non-null  object
dtypes: bool(1), object(5)
memory usage: 2.1+ MB


## Clinics Dataset
1. `clinicid`: unique indentifiers for the clinics (no missing values)
2. `hospitalname`: names of hospitals (with missing values)
3. `isHospital`: indicator if entity is hospital or not (no missing values)
4. `City`: names of the cities (no missing values)
5. `Province`: names of the provinces (no missing values)
6. `RegionName` names of the regions (no missing values)

`clinicid` data cleaning

In [5]:
clinics_df['clinicid']

0        77EE3BC58CE560B86C2B59363281E914
1        98C39996BF1543E974747A2549B3107C
2        9AEADE7BEADA35C83D3B344FBAFE43B0
3        FDBD31F2027F20378B1A80125FC862DB
4        205C3608ECB984C1F5F5D2F52C934428
                       ...               
53957    8DE279A56DBCECE9F9FFC514A7D5A378
53958    55A64961C9AA4134016786AE7202682E
53959    075E464A7D15E6E5B9D8F8F5B5B16BB9
53960    01063BCF7624297FBB408495BCB62904
53961    5DA48026B54B6EEB6062817CAA7C30EA
Name: clinicid, Length: 53962, dtype: object

In [6]:
# step 1: check for duplicates
unique_clinicid_count = clinics_df['clinicid'].nunique()

# step 2: check for missing values
missing_clinicid = clinics_df['clinicid'].isnull().sum()

# step 3: check if values are in capital letters
lowercase_clinicid = clinics_df['clinicid'].str.islower().sum()

# step 4: check if datatype is consistent
nonstring_clinicid = clinics_df['clinicid'].apply(type).ne(str).sum()

print("no. of unique clinic ids:", unique_clinicid_count)
print("no. of missing values:", missing_clinicid)
print("no. of lowercase values:",  lowercase_clinicid)
print("no. of non-string values:", nonstring_clinicid)

no. of unique clinic ids: 53962
no. of missing values: 0
no. of lowercase values: 0
no. of non-string values: 0


`hospitalname` data cleaning

In [7]:
clinics_df['hospitalname']

0        St. Luke's Medical Center-Global City
1                 Our Lady of Lourdes Hospital
2                        Makati Medical Center
3               Cardinal Santos Medical Center
4                                          NaN
                         ...                  
53957                                      NaN
53958                                      NaN
53959                                      NaN
53960      Malabon Hospital and Medical Center
53961                                      NaN
Name: hospitalname, Length: 53962, dtype: object

In [8]:
# step 1: check for missing values
missing_hospitalname = clinics_df['hospitalname'].isnull().sum()

# step 2: check if datatype is consistent
nonstring_hospitalname = clinics_df['clinicid'].apply(type).ne(str).sum()

print("no. of missing values:", missing_hospitalname)
print("no. of non-string values:", nonstring_hospitalname)

no. of missing values: 36424
no. of non-string values: 0


In [9]:
# address missing values by replacing null with "N/A"
clinics_df_copy = clinics_df
clinics_df_copy['hospitalname'].fillna("N/A", inplace=True)

# check for missing values
missing_hospitalname_copy = clinics_df_copy['hospitalname'].isnull().sum()
print("no. of missing values (before):", missing_hospitalname)
print("no. of missing values (after):", missing_hospitalname_copy)

# copy back the duplicate
clinics_df = clinics_df_copy
missing_hospitalname_final = clinics_df['hospitalname'].isnull().sum()
print("no. of missing values (final):", missing_hospitalname_copy)


no. of missing values (before): 36424
no. of missing values (after): 0
no. of missing values (final): 0


`isHospital` data cleaning

In [10]:
clinics_df['IsHospital']

0         True
1         True
2         True
3         True
4        False
         ...  
53957    False
53958    False
53959    False
53960     True
53961    False
Name: IsHospital, Length: 53962, dtype: bool

In [11]:
# step 1: check for missing values
missing_isHospital = clinics_df['IsHospital'].isnull().sum()

# step 2: check for data consistency
valid_values = [True, False]
invalid_values = clinics_df[~clinics_df['IsHospital'].isin(valid_values)]['IsHospital'].sum()

print("no. of missing values:", missing_isHospital)
print("no. of invalid values:", invalid_values)

no. of missing values: 0
no. of invalid values: 0


`City` data cleaning

In [12]:
clinics_df['City']

0          Taguig
1          Manila
2          Makati
3        San Juan
4          Burgos
           ...   
53957      Manila
53958      Manila
53959      Manila
53960     Malabon
53961     Malabon
Name: City, Length: 53962, dtype: object

In [13]:
clinics_df['City'].unique()

array(['Taguig', 'Manila', 'Makati', 'San Juan', 'Burgos', 'Butuan City',
       'Basco', 'Quezon City', 'Mandaue City', 'Santa Rosa City',
       'San Fernando City', 'Santiago City', 'Batangas City',
       'Iloilo City', 'Botolan', 'Las Piñas', 'Balabac', 'Pasig', 'Bilar',
       'Malabon', 'Caloocan', 'Muntinlupa', 'Candijay', 'San Nicolas',
       'Calape', 'Cebu City', 'Manito', 'Pasay', 'Cabanatuan City',
       'Antipolo City', 'Taytay', 'Cainta', 'Mandaluyong', 'Marikina',
       'San Leonardo', 'Dasmariñas City', 'Bacoor City', 'Malolos City',
       'Parañaque', 'Imus City', 'Silang', 'San Jose del Monte City',
       'San Mateo', 'Davao City', 'Kidapawan City', 'Tagbilaran City',
       'General Trias', 'Balamban', 'Bacolod City', 'Cagayan de Oro',
       'Guagua', 'Lubao', 'Zamboanga City', 'Rizal', 'Santa Cruz',
       'Asuncion', 'Tagum City', 'Cardona', 'Morong', 'Tanay', 'Teresa',
       'Meycauayan City', 'Santo Domingo', 'San Jose City',
       'Trece Martires City',

In [18]:
# step 1: check for missing values
missing_city = clinics_df['City'].isnull().sum()

# step 2: check number of values with special characters
special_char_count = clinics_df['City'].apply(lambda x: bool(re.search('[^a-zA-Z0-9\s]', x))).sum()

print("no. of missing values:", missing_city)
print("no. of values with special characters:", special_char_count)

no. of missing values: 0
no. of values with special characters: 2142


In [21]:
# to address values with special characters
# iterate through each city and identify special characters
for city in clinics_df['City']:
    special_chars = re.findall(r'[^a-zA-Z0-9\s]', str(city))
    if special_chars:
        print(f"City: {city}, Special Characters: {', '.join(special_chars)}")

City: Las Piñas, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Parañaque, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Parañaque, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Las Piñas, Special Characters: ñ
City: Lapu-Lapu City, Special Characters: -
City: Dasmariñas City, Special Characters: ñ
City: Dasmariñas City, Special Characters: ñ
City: Parañaque, Special Characters: ñ
City: Parañaque, Special Characters: ñ
City: Las Piñas, Special Characters: ñ

In [23]:
# since most of the values contain ñ, convert this to n
clinics_df_copy = clinics_df
clinics_df_copy['City'] = clinics_df_copy['City'].str.replace('ñ', 'n')

# iterate through each city and identify special characters
for city in clinics_df_copy['City']:
    special_chars = re.findall(r'[^a-zA-Z0-9\s]', str(city))
    if special_chars:
        print(f"City: {city}, Special Characters: {', '.join(special_chars)}")

City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Sanchez-Mira, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Libjo (Albor), Special Characters: (, )
City: M'lang, Special Characters: '
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: M'lang, Special Characters: '
City: Lapu-Lapu City, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
Ci

In [24]:
# copy the duplicate to original
clinics_df = clinics_df_copy
for city in clinics_df['City']:
    special_chars = re.findall(r'[^a-zA-Z0-9\s]', str(city))
    if special_chars:
        print(f"City: {city}, Special Characters: {', '.join(special_chars)}")

City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Sanchez-Mira, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Libjo (Albor), Special Characters: (, )
City: M'lang, Special Characters: '
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: Lapu-Lapu City, Special Characters: -
City: M'lang, Special Characters: '
City: Lapu-Lapu City, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
City: Anini-y, Special Characters: -
Ci