In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use the 'seaborn-v0_8-darkgrid' matplotlib style for the charts in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')

%matplotlib inline

In [21]:
# Load the raw CSV extracts into DataFrames.
# low_memory=False makes pandas read each file in one pass instead of chunked
# type inference.
# NOTE(review): every file except px.csv is decoded with 'unicode_escape'.
# The garbled hospital names seen further down (e.g. "Para...aque") suggest
# this may not match the files' real encoding -- confirm against the source data.
_escaped_csv_options = dict(low_memory=False, encoding='unicode_escape')

px_df = pd.read_csv('px.csv', low_memory=False)
doctors_df = pd.read_csv('doctors.csv', **_escaped_csv_options)
clinics_df = pd.read_csv('clinics.csv', **_escaped_csv_options)
appointments_df = pd.read_csv('appointments.csv', **_escaped_csv_options)
dim_clinics = pd.read_csv('dim_clinics.csv', **_escaped_csv_options)

In [22]:
# Print the column / dtype / memory summary of the loaded frames, in load order.
# dim_clinics is not inspected here.
for _frame in (px_df, doctors_df, clinics_df, appointments_df):
    _frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6507813 entries, 0 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     object
 2   gender  object
dtypes: object(3)
memory usage: 149.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   P

In [23]:
# Preview the hospital name column of the clinic dimension.
dim_clinics.loc[:, 'hospitalname']

0        St. Luke's Medical Center-Global City
1                 Our Lady of Lourdes Hospital
2                        Makati Medical Center
3               Cardinal Santos Medical Center
4                                          NaN
                         ...                  
53957                                      NaN
53958                                      NaN
53959                                      NaN
53960      Malabon Hospital and Medical Center
53961                                      NaN
Name: hospitalname, Length: 53962, dtype: object

In [24]:
# Work on an independent copy of the clinic dimension.
# Plain assignment (dim_clinics_copy = dim_clinics) only binds a second name
# to the same DataFrame, so every cleaning step applied to the "copy" would
# also modify dim_clinics. .copy() keeps the loaded frame untouched.
dim_clinics_copy = dim_clinics.copy()

In [25]:
# Count missing hospital names before cleaning.
# If dim_clinics_copy is the same object as dim_clinics rather than a real
# copy, re-running this cell reports 0 here because the nulls are already filled.
null_clinics = dim_clinics['hospitalname'].isnull().sum()

# Replace missing names with the placeholder string 'NA'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is chained
# assignment, which pandas deprecates and which does not update the parent
# frame once copy-on-write is enabled.
# NOTE(review): pd.read_csv treats the text 'NA' as a missing value by default,
# so this placeholder turns back into NaN when the exported CSV is re-read
# unless keep_default_na=False (or a different placeholder) is used.
dim_clinics_copy['hospitalname'] = dim_clinics_copy['hospitalname'].fillna('NA')
null_clinics_after = dim_clinics_copy['hospitalname'].isnull().sum()

print("Null values:", null_clinics)
print("Null values after:", null_clinics_after)

Null values: 36424
Null values after: 0


In [32]:
# Trim whitespace from both ends of every hospital name
# (str.strip removes leading as well as trailing whitespace).
dim_clinics_copy['hospitalname'] = dim_clinics_copy['hospitalname'].str.strip()

# Flag the rows whose name contains at least one character outside the ASCII
# range; na=False makes missing names count as "no match".
_non_ascii_mask = dim_clinics_copy['hospitalname'].str.contains(r'[^\x00-\x7F]+', na=False)
special_characters = dim_clinics_copy.loc[_non_ascii_mask]

# List the distinct names that contain such characters.
special_characters['hospitalname'].unique()

array(['Unihealth-ParaÃ\x83Â\x83Ã\x82Â±aque Hospital and Medical Center',
       'MCUÃ\x83Â\x82Ã\x82Â\x96Filemon Dionisio Tanchoco Medical Foundation Hospital',
       "Las PiÃ\x83Â\x83Ã\x82Â±as Doctor's Hospital",
       "ParaÃ\x83Â\x83Ã\x82Â±aque Doctor's Hospital",
       'Las PiÃ\x83Â\x83Ã\x82Â±as City Medical Center',
       'Perpetual Help Hospital BiÃ\x83Â\x83Ã\x82Â±an',
       'Ospital ng BiÃ\x83Â\x83Ã\x82Â±an',
       'Medical Center ParaÃ\x83Â\x83Ã\x82Â±aque, Inc.',
       'Ospital ng ParaÃ\x83Â\x83Ã\x82Â±aque',
       'BiÃ\x83Â\x83Ã\x82Â±an Doctors Hospital, Inc.',
       'Healthserv Los BaÃ\x83Â\x83Ã\x82Â±os Medical Center',
       'Los BaÃ\x83Â\x83Ã\x82Â±os Doctors Hospital, Inc.',
       'Las PiÃ\x83Â\x83Ã\x82Â±as General Hospital and Satellite Trauma Center',
       'University Health Service-UPLB College, Los BaÃ\x83Â\x83Ã\x82Â±os',
       'DasmariÃ\x83Â\x83Ã\x82Â±as City Medical Center',
       'Alfredo E. MaraÃ\x83Â\x83Ã\x82Â±on, Sr. Memorial District Hospital',
     

In [35]:
# Convert every run of non-ASCII characters in the hospital names to ASCII.
#
# Blindly replacing each run with "n" is only right when the run is a garbled
# n-with-tilde. The listing above also contains a garbled dash
# ("MCU...Filemon Dionisio Tanchoco ..."), which would become "MCUnFilemon".
# Each run is therefore repaired first and mapped through a small table of
# known characters; "n" remains the fallback for anything unrecognised.

# Known non-ASCII characters and their ASCII stand-ins.
_ASCII_EQUIVALENTS = str.maketrans({
    '\u00f1': 'n',   # n with tilde
    '\u00d1': 'N',   # N with tilde
    '\x96': '-',     # en dash as a raw Windows-1252 byte value
    '\x97': '-',     # em dash as a raw Windows-1252 byte value
    '\u2013': '-',   # en dash
    '\u2014': '-',   # em dash
    '\x92': "'",     # right single quote as a raw Windows-1252 byte value
    '\u2019': "'",   # right single quote
})


def _repair_mojibake(text):
    """Undo UTF-8 text that was repeatedly mis-decoded as Latin-1.

    Re-encodes ``text`` as Latin-1 and decodes it as UTF-8 until that round
    trip fails or stops changing the text, then returns the last good value.
    Text that cannot make the round trip is returned unchanged.
    """
    while True:
        try:
            repaired = text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return text
        if repaired == text:
            return text
        text = repaired


def _ascii_for_run(match):
    """Return the ASCII replacement for one matched run of non-ASCII characters.

    Falls back to ``'n'`` when the repaired run still holds a character that
    is not in ``_ASCII_EQUIVALENTS``.
    """
    candidate = _repair_mojibake(match.group(0)).translate(_ASCII_EQUIVALENTS)
    return candidate if candidate.isascii() else 'n'


dim_clinics_copy['hospitalname'] = dim_clinics_copy['hospitalname'].str.replace(
    r'[^\x00-\x7F]+', _ascii_for_run, regex=True
)

# Re-run the non-ASCII scan; an empty array confirms every name is now ASCII.
special_characters = dim_clinics_copy[dim_clinics_copy['hospitalname'].str.contains(r'[^\x00-\x7F]+', na=False)]
special_characters['hospitalname'].unique()

array([], dtype=object)

In [40]:
# Rebind dim_clinics to the cleaned frame so the export below writes the cleaned names.
dim_clinics = dim_clinics_copy

In [42]:
# Write the cleaned clinic dimension to CSV, leaving out the row index.
# NOTE(review): missing hospital names were filled with the text 'NA' earlier;
# a default pd.read_csv of this file will parse those cells back as NaN.
dim_clinics.to_csv(path_or_buf='dim_clinics_cleaned.csv', index=False)