In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# sets the theme of the charts: apply the 'seaborn-v0_8-darkgrid' matplotlib style to every figure
plt.style.use('seaborn-v0_8-darkgrid')

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [2]:
# Load the source CSV files into DataFrames.
# px.csv is the only file read without an explicit encoding.
px_df = pd.read_csv('px.csv', low_memory=False)

# The remaining files share the same reader options, so they are read in one pass
# (in the listed order) and unpacked into their respective names.
doctors_df, clinics_df, appointments_df, locations_df = (
    pd.read_csv(csv_name, low_memory=False, encoding='unicode_escape')
    for csv_name in ('doctors.csv', 'clinics.csv', 'appointments.csv', 'dim_locations.csv')
)

In [3]:
# Print the structural summary (row count, columns, dtypes) of each loaded frame,
# in the same order the files were read.
for frame in (px_df, doctors_df, clinics_df, appointments_df):
    frame.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6507813 entries, 0 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     object
 2   gender  object
dtypes: object(3)
memory usage: 149.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   P

In [4]:
# summary of the locations frame: row count, column names, non-null counts and dtypes
locations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809 entries, 0 to 808
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   LocationId  808 non-null    object
 1   City        808 non-null    object
 2   Province    808 non-null    object
 3   RegionName  808 non-null    object
dtypes: object(4)
memory usage: 25.4+ KB


## Location Dataset
1. `City`: names of the cities (808 of 809 rows non-null, i.e. one missing value)
2. `Province`: names of the provinces (808 of 809 rows non-null, i.e. one missing value)
3. `RegionName`: names of the regions (808 of 809 rows non-null, i.e. one missing value)

In [7]:
# List the distinct provinces recorded under each region so misplaced ones stand out.
province_per_region = locations_df.groupby('RegionName')['Province'].unique()

for region_name, province_names in province_per_region.items():
    # One header line, one "- <province>" line per province, then a blank separator line.
    listing = [f"Region: {region_name}"] + [f"- {name}" for name in province_names]
    print("\n".join(listing), end="\n\n")

Region: Bangsamoro Autonomous Region in Muslim Mindanao (BARMM)
- Basilan
- Lanao del Sur
- Maguindanao
- Sulu

Region: Bicol Region (V)
- Camarines Norte
- Camarines Sur
- Catanduanes
- Albay
- Masbate
- Sorsogon

Region: CALABARZON (IV-A)
- Batangas
- Cavite
- Laguna
- Angeles
- Quezon
- Rizal

Region: Cagayan Valley (II)
- Batanes
- Cagayan
- Albay
- Nueva Vizcaya
- Quirino
- Isabela

Region: Caraga (XIII)
- Dinagat Islands
- Agusan del Norte
- Agusan del Sur
- Surigao del Norte
- Surigao del Sur

Region: Central Luzon (III)
- Bulacan
- Bataan
- Nueva Ecija
- Pampanga
- Aurora
- Tarlac
- Zambales

Region: Central Visayas (VII)
- Bohol
- Cebu
- Negros Oriental
- Siquijor

Region: Cordillera Administrative Region (CAR)
- Benguet
- Abra
- Ifugao
- Kalinga
- Mountain Province
- Apayao

Region: Davao Region (XI)
- Compostela Valley
- Davao Occidental
- Davao del Norte
- Davao del Sur
- Davao Oriental
- Manila

Region: Eastern Visayas (VIII)
- Biliran
- Leyte
- Northern Samar
- Samar
- So

In [8]:
# Duplicate the dataframe so the cleaning steps do not modify the originally loaded data.
# A plain assignment would only create a second name for the same object, so every
# in-place edit made through `locations_df_copy` would also change `locations_df`.
locations_df_copy = locations_df.copy()

In [24]:
# Outliers spotted in the province-per-region listing:
# CALABARZON - Angeles
# REGION II - Albay
# REGION III - insert Angeles
# REGION XI - Compostela Valley, Manila
# NCR - Abra, Camarines Sur, Cavite

# Province -> region that every row of that province is reassigned to.
region_corrections = {
    'Angeles': 'Central Luzon (III)',  # removed from CALABARZON, inserted in REGION III
    'Albay': 'Bicol Region (V)',
    'Manila': 'National Capital Region (NCR)',
    'Abra': 'Cordillera Administrative Region (CAR)',
    'Camarines Sur': 'Bicol Region (V)',
    'Cavite': 'CALABARZON (IV-A)',
}

for province_name, corrected_region in region_corrections.items():
    locations_df_copy.loc[locations_df_copy['Province'] == province_name, 'RegionName'] = corrected_region

# Change Compostela Valley to Davao de Oro.
# The column selector is required: assigning to `.loc[mask]` without it would overwrite
# every column of the matching rows (LocationId, City, ...) with the string 'Davao de Oro'.
locations_df_copy.loc[locations_df_copy['Province'] == 'Compostela Valley', 'Province'] = 'Davao de Oro'
locations_df_copy.loc[locations_df_copy['Province'] == 'Davao de Oro', 'RegionName'] = 'Davao Region (XI)'


# check changes
province_per_region_copy = locations_df_copy.groupby('RegionName')['Province'].unique()

for region, provinces in province_per_region_copy.items():
    print(f"Region: {region}")
    for province in provinces:
        print(f"- {province}")
    print()  # Add a newline between regions

Region: Bangsamoro Autonomous Region in Muslim Mindanao (BARMM)
- Basilan
- Lanao del Sur
- Maguindanao
- Sulu

Region: Bicol Region (V)
- Albay
- Camarines Sur
- Camarines Norte
- Catanduanes
- Masbate
- Sorsogon

Region: CALABARZON (IV-A)
- Batangas
- Cavite
- Laguna
- Quezon
- Rizal

Region: Cagayan Valley (II)
- Batanes
- Cagayan
- Nueva Vizcaya
- Quirino
- Isabela

Region: Caraga (XIII)
- Dinagat Islands
- Agusan del Norte
- Agusan del Sur
- Surigao del Norte
- Surigao del Sur

Region: Central Luzon (III)
- Bulacan
- Bataan
- Nueva Ecija
- Pampanga
- Aurora
- Tarlac
- Zambales
- Angeles

Region: Central Visayas (VII)
- Bohol
- Cebu
- Negros Oriental
- Siquijor

Region: Cordillera Administrative Region (CAR)
- Benguet
- Abra
- Ifugao
- Kalinga
- Mountain Province
- Apayao

Region: Davao Region (XI)
- Davao de Oro
- Davao Occidental
- Davao del Norte
- Davao del Sur
- Davao Oriental

Region: Eastern Visayas (VIII)
- Biliran
- Leyte
- Northern Samar
- Samar
- Southern Leyte
- Eastern

In [26]:
# bind locations_df to the cleaned frame so the following steps work on the corrected regions
locations_df = locations_df_copy

In [27]:
# export to csv: write the locations frame to 'dim_locations_cleaned.csv'
# (index=False leaves the DataFrame index out of the file)
locations_df.to_csv('dim_locations_cleaned.csv', index=False)