In [37]:
import time

import chardet
import pandas as pd
from geopy.exc import GeocoderServiceError
from geopy.geocoders import Nominatim

# Paths to the input CSV files, relative to the notebook's working directory.
odhf_data: str = 'csv files/odhf_v1.1.csv'
population_data: str = 'csv files/population census.csv'

In [10]:
# Sample the first 10,000 bytes of the raw file and let chardet guess the
# encoding, so that the CSV can be read with the right codec.
with open(odhf_data, mode='rb') as raw_file:
    rawdata = raw_file.read(10000)

result = chardet.detect(rawdata)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [None]:
# Load the raw facility data using the Windows-1252 encoding.
df = pd.read_csv(odhf_data, encoding='Windows-1252')

# Count the nulls in every column and show only the columns that have any.
missing_data = df.isna().sum()
print(missing_data.loc[missing_data.gt(0)])

source_facility_type         1350
unit                         7028
street_no                     594
street_name                   520
city                           23
source_format_str_address    3098
CSDname                        39
CSDuid                        484
latitude                      484
longitude                     484
dtype: int64


In [None]:
# Export every row that has no city to its own CSV file. The cities are then
# looked up by hand from the geographical coordinates and added to that file.
no_city_mask = df['city'].isna()
missing_city_df = df.loc[no_city_mask]
missing_city_df.to_csv('missing_city_data.csv', index=False)

In [17]:
# Rebuild the dataset: keep the rows that already have a city, then append the
# manually completed rows for the facilities whose city was missing.
df_cleaned = df[df['city'].notna()]
updated_missing_city_df = pd.read_csv('csv files/missing_city_data.csv')

combined_df = pd.concat(
    [df_cleaned, updated_missing_city_df],
    ignore_index=True,
)
combined_df.to_csv('combined_healthcare_data.csv', index=False)

In [33]:
# Reload the combined dataset and list the columns that still contain
# missing values.
df = pd.read_csv('combined_healthcare_data.csv')

missing_data = df.isna().sum()
print(missing_data.loc[missing_data.gt(0)])

source_facility_type         1350
unit                         7028
street_no                     594
street_name                   520
source_format_str_address    3098
CSDname                        39
CSDuid                        484
latitude                      484
longitude                     484
dtype: int64


In [None]:
# Geocoding client; the user agent string identifies this notebook to Nominatim.
geolocator = Nominatim(user_agent="odhf_geocoder")

# NOTE: this rebinds odhf_data from the raw file to the combined dataset.
odhf_data = 'csv files/combined_healthcare_data.csv'
odhf_df = pd.read_csv(odhf_data)

def geocode_city_province(row, timeout=10):
    """Look up approximate coordinates for a row from its city and province.

    The query sent to the geocoder is "<city>, <province>, Canada", so the
    result is a city-level position rather than the facility's exact address.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide 'city' and 'province' entries.
    timeout : float, default 10
        Seconds to wait for the geocoding service on this request.

    Returns
    -------
    pd.Series
        Two elements, latitude then longitude. Both are None when the place
        was not found or the service request failed.
    """
    query = f"{row['city']}, {row['province']}, Canada"
    try:
        location = geolocator.geocode(query, timeout=timeout)
    except GeocoderServiceError as exc:
        # A timeout or service failure on one row must not abort the whole
        # run; report it and treat the row as "not found".
        print(f"Geocoding failed for {query!r}: {exc}")
        return pd.Series([None, None])

    if location is None:
        return pd.Series([None, None])
    return pd.Series([location.latitude, location.longitude])


# Fill in missing coordinates from each row's city and province.
# Rows with the same (city, province) pair produce the same query, so each
# pair is looked up only once and the answer is reused for the other rows.
coords_by_place = {}
for idx, row in odhf_df[odhf_df['latitude'].isnull()].iterrows():
    place = (row['city'], row['province'])
    if place not in coords_by_place:
        lat, lon = geocode_city_province(row)
        coords_by_place[place] = (lat, lon)
        time.sleep(1)                       # pause only after a real request, due to Nominatim's rate limit
    lat, lon = coords_by_place[place]
    odhf_df.at[idx, 'latitude'] = lat
    odhf_df.at[idx, 'longitude'] = lon

odhf_df.to_csv('odhf_geocoded.csv', index=False)

In [47]:
# Build the final dataset in one chain:
#   - drop the columns that are irrelevant for the final CSV file, to simplify the data;
#   - rename 'city' to 'geographic_name' so it matches the same column in the
#     population census dataset.
odhf_cleaned_df = (
    pd.read_csv('csv files/odhf_geocoded.csv')
    .drop(columns=['CSDname', 'CSDuid', 'Pruid', 'source_format_str_address'])
    .rename(columns={'city': 'geographic_name'})
)
odhf_cleaned_df.to_csv('odhf_cleaned.csv', index=False)
