In [10]:
import numpy as np
import pandas as pd
import pgeocode as pg
import requests as rq
import os
from utils import listdir_nohidden, assign_types

pd.options.mode.chained_assignment = None

nomi = pg.Nominatim('gb')

path = 'raw_hosp/'

def get_coords(month):
    address = pd.read_csv(path+month+'/Address.csv', 
                        on_bad_lines='skip',
                        header=0,
                        names=['Period','Id','Name','Street','Area','Posttown','County','Postcode'],
                        )[['Id', 'Area', 'Postcode']]

    zips = address['Postcode'].to_numpy()
    geo_query1 = nomi.query_postal_code(zips)

    address['lat'] = geo_query1['latitude']
    address['long'] = geo_query1['longitude']

    for idx, row in address.iterrows(): # attempts to scrub coords from area if postcode missing
        if pd.isnull(row['lat']):
            loc = row['Area']
            geo_query2 = nomi.query_location(loc)
            geo_query2 = geo_query2.loc[geo_query2['state_name']=='Wales']
            if len(geo_query2)>0:
                address['lat'][idx] = geo_query2['latitude'].iloc[0]
                address['long'][idx] = geo_query2['longitude'].iloc[0]

    return address

month_folders = listdir_nohidden(path)

addresses = []
hosp_datas = []

for month in month_folders:
    hosp_path = [f for f in os.listdir(path+month) if f.startswith("HospitalData")][0]
    hosp_data = pd.read_csv(path+month+'/'+hosp_path, 
                          usecols=['Period', 'BNFCode', 'Hospital', 'Quantity','DDD'])
    
    hosp_data['Quantity/DDD'] = hosp_data['Quantity']/hosp_data['DDD']

    hosp_data = assign_types(hosp_data)

    address = get_coords(month)

    addresses.append(address)
    hosp_datas.append(hosp_data)

    print(hosp_path+' done!')

addresses = pd.concat(addresses, axis=0)
hosp_datas = pd.concat(hosp_datas, axis=0)

HospitalData202112.csv done!
HospitalData201809.csv done!
HospitalData201704.csv done!
HospitalData201909.csv done!
HospitalData201709.csv done!
HospitalData201902.csv done!
HospitalData202207.csv done!
HospitalData201706.csv done!
HospitalData201903.csv done!
HospitalData202003.csv done!
HospitalData201906.csv done!
HospitalData202004.csv done!
HospitalData202209.csv done!
HospitalData201802.csv done!
HospitalData202002.csv done!
HospitalData202005.csv done!
HospitalData201806.csv done!
HospitalData202208.csv done!
HospitalData201803.csv done!
HospitalData202206.csv done!
HospitalData202201.csv done!
HospitalData202011.csv done!
HospitalData202212.csv done!
HospitalData201710.csv done!
HospitalData201910.csv done!
HospitalData202010.csv done!
HospitalData201810.csv done!
HospitalData202107.csv done!
HospitalData202109.csv done!
HospitalData202108.csv done!
HospitalData202106.csv done!
HospitalData202101.csv done!
HospitalData202111.csv done!
HospitalData201901.csv done!
HospitalData20

In [11]:
merged = pd.merge(hosp_datas, addresses, 
                  left_on='Hospital',
                  right_on='Id',
                  how='left').drop(columns=['Id', 'Area'])

merged.head()

Unnamed: 0,Hospital,BNFCode,Quantity,DDD,Period,Quantity/DDD,type,Postcode,lat,long
0,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
1,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
2,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
3,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
4,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984


In [12]:
print(merged['Quantity/DDD'].isnull().sum())

merged.head()

0


Unnamed: 0,Hospital,BNFCode,Quantity,DDD,Period,Quantity/DDD,type,Postcode,lat,long
0,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
1,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
2,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
3,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984
4,V1101,0501012G0AAABAB,404.0,101.0,202112.0,4.0,penicillins,NP20 2UB,51.5877,-2.9984


In [13]:
merged.to_csv('FilteredHospData.csv.zip', index=False, compression='gzip')