In [1]:
import numpy as np
import pandas as pd
import pgeocode as pg
import requests as rq
import os
from utils import listdir_nohidden, antibiotics

pd.options.mode.chained_assignment = None

nomi = pg.Nominatim('gb')

path = 'raw_hosp/'

def get_coords(month):
    address = pd.read_csv(path+month+'/Address.csv', 
                        on_bad_lines='skip',
                        header=0,
                        names=['Period','Id','Name','Street','Area','Posttown','County','Postcode'])

    zips = address['Postcode'].to_numpy()
    geo_query1 = nomi.query_postal_code(zips)

    address['lat'] = geo_query1['latitude']
    address['long'] = geo_query1['longitude']

    for idx, row in address.iterrows(): # attempts to scrub coords from area if postcode missing
        if pd.isnull(row['lat']):
            loc = row['Area']
            geo_query2 = nomi.query_location(loc)
            geo_query2 = geo_query2.loc[geo_query2['state_name']=='Wales']
            if len(geo_query2)>0:
                address['lat'][idx] = geo_query2['latitude'].iloc[0]
                address['long'][idx] = geo_query2['longitude'].iloc[0]

    return address

month_folders = listdir_nohidden(path)

addresses = []
BNFs = []
ChemSubstances = []
gp_datas = []

for month in month_folders:
    bnf = pd.read_csv(path+month+'/BNF.csv', on_bad_lines='skip')
    chem = pd.read_csv(path+month+'/ChemSubstance.csv', on_bad_lines='skip')

    gp_path = [f for f in os.listdir(path+month) if f.startswith("HospitalData")][0]
    gp_data = pd.read_csv(path+month+'/'+gp_path, on_bad_lines='skip')

    antibiotic_mask = gp_data['BNFCode'].str.startswith(tuple(antibiotics.values()))
    gp_data = gp_data.loc[antibiotic_mask]

    address = get_coords(month)

    addresses.append(address)
    BNFs.append(bnf)
    ChemSubstances.append(chem)
    gp_datas.append(gp_data)

    print(gp_path+' done!')

addresses = pd.concat(addresses, axis=0)
gp_datas = pd.concat(gp_datas, axis=0)
BNFs = pd.concat(BNFs, axis=0)
ChemSubstances = pd.concat(ChemSubstances, axis=0)

HospitalData202112.csv done!
HospitalData201809.csv done!
HospitalData201704.csv done!
HospitalData201909.csv done!
HospitalData201709.csv done!
HospitalData201902.csv done!
HospitalData202207.csv done!
HospitalData201706.csv done!
HospitalData201903.csv done!
HospitalData202003.csv done!
HospitalData201906.csv done!
HospitalData202004.csv done!
HospitalData202209.csv done!
HospitalData201802.csv done!
HospitalData202002.csv done!
HospitalData202005.csv done!
HospitalData201806.csv done!
HospitalData202208.csv done!
HospitalData201803.csv done!
HospitalData202206.csv done!
HospitalData202201.csv done!
HospitalData202011.csv done!
HospitalData201710.csv done!
HospitalData201910.csv done!
HospitalData202010.csv done!
HospitalData201810.csv done!
HospitalData202107.csv done!
HospitalData202109.csv done!
HospitalData202108.csv done!
HospitalData202106.csv done!
HospitalData202101.csv done!
HospitalData202111.csv done!
HospitalData201901.csv done!
HospitalData201707.csv done!
HospitalData20

In [2]:
print(gp_datas.columns)

red_gp_data = gp_datas.drop(columns=['HB Name',
                                     'Trust Name',
                                     'Trust',
                                     'Directorate Name',
                                     'ActCost',
                                     'DDD',
                                     'ADQ',
                                     'ActCost',
                                     'Items',
                                     'NIC'])

Index(['HB', 'HB Name', 'Trust', 'Trust Name', 'Hospital', 'HosName',
       'Directorate', 'Directorate Name', 'BNFCode', 'BNFName', 'Items', 'NIC',
       'ActCost', 'Quantity', 'DDD', 'ADQ', 'Period', ' Directorate Name'],
      dtype='object')


In [3]:
print(addresses.columns)

red_addresses = addresses.drop(columns=['Street',
                                        'Area',
                                        'Posttown',
                                        'County',
                                        'Period'])

Index(['Period', 'Id', 'Name', 'Street', 'Area', 'Posttown', 'County',
       'Postcode', 'lat', 'long'],
      dtype='object')


In [4]:
merged = pd.merge(red_gp_data, red_addresses, 
                  left_on='Hospital',
                  right_on='Id',
                  how='left')

merged.head()

Unnamed: 0,HB,Hospital,HosName,Directorate,BNFCode,BNFName,Quantity,Period,Directorate Name,Id,Name,Postcode,lat,long
0,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
1,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
2,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
3,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
4,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984


In [5]:
merged.to_csv('FilteredHospData.csv.zip', index=False, compression='gzip')