In [1]:
import numpy as np
import pandas as pd
import pgeocode as pg
import requests as rq
import os
from utils import listdir_nohidden, antibiotics

pd.options.mode.chained_assignment = None

nomi = pg.Nominatim('gb')

path = 'raw_hosp/'

def get_coords(month):
    address = pd.read_csv(path+month+'/Address.csv', 
                        on_bad_lines='skip',
                        header=0,
                        names=['Period','Id','Name','Street','Area','Posttown','County','Postcode'])

    zips = address['Postcode'].to_numpy()
    geo_query1 = nomi.query_postal_code(zips)

    address['lat'] = geo_query1['latitude']
    address['long'] = geo_query1['longitude']

    for idx, row in address.iterrows(): # attempts to scrub coords from area if postcode missing
        if pd.isnull(row['lat']):
            loc = row['Area']
            geo_query2 = nomi.query_location(loc)
            geo_query2 = geo_query2.loc[geo_query2['state_name']=='Wales']
            if len(geo_query2)>0:
                address['lat'][idx] = geo_query2['latitude'].iloc[0]
                address['long'][idx] = geo_query2['longitude'].iloc[0]

    return address

month_folders = listdir_nohidden(path)

addresses = []
BNFs = []
ChemSubstances = []
gp_datas = []

for month in month_folders:
    bnf = pd.read_csv(path+month+'/BNF.csv', on_bad_lines='skip')
    chem = pd.read_csv(path+month+'/ChemSubstance.csv', on_bad_lines='skip')

    gp_path = [f for f in os.listdir(path+month) if f.startswith("HospitalData")][0]
    gp_data = pd.read_csv(path+month+'/'+gp_path, on_bad_lines='skip')

    antibiotic_mask = gp_data['BNFCode'].str.startswith(tuple(antibiotics.values()))
    gp_data = gp_data.loc[antibiotic_mask]

    address = get_coords(month)

    addresses.append(address)
    BNFs.append(bnf)
    ChemSubstances.append(chem)
    gp_datas.append(gp_data)

    print(gp_path+' done!')

addresses = pd.concat(addresses, axis=0)
gp_datas = pd.concat(gp_datas, axis=0)
BNFs = pd.concat(BNFs, axis=0)
ChemSubstances = pd.concat(ChemSubstances, axis=0)

Attempting HospitalData202112
done!
Attempting Hospital Data Extract - September 2018
done!
Attempting Hospital Data Extract - Apr 2017
done!
Attempting Hospital Data Extract - September 2019
done!
Attempting Hospital Data Extract - September 2017
done!
Attempting Hospital Data Extract - February 2019
done!
Attempting HospitalData202207
done!
Attempting Hospital Data Extract - June 2017
done!
Attempting Hospital Data Extract - March 2019
done!
Attempting HospitalData202003
done!
Attempting Hospital Data Extract - June 2019
done!
Attempting HospitalData202004
done!
Attempting HospitalData202209
done!
Attempting Hospital Data Extract - February 2018
done!
Attempting Hospital Data Extract - February 2020
done!
Attempting HospitalData202005
done!
Attempting Hospital Data Extract - June 2018
done!
Attempting HospitalData202208
done!
Attempting Hospital Data Extract - March 2018
done!
Attempting HospitalData202206
done!
Attempting HospitalData202201
done!
Attempting HospitalData202011
done!


In [11]:
print(gp_datas.columns)

red_gp_data = gp_datas.drop(columns=['HB Name',
                                     'Trust Name',
                                     'Trust',
                                     'Directorate Name',
                                     'ActCost',
                                     'DDD',
                                     'ADQ',
                                     'ActCost',
                                     'Items',
                                     'NIC'])

Index(['HB', 'HB Name', 'Trust', 'Trust Name', 'Hospital', 'HosName',
       'Directorate', 'Directorate Name', 'BNFCode', 'BNFName', 'Items', 'NIC',
       'ActCost', 'Quantity', 'DDD', 'ADQ', 'Period', ' Directorate Name'],
      dtype='object')


In [12]:
print(addresses.columns)

red_addresses = addresses.drop(columns=['Street',
                                        'Area',
                                        'Posttown',
                                        'County'])

Index(['Period', 'Id', 'Name', 'Street', 'Area', 'Posttown', 'County',
       'Postcode', 'lat', 'long'],
      dtype='object')


In [13]:
merged = pd.merge(red_gp_data, red_addresses, 
                  left_on='Hospital',
                  right_on='Id',
                  how='left')

merged.head()

Unnamed: 0,HB,Hospital,HosName,Directorate,BNFCode,BNFName,Quantity,Period_x,Directorate Name,Period_y,Id,Name,Postcode,lat,long
0,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,202112,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
1,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,201809,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
2,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,201704,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
3,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,201909,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984
4,V1,V1101,Royal Gwent Hospital,V110101,0501012G0AAABAB,Fluclox Sod_Cap 500mg,404,202112,,201709,V1101,Royal Gwent Hospital,NP20 2UB,51.5877,-2.9984


In [14]:
merged.to_csv('FilteredHospData.csv.zip', index=False, compression='gzip')