# Processing referrals dataset

Tasks:
* Format weeks as single datetime values, rather than week range
* Rename columns for convenience
* Remove referrals from organisations other than CCGs
* Account for the merging of CCGs in 2020, by mapping 2019 CCG codes to their 2020 equivalents
* Eliminate clinic_type field, i.e. group by other fields and sum
* Remove CCG namec column

In [259]:
# Importing Python packages we are likely to need
import pandas as pd  # data tables
import numpy as np  # numerical operations
import os

ROOT_DIR = "~/cf/coding_club/python_club"
DATA_DIR = os.path.join(ROOT_DIR, 'data')
OUT_DIR = os.path.join(ROOT_DIR, 'outputs')
# CCG_STP_REG_MAP_FILE = os.path.join(DATA_DIR, f'Clinical_Commissioning_Group_to_STP_and_NHS_England_(Region)_(April_2020)_Lookup_in_England.csv')
WRITE = False
REFS_FILE = "referrals.cat.oct19_dec20.csv"
REFS_PATH = os.path.join(DATA_DIR, REFS_FILE)

In [260]:
ccg_mergers = pd.read_csv(os.path.join(DATA_DIR, "ccg_mergers_2019_2020.csv")).drop(columns=["Legacy (closing) CCG Name", "New CCG Name"])
ccg_19 = pd.read_csv(os.path.join(DATA_DIR, "Clinical_Commissioning_Groups_(April_2019)_Names_and_Codes_in_England.csv"))
ccg_20 = pd.read_csv(os.path.join(DATA_DIR, "Clinical_Commissioning_Groups_(April_2020)_Names_and_Codes_in_England.csv"))

In [261]:
df = pd.read_csv(REFS_PATH)
df

Unnamed: 0,Week,CCG_Code,CCG_Name,Priority,Specialty,Clinic_Type,Referrals
0,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Breast,10
1,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Gynaecology,8
2,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Haematology,1
3,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Head and Neck,2
4,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Lower GI,22
...,...,...,...,...,...,...,...
728918,21/12/2020 - 27/12/2020,DMT,FEDBUCKS LIMITED,Urgent,Gynaecology,Other_clinic,1
728919,21/12/2020 - 27/12/2020,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Breast,1
728920,21/12/2020 - 27/12/2020,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Skin,1
728921,21/12/2020 - 27/12/2020,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Urology,1


### Rename columns

In [262]:
rename_dict = dict(zip(df.columns, [col.lower() for col in df.columns]))
df = df.rename(columns=rename_dict)
df

Unnamed: 0,week,ccg_code,ccg_name,priority,specialty,clinic_type,referrals
0,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Breast,10
1,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Gynaecology,8
2,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Haematology,1
3,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Head and Neck,2
4,07/10/2019 - 13/10/2019,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Lower GI,22
...,...,...,...,...,...,...,...
728918,21/12/2020 - 27/12/2020,DMT,FEDBUCKS LIMITED,Urgent,Gynaecology,Other_clinic,1
728919,21/12/2020 - 27/12/2020,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Breast,1
728920,21/12/2020 - 27/12/2020,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Skin,1
728921,21/12/2020 - 27/12/2020,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Urology,1


In [263]:
df['ccg_code'].nunique()

341

### Reformat dates

In [264]:
df[['week_start', 'week_end']] = df['week'].str.split(' - ', expand=True)
df.drop(columns=['week', 'week_end'], inplace=True)
df['week_start'] = pd.to_datetime(df['week_start'], dayfirst=True)
df

Unnamed: 0,ccg_code,ccg_name,priority,specialty,clinic_type,referrals,week_start
0,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Breast,10,2019-10-07
1,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Gynaecology,8,2019-10-07
2,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Haematology,1,2019-10-07
3,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Head and Neck,2,2019-10-07
4,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Lower GI,22,2019-10-07
...,...,...,...,...,...,...,...
728918,DMT,FEDBUCKS LIMITED,Urgent,Gynaecology,Other_clinic,1,2020-12-21
728919,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Breast,1,2020-12-21
728920,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Skin,1,2020-12-21
728921,DR3,COASTAL HEALTH GP PROVIDER ORGANISATION,2 Week Wait,2WW,2WW Urology,1,2020-12-21


In [265]:
# confirm no dups
df.shape, df.drop_duplicates().shape

((728923, 7), (728923, 7))

### Remove organisations other than CCGs

In [266]:
ccg_codes = sorted(list(set(ccg_20['CCG20CDH']) | set(ccg_19['CCG19CDH']))) # all ccg codes I'd expect to be in referrals table
df = df[df['ccg_code'].isin(ccg_codes)]  # excluding non-ccgs
df

Unnamed: 0,ccg_code,ccg_name,priority,specialty,clinic_type,referrals,week_start
0,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Breast,10,2019-10-07
1,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Gynaecology,8,2019-10-07
2,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Haematology,1,2019-10-07
3,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Head and Neck,2,2019-10-07
4,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Lower GI,22,2019-10-07
...,...,...,...,...,...,...,...
728529,97R,NHS EAST SUSSEX CCG,Urgent,Sleep Medicine,Other_clinic,1,2020-12-21
728530,97R,NHS EAST SUSSEX CCG,Urgent,Surgery - Not Otherwise Specified,Other_clinic,4,2020-12-21
728531,97R,NHS EAST SUSSEX CCG,Urgent,Surgery - Vascular,Other_clinic,6,2020-12-21
728532,97R,NHS EAST SUSSEX CCG,Urgent,Urology,Other_clinic,15,2020-12-21


### Allow for CCG mergers

In [267]:
df = pd.merge(df, ccg_mergers, how='left', left_on='ccg_code', right_on='CCG19CDH')
df

Unnamed: 0,ccg_code,ccg_name,priority,specialty,clinic_type,referrals,week_start,CCG19CDH,CCG20CDH
0,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Breast,10,2019-10-07,00C,16C
1,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Gynaecology,8,2019-10-07,00C,16C
2,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Haematology,1,2019-10-07,00C,16C
3,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Head and Neck,2,2019-10-07,00C,16C
4,00C,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Lower GI,22,2019-10-07,00C,16C
...,...,...,...,...,...,...,...,...,...
696703,97R,NHS EAST SUSSEX CCG,Urgent,Sleep Medicine,Other_clinic,1,2020-12-21,,
696704,97R,NHS EAST SUSSEX CCG,Urgent,Surgery - Not Otherwise Specified,Other_clinic,4,2020-12-21,,
696705,97R,NHS EAST SUSSEX CCG,Urgent,Surgery - Vascular,Other_clinic,6,2020-12-21,,
696706,97R,NHS EAST SUSSEX CCG,Urgent,Urology,Other_clinic,15,2020-12-21,,


In [268]:
# ccgs that did not merge kept the same id as before
df['CCG20CDH'] = df['CCG20CDH'].fillna(df['ccg_code'])
df = df.drop(columns=['ccg_code', 'CCG19CDH'])
df = df.rename(columns={'CCG20CDH': 'ccg_code'})
df

Unnamed: 0,ccg_name,priority,specialty,clinic_type,referrals,week_start,ccg_code
0,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Breast,10,2019-10-07,16C
1,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Gynaecology,8,2019-10-07,16C
2,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Haematology,1,2019-10-07,16C
3,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Head and Neck,2,2019-10-07,16C
4,NHS DARLINGTON CCG,2 Week Wait,2WW,2WW Lower GI,22,2019-10-07,16C
...,...,...,...,...,...,...,...
696703,NHS EAST SUSSEX CCG,Urgent,Sleep Medicine,Other_clinic,1,2020-12-21,97R
696704,NHS EAST SUSSEX CCG,Urgent,Surgery - Not Otherwise Specified,Other_clinic,4,2020-12-21,97R
696705,NHS EAST SUSSEX CCG,Urgent,Surgery - Vascular,Other_clinic,6,2020-12-21,97R
696706,NHS EAST SUSSEX CCG,Urgent,Urology,Other_clinic,15,2020-12-21,97R


In [269]:
df['ccg_code'].nunique()


135

### Eliminate clinic_type

In [270]:
df = df.groupby(['week_start', 'ccg_code', 'ccg_name', 'specialty', 'priority'])['referrals'].sum().reset_index()
df

Unnamed: 0,week_start,ccg_code,ccg_name,specialty,priority,referrals
0,2019-10-07,00L,NHS NORTHUMBERLAND CCG,(blank),Routine,13
1,2019-10-07,00L,NHS NORTHUMBERLAND CCG,(blank),Urgent,1
2,2019-10-07,00L,NHS NORTHUMBERLAND CCG,2WW,2 Week Wait,349
3,2019-10-07,00L,NHS NORTHUMBERLAND CCG,Allergy,Routine,3
4,2019-10-07,00L,NHS NORTHUMBERLAND CCG,Cardiology,Routine,84
...,...,...,...,...,...,...
592679,2020-12-21,99M,NHS NORTH EAST HAMPSHIRE AND FARNHAM CCG,Surgery - Not Otherwise Specified,Urgent,2
592680,2020-12-21,99M,NHS NORTH EAST HAMPSHIRE AND FARNHAM CCG,Surgery - Vascular,Routine,2
592681,2020-12-21,99M,NHS NORTH EAST HAMPSHIRE AND FARNHAM CCG,Surgery - Vascular,Urgent,2
592682,2020-12-21,99M,NHS NORTH EAST HAMPSHIRE AND FARNHAM CCG,Urology,Routine,25


### Remove ccg_name

In [271]:
df = df.drop(columns=['ccg_name'])
df

Unnamed: 0,week_start,ccg_code,specialty,priority,referrals
0,2019-10-07,00L,(blank),Routine,13
1,2019-10-07,00L,(blank),Urgent,1
2,2019-10-07,00L,2WW,2 Week Wait,349
3,2019-10-07,00L,Allergy,Routine,3
4,2019-10-07,00L,Cardiology,Routine,84
...,...,...,...,...,...
592679,2020-12-21,99M,Surgery - Not Otherwise Specified,Urgent,2
592680,2020-12-21,99M,Surgery - Vascular,Routine,2
592681,2020-12-21,99M,Surgery - Vascular,Urgent,2
592682,2020-12-21,99M,Urology,Routine,25


### Save

In [272]:
df.to_csv(os.path.join(OUT_DIR, "data", "referrals_oct19_dec20.csv"), index=False)
