# UKHSA Vaccination Data (https://ukhsa-dashboard.data.gov.uk/covid-19-archive-data-download)
## Data Extraction and Creation of Data Structure for Final Cleaning

### Imports

In [1]:
import pandas as pd
from pathlib import Path
import re
import os
from IPython.display import display, HTML 

### Function to stratify all available files/metrics

#### From looking at the directory there are 519 files spanning 3 years!

##### I need to come up with a way of grouping the files by metric and then combining each metric's data across the available years so that we can analyse it further. I manually merged all of the data from the online zip (https://ukhsa-dashboard.data.gov.uk/covid-19-archive-data-download) and restructured it for easier sorting/searching/combining.

In [2]:
def split_by_case(word):
    """Break a CamelCase string into its capitalised words.

    Each returned item is one uppercase letter followed by any run of
    lowercase letters. Characters that are not part of such a run (digits,
    underscores, leading lowercase text) are left out of the result.
    """
    capitalised_word = re.compile(r'[A-Z][a-z]*')
    return capitalised_word.findall(word)

In [3]:
path_for_vaccine_datasets = r'../1_raw_files/ukhsa_vaccine_data/spreadsheets'

directory_path = Path(path_for_vaccine_datasets)

# Number of trailing characters removed from a file name to obtain the metric
# stem shared by all of that metric's files (presumably a '_YYYY.csv' year
# suffix - confirm against the files in the directory).
YEAR_SUFFIX_LENGTH = 9


def _metric_label(raw_name):
    """Build the human-readable label shown in the metric listings.

    The stem is capitalised and split into CamelCase words with
    ``split_by_case``; a leading 'Cum' is expanded to 'Cumulative'; the second
    underscore-separated segment of the stem is appended, capitalised, as the
    final word. A '65plus' / '75plus' age band is lowercase, so
    ``split_by_case`` skips it; it is re-inserted as e.g. '65 plus' just
    before that final word.
    """
    ending = raw_name.split('_')[1]
    age = ""
    if "75plus" in raw_name or "65plus" in raw_name:
        index = raw_name.find("plus") - 2
        age = raw_name[index:index+2] + " " + raw_name[index+2:index+6] + " "
    words = split_by_case(raw_name[0].upper() + raw_name[1:])
    if words[0] == 'Cum':
        words[0] = 'Cumulative'
    return ' '.join(words) + " " + age + ending.capitalize()


# Start from empty containers so a re-run never leaves values from an earlier
# run in the kernel when the data directory is missing or empty.
file_list = []
file_paths = []
available_data = []
metrics = []
metrics_dictionary = {}

if directory_path.is_dir():
    file_list = [f.name for f in directory_path.iterdir() if f.is_file()]

# Only CSV files are data; anything else (hidden files, notes, ...) must not be
# parsed as a metric name.
csv_files = [file for file in file_list if file.endswith('.csv')]

if csv_files:
    file_paths = [f'{path_for_vaccine_datasets}/{file}' for file in csv_files]

    # One entry per metric stem, in first-seen order.
    available_data = list(dict.fromkeys(file[:-YEAR_SUFFIX_LENGTH] for file in csv_files))

    for data in available_data:
        metric = _metric_label(data)
        # split_by_case drops digits, so stems that differ only by a number
        # (the data contains names such as 'Spring22' / 'Spring23') receive the
        # same label. The later stem replaces the earlier one, exactly as
        # before, but the loss is now reported instead of being silent.
        if metric in metrics_dictionary:
            print(f"Warning: '{metrics_dictionary[metric]}' and '{data}' share the label "
                  f"'{metric}'; only '{data}' can be selected.")
        metrics.append(metric)
        metrics_dictionary[metric] = data

    # Alphabetical label order defines the numbering used by the cells below.
    metrics_dictionary = dict(sorted(metrics_dictionary.items()))
else:
    # Directory missing, empty, or without CSV files: point the user at the download.
    google_drive_url = 'https://drive.google.com/file/d/1ata9dPfKjFPwAGGjA95-4aSEpvLctRfH/view?usp=sharing'
    display(HTML(f'<a href="{google_drive_url}" target="_blank">Click here to download the vaccination data from Google Drive</a>'))
    print(f"Click the above link to download the data from Google Drive and save in the directory: {path_for_vaccine_datasets}")

### List of all metrics available

In [4]:
# Number every metric label from 1 so a metric can be picked by its position.
for position, label in enumerate(metrics_dictionary, start=1):
    print(f"{position}: {label}. ")

1: Cumulative People Vaccinated Autumn By Vaccination Date 65 plus Ltla. 
2: Cumulative People Vaccinated Autumn By Vaccination Date 65 plus Nation. 
3: Cumulative People Vaccinated Autumn By Vaccination Date 65 plus Region. 
4: Cumulative People Vaccinated Autumn By Vaccination Date 65 plus Utla. 
5: Cumulative People Vaccinated Booster Dose By Publish Date Nation. 
6: Cumulative People Vaccinated Complete By Publish Date Ltla. 
7: Cumulative People Vaccinated Complete By Publish Date Nation. 
8: Cumulative People Vaccinated Complete By Publish Date Overview. 
9: Cumulative People Vaccinated Complete By Publish Date Region. 
10: Cumulative People Vaccinated Complete By Publish Date Utla. 
11: Cumulative People Vaccinated Complete By Vaccination Date Ltla. 
12: Cumulative People Vaccinated Complete By Vaccination Date Nation. 
13: Cumulative People Vaccinated Complete By Vaccination Date Region. 
14: Cumulative People Vaccinated Complete By Vaccination Date Utla. 
15: Cumulative People

### List of the regional metrics available

In [5]:
# Positions are counted over the whole dictionary, so the numbers printed here
# match the full listing; only labels containing "Region" are shown.
for position, label in enumerate(metrics_dictionary, start=1):
    if "Region" not in label:
        continue
    print(f"{position}: {label}. ")

3: Cumulative People Vaccinated Autumn By Vaccination Date 65 plus Region. 
9: Cumulative People Vaccinated Complete By Publish Date Region. 
13: Cumulative People Vaccinated Complete By Vaccination Date Region. 
18: Cumulative People Vaccinated First Dose By Publish Date Region. 
23: Cumulative People Vaccinated First Dose By Vaccination Date Region. 
28: Cumulative People Vaccinated Second Dose By Publish Date Region. 
33: Cumulative People Vaccinated Second Dose By Vaccination Date Region. 
37: Cumulative People Vaccinated Spring By Vaccination Date 75 plus Region. 
43: Cumulative People Vaccinated Third Injection By Publish Date Region. 
47: Cumulative People Vaccinated Third Injection By Vaccination Date Region. 
51: Cumulative Vaccination Autumn Uptake By Vaccination Date Percentage 65 plus Region. 
58: Cumulative Vaccination Complete Coverage By Vaccination Date Percentage Region. 
66: Cumulative Vaccination First Dose Uptake By Vaccination Date Percentage Region. 
74: Cumulativ

### List of the national metrics available

In [6]:
# Positions are counted over the whole dictionary, so the numbers printed here
# match the full listing; only labels containing "Nation" are shown.
for position, label in enumerate(metrics_dictionary, start=1):
    if "Nation" not in label:
        continue
    print(f"{position}: {label}. ")

2: Cumulative People Vaccinated Autumn By Vaccination Date 65 plus Nation. 
5: Cumulative People Vaccinated Booster Dose By Publish Date Nation. 
7: Cumulative People Vaccinated Complete By Publish Date Nation. 
12: Cumulative People Vaccinated Complete By Vaccination Date Nation. 
16: Cumulative People Vaccinated First Dose By Publish Date Nation. 
21: Cumulative People Vaccinated First Dose By Vaccination Date Nation. 
26: Cumulative People Vaccinated Second Dose By Publish Date Nation. 
31: Cumulative People Vaccinated Second Dose By Vaccination Date Nation. 
36: Cumulative People Vaccinated Spring By Vaccination Date 75 plus Nation. 
39: Cumulative People Vaccinated Third Dose By Publish Date Nation. 
41: Cumulative People Vaccinated Third Injection By Publish Date Nation. 
46: Cumulative People Vaccinated Third Injection By Vaccination Date Nation. 
50: Cumulative Vaccination Autumn Uptake By Vaccination Date Percentage 65 plus Nation. 
53: Cumulative Vaccination Booster Dose Upta

# I selected the following statistics from analysing the list of regional metrics above: 105, 135, 165, 199, 212, 208

### Select a statistic to combine data across years into a dataframe for further analysis/save as csv

#### Change the number below to generate a dataframe of your desired metric which can be saved as a csv

In [49]:
# 1-based position of the chosen metric in the numbered listings printed above
# (the lookup below reads keys[selected_data-1]).
selected_data = 208

In [50]:
# Labels in dictionary order, i.e. the order used by the numbered listings.
keys = list(metrics_dictionary.keys())

# selected_data is 1-based. Without this check, 0 or a negative number would
# wrap around through Python's negative indexing and silently pick a metric
# from the end of the list instead of failing.
if not 1 <= selected_data <= len(keys):
    raise IndexError(
        f"selected_data must be between 1 and {len(keys)}, got {selected_data}"
    )

selected_key = keys[selected_data-1]
# Raw metric stem (file-name prefix) stored under the chosen label.
selected_metric = metrics_dictionary[selected_key]

In [51]:
# Keep the files whose metric stem is exactly the selected one. The stem is the
# file name without its last 9 characters, which is how the metric stems were
# derived from the file names in the first place. A plain substring test on the
# whole path would also accept the directory part and any longer stem that
# merely contains the selected one.
available_files_for_metric = [
    file for file in file_paths
    if Path(file).name[:-9] == selected_metric
]

In [52]:
# Load every CSV file found for the selected metric into its own DataFrame,
# keeping the order of available_files_for_metric.
dataframes = [pd.read_csv(csv_path) for csv_path in available_files_for_metric]

In [53]:
# Stack the per-file frames into a single DataFrame; ignore_index=True discards
# each file's own row index and renumbers the rows from 0.
combined_df = pd.concat(dataframes, ignore_index=True)

In [54]:
# Show the combined frame as this cell's output.
combined_df

Unnamed: 0,date,area_type,area_code,area_name,metric,metric_name,age,cumPeopleVaccinatedAutumn22ByVaccinationDate,cumPeopleVaccinatedCompleteByVaccinationDate,cumPeopleVaccinatedSecondDoseByVaccinationDate,...,cumVaccinationSecondDoseUptakeByVaccinationDatePercentage,cumVaccinationSpring22UptakeByVaccinationDatePercentage,cumVaccinationThirdInjectionUptakeByVaccinationDatePercentage,newPeopleVaccinatedAutumn22ByVaccinationDate,newPeopleVaccinatedCompleteByVaccinationDate,newPeopleVaccinatedFirstDoseByVaccinationDate,newPeopleVaccinatedSecondDoseByVaccinationDate,newPeopleVaccinatedSpring22ByVaccinationDate,newPeopleVaccinatedSpring23ByVaccinationDate,VaccineRegisterPopulationByVaccinationDate
0,2022-01-01,region,E12000003,Yorkshire and The Humber,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,05_11,,7.0,7.0,...,0.0,,0.0,,0,0,0,,,484231
1,2022-01-01,region,E12000003,Yorkshire and The Humber,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,12_15,,3510.0,3510.0,...,1.2,,0.0,,24,29,24,,,288773
2,2022-01-01,region,E12000003,Yorkshire and The Humber,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,16_17,,4875.0,4875.0,...,3.5,,0.0,,29,18,29,,,137934
3,2022-01-01,region,E12000003,Yorkshire and The Humber,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,18_24,,281111.0,281111.0,...,54.8,,17.9,,69,35,69,,,513378
4,2022-01-01,region,E12000003,Yorkshire and The Humber,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,25_29,,242248.0,242248.0,...,58.8,,24.3,,26,29,26,,,412127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208084,2020-12-31,region,E12000008,South East,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,75+,,,,...,,,,,0,0,0,,,0
208085,2020-12-31,region,E12000008,South East,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,75_79,,155.0,155.0,...,0.0,,0.0,,2,582,2,,,434632
208086,2020-12-31,region,E12000008,South East,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,80_84,,121.0,121.0,...,0.0,,0.0,,23,1807,23,,,268600
208087,2020-12-31,region,E12000008,South East,vaccinationsAgeDemographics,Vaccinations age demographics breakdown,85_89,,102.0,102.0,...,0.1,,0.0,,36,1942,36,,,171018


### Save file as CSV

In [55]:
# The CSV is named after the raw metric stem. to_csv does not create missing
# folders, so make sure the output directory exists first (e.g. on a fresh
# clone where the empty folder is absent).
output_path = Path(rf"../4_integrated_csv_files/ukhsa_vaccination_data/{selected_metric}.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame's row index out of the file.
combined_df.to_csv(output_path, index=False)