# UKHSA Cases Data (https://ukhsa-dashboard.data.gov.uk/covid-19-archive-data-download)
## Data Extraction and Creation of Data Structure for Final Cleaning
### Imports

In [1]:
import pandas as pd
from pathlib import Path
import re
import os
from IPython.display import display, HTML

### Function to stratify all available files/metrics

#### From looking at the directory there are 1178 files spanning 3 years!

##### I need to come up with a way of grouping by the metric and then combining the dataset across the available years so that we can further analyse the data I manually merged all of the data from the zip online (https://ukhsa-dashboard.data.gov.uk/covid-19-archive-data-download) and restructured it for easier sorting/searching/combining

In [2]:
def split_by_case(word):
    return re.findall(r'[A-Z][a-z]*', word)

In [3]:
path_for_vaccine_datasets = '1_raw_databases/ukhsa_cases_data'

directory_path = Path(path_for_vaccine_datasets)

if os.path.isdir(directory_path):
        if os.listdir(directory_path):
            file_list = [f.name for f in directory_path.iterdir() if f.is_file()]

            file_paths = [f'{path_for_vaccine_datasets}/{file}' for file in file_list if file[-4:] == '.csv']
            
            available_data = []
            
            for i, file in enumerate(file_list):
                if file[:-9] not in available_data:
                    available_data.append(file[:-9])
            
            metrics = []
            
            for data in available_data:
                if data not in metrics:
                    ending = data.split('_')[1]
                    age = ""
                    data = data[0].upper() + data[1:]
                    words = split_by_case(data)
                    if words[0] == 'Cum':
                        words[0] = 'Cumulative'
                    metric = ' '.join(map(str, words)) + " " + age + ending.capitalize()
                    metric = metric.replace("L F D", "LFD").replace("P C R", "PCR")
                    metrics.append(metric)
            
            metrics_dictionary = dict(zip(metrics, available_data))
            
            metrics_dictionary = dict(sorted(metrics_dictionary.items()))
            
        if not os.listdir(directory_path):
                google_drive_url = 'https://drive.google.com/file/d/15zuegbX43NG22VrHp8Wz6RC0ohlbZJ19/view?usp=sharing'
                display(HTML(f'<a href="{google_drive_url}" target="_blank">Click here to download the data from Google Drive</a>'))
                print("Click the above link to download the data from Google Drive and save in the directory: report/visualisation_and_analyses/1_data_cleaning_preprocessing/1_api_connectors_and_csv_parsers/1_raw_databases/ukhsa_cases_data")

### List of all metrics available

In [4]:
for i, key in enumerate(metrics_dictionary.keys()):
    print(f"{i + 1}: {key}. ")

1: Change In New Cases By Specimen Date Ltla. 
2: Change In New Cases By Specimen Date Nation. 
3: Change In New Cases By Specimen Date Region. 
4: Change In New Cases By Specimen Date Utla. 
5: Cumulative Cases By Publish Date Ltla. 
6: Cumulative Cases By Publish Date Nation. 
7: Cumulative Cases By Publish Date Overview. 
8: Cumulative Cases By Publish Date Rate Ltla. 
9: Cumulative Cases By Publish Date Rate Nation. 
10: Cumulative Cases By Publish Date Rate Overview. 
11: Cumulative Cases By Publish Date Rate Region. 
12: Cumulative Cases By Publish Date Rate Utla. 
13: Cumulative Cases By Publish Date Region. 
14: Cumulative Cases By Publish Date Utla. 
15: Cumulative Cases By Specimen Date Ltla. 
16: Cumulative Cases By Specimen Date Nation. 
17: Cumulative Cases By Specimen Date Overview. 
18: Cumulative Cases By Specimen Date Rate Ltla. 
19: Cumulative Cases By Specimen Date Rate Nation. 
20: Cumulative Cases By Specimen Date Rate Overview. 
21: Cumulative Cases By Specimen Da

### List of the regional metrics available

In [5]:
for i, key in enumerate(metrics_dictionary.keys()):
    if "Region" in key:
        print(f"{i + 1}: {key}. ")

3: Change In New Cases By Specimen Date Region. 
11: Cumulative Cases By Publish Date Rate Region. 
13: Cumulative Cases By Publish Date Region. 
21: Cumulative Cases By Specimen Date Rate Region. 
23: Cumulative Cases By Specimen Date Region. 
27: Cumulative Cases LFD Confirmed PCR By Specimen Date Region. 
31: Cumulative Cases LFD Only By Specimen Date Region. 
35: Cumulative Cases PCR Only By Specimen Date Region. 
41: Cumulative Cases Pillar One By Specimen Date Rate Region. 
43: Cumulative Cases Pillar One By Specimen Date Region. 
49: Cumulative Cases Pillar Two By Specimen Date Rate Region. 
51: Cumulative Cases Pillar Two By Specimen Date Region. 
59: Cumulative First Episodes By Specimen Date Rate Region. 
61: Cumulative First Episodes By Specimen Date Region. 
69: Cumulative Reinfections By Specimen Date Rate Region. 
71: Cumulative Reinfections By Specimen Date Region. 
74: Female Cases Region. 
76: Male Cases Region. 
83: New Cases By Publish Date Change Percentage Region. 

### List of the national metrics available

In [6]:
for i, key in enumerate(metrics_dictionary.keys()):
    if "Nation" in key:
        print(f"{i + 1}: {key}. ")

2: Change In New Cases By Specimen Date Nation. 
6: Cumulative Cases By Publish Date Nation. 
9: Cumulative Cases By Publish Date Rate Nation. 
16: Cumulative Cases By Specimen Date Nation. 
19: Cumulative Cases By Specimen Date Rate Nation. 
26: Cumulative Cases LFD Confirmed PCR By Specimen Date Nation. 
30: Cumulative Cases LFD Only By Specimen Date Nation. 
34: Cumulative Cases PCR Only By Specimen Date Nation. 
38: Cumulative Cases Pillar One By Specimen Date Nation. 
40: Cumulative Cases Pillar One By Specimen Date Rate Nation. 
46: Cumulative Cases Pillar Two By Specimen Date Nation. 
48: Cumulative Cases Pillar Two By Specimen Date Rate Nation. 
54: Cumulative First Episodes By Specimen Date Nation. 
57: Cumulative First Episodes By Specimen Date Rate Nation. 
64: Cumulative Reinfections By Specimen Date Nation. 
67: Cumulative Reinfections By Specimen Date Rate Nation. 
73: Female Cases Nation. 
75: Male Cases Nation. 
78: New Cases By Publish Date Change Nation. 
81: New Case

### Select a statistic to combine data across years into a dataframe for further analysis/save as csv

#### Change the number below to generate a dataframe of your desired metric which can be saved as a csv

In [7]:
# Metrics selected for cleaning & processing: 135, 139, 99, 149, 161, 173, 233, 267, 75

In [8]:
selected_data = 135

In [9]:
keys = list(metrics_dictionary.keys())

selected_key = keys[selected_data - 1]

selected_metric = metrics_dictionary[selected_key]

In [10]:
available_files_for_metric = []

for file in file_paths:
    if selected_metric in file:
        available_files_for_metric.append(file)

In [12]:
dataframes = []

for file in available_files_for_metric:
    print(file)
    df = pd.read_csv(file)
    dataframes.append(df)

1_raw_databases/ukhsa_cases_data/femaleCases_region_2022.csv
1_raw_databases/ukhsa_cases_data/femaleCases_region_2023.csv
1_raw_databases/ukhsa_cases_data/femaleCases_region_2021.csv
1_raw_databases/ukhsa_cases_data/femaleCases_region_2020.csv


In [14]:
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df

Unnamed: 0,date,area_type,area_code,area_name,metric,metric_name,value
0,2022-01-01,region,E12000005,West Midlands,femaleCases,Female cases,11465
1,2022-01-01,region,E12000005,West Midlands,femaleCases,Female cases,14646
2,2022-01-01,region,E12000005,West Midlands,femaleCases,Female cases,23957
3,2022-01-01,region,E12000005,West Midlands,femaleCases,Female cases,56532
4,2022-01-01,region,E12000005,West Midlands,femaleCases,Female cases,57590
...,...,...,...,...,...,...,...
220585,2020-12-31,region,E12000003,Yorkshire and The Humber,femaleCases,Female cases,12510
220586,2020-12-31,region,E12000003,Yorkshire and The Humber,femaleCases,Female cases,6825
220587,2020-12-31,region,E12000003,Yorkshire and The Humber,femaleCases,Female cases,3908
220588,2020-12-31,region,E12000003,Yorkshire and The Humber,femaleCases,Female cases,3071


### Save file as CSV

In [15]:
combined_df.to_csv(f"2_processed_databases/ukhsa_cases_data/{selected_metric}.csv", index=False)