## Obtaining COVID data from UKHSA API

### Imports

#### Here I define the imports used in the project. `time` is used to pause between requests, because my initial requests were being throttled by the server:

In [1]:
import requests as req
import pandas as pd
import os
from dotenv import load_dotenv
import mysql.connector
from sqlalchemy import create_engine
import time 

## The UKHSA database is exposed through a REST API; the subset of available endpoints can be retrieved at each further step along the chain of the API request

#### Firstly I get the geography types available for COVID-19 data, storing the geography types in a list:

In [2]:
def get_geography_types():
    """Return the names of the geography types the API lists for COVID-19.

    Sends a single GET request to the ``geography_types`` endpoint and
    collects the ``name`` field of every entry in the JSON response, in the
    order the API returns them.
    """
    endpoint = (
        'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/'
        'sub_themes/respiratory/topics/COVID-19/geography_types'
    )
    response = req.get(endpoint)

    return [entry['name'] for entry in response.json()]

# Fetched once and kept at module level: the later cells read this list
# directly and index into it by position.
geography_types = get_geography_types()

## For each geography_type I call the API to retrieve the respective geographies, storing the results in a list:

In [3]:
def get_geographies():
    """Return the geography names of every geography type as a list of lists.

    Reads the module-level ``geography_types`` list and sends one GET request
    per type to that type's ``geographies`` endpoint. The result contains one
    inner list of geography names per geography type, in the same order as
    ``geography_types``, so ``result[i]`` holds the geographies that belong
    to ``geography_types[i]``.
    """
    base_url = (
        'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/'
        'sub_themes/respiratory/topics/COVID-19/geography_types/'
    )

    geographies_by_type = []

    for geography_type_name in geography_types:
        geographies_url = base_url + geography_type_name + '/geographies'
        geographies_request = req.get(geographies_url)

        # Build a fresh inner list for this type and append it to the outer
        # accumulator; the two lists must be distinct objects, otherwise the
        # per-type grouping is lost.
        geographies_by_type.append(
            [geographies_datum['name'] for geographies_datum in geographies_request.json()]
        )

    return geographies_by_type

# Later cells read geographies[i] alongside geography_types[i], i.e. they
# expect this to be indexed in parallel with geography_types.
geographies = get_geographies()

#### I get the list of available metrics for each geography within each geography type - storing the results as a list:

In [4]:
# For each geography type, gather the distinct metric names reported by any
# of its geographies. One list of metric names is appended per geography
# type, in the same order as geography_types.
metric_for_geographies = []

for type_index, geography_type in enumerate(geography_types):
    unique_metrics = set()

    for geography in geographies[type_index]:
        metrics_url = (
            'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/'
            'sub_themes/respiratory/topics/COVID-19/geography_types/'
            f'{geography_type}/geographies/{geography}/metrics'
        )

        response = req.get(metrics_url)
        # Pause after every request so the API is not hit too quickly.
        time.sleep(0.5)

        unique_metrics.update(entry['name'] for entry in response.json())

    metric_for_geographies.append(list(unique_metrics))

## Storing the data

#### The above lists are iterated over in order to form the complete URLs used to obtain the available data, and the output is stored in an SQL database using SQLAlchemy.

### First I set up the database (if it is not in existence):

#### Loading the environment variables:


In [5]:
# Load the variables defined in the project's .env file into the process
# environment so that os.getenv can read them in the next cell.
load_dotenv() 

True

#### Assign the environment variables to variables for use in database connection:


In [6]:
# MySQL connection settings, read from the process environment. A setting
# whose variable is not defined comes back as None.
host, user, password = (
    os.environ.get(variable_name)
    for variable_name in ('SQL-HOST', 'SQL-USER', 'SQL-PASSWORD')
)

# Name of the database that the following cells create and connect to.
sql_database = 'Ukhsa_covid_data'


#### Create database if it does not exist:

In [7]:
# Connect to the MySQL server without selecting a database, so that the
# target database can be created if it does not exist yet.
connection = mysql.connector.connect(
    host=host,
    user=user,
    password=password
)

# try/finally guarantees that the cursor and the connection are released
# even if the statement or the commit raises.
try:
    cur = connection.cursor()
    try:
        # sql_database is a constant defined in this notebook, not user input.
        cur.execute(f"CREATE DATABASE IF NOT EXISTS {sql_database}")
        connection.commit()
    finally:
        cur.close()
finally:
    connection.close()

#### Create SQL Alchemy Connection

In [8]:
from urllib.parse import quote

# Percent-encode the credentials before placing them in the URL: characters
# such as '@', '/' or ':' in a user name or password would otherwise be read
# as URL delimiters and the connection string would be parsed incorrectly.
# str() keeps the formatting of the plain f-string for any non-string value.
engine = create_engine(
    f'mysql+mysqlconnector://{quote(str(user), safe="")}:{quote(str(password), safe="")}'
    f'@{host}/{sql_database}'
)

#### Iterating over the metrics for each geography (within the different geography types) to obtain a list of URLs for each metric, and creating a list of associated table names. The limit on table-name length in MySQL is accounted for by replacing lower_tier_local_authority and upper_tier_local_authority with initialisms.

In [223]:
# urls[k] is the list of request URLs (one per geography) for the k-th
# (geography type, metric) pair, and parameters[k] is the table name that
# goes with it. Both lists are filled in the same order.
urls = []

parameters = []

# Long geography-type names are shortened to initialisms to keep the
# generated table names short.
geography_type_initialisms = {
    'lower_tier_local_authority': 'ltla',
    'upper_tier_local_authority': 'utla',
}

for i, metrics in enumerate(metric_for_geographies):
    # The formatted geography type depends only on i, so it is computed once
    # per geography type rather than once per metric.
    formatted_geography_type = geography_types[i].lower().replace(' ', '_')
    formatted_geography_type = geography_type_initialisms.get(
        formatted_geography_type, formatted_geography_type
    )

    for metric in metrics:
        parameters.append(f'{formatted_geography_type}_{metric.lower().replace(" ", "_")}')

        # One URL per geography of this geography type, all for this metric.
        urls.append([
            f'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/{geography_types[i]}/geographies/{geography}/metrics/{metric}'
            for geography in geographies[i]
        ])

### Exploring the urls contained within the first metric

In [226]:
urls[0]

['https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/Government Office Region/geographies/East Midlands/metrics/COVID-19_deaths_ONSByDay',
 'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/Government Office Region/geographies/East of England/metrics/COVID-19_deaths_ONSByDay',
 'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/Government Office Region/geographies/London/metrics/COVID-19_deaths_ONSByDay',
 'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/Government Office Region/geographies/North East/metrics/COVID-19_deaths_ONSByDay',
 'https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/Government Office Region/geographies/North West/m

### Testing out obtaining the data for the first metric from all regions, accounting for pagination of the API 

#### Note: the request for one parameter can take an incredibly long time as the API shuts off if too many requests are sent to it per second

#### I first create a dataframe for all the data available, accounting for pagination and concatenating data for the pagination and region

#### Note: I have commented out the following code to show my process when working out how to structure my query and store the data

In [236]:
parameter = parameters[0]

# One dataframe per successfully fetched page; they are concatenated once at
# the end instead of re-copying the accumulated data on every page.
page_frames = []

for url in urls[0]:
    url_frames = []
    next_url = url

    # Follow the 'next' links until the API reports that there are no more pages.
    while next_url is not None:
        request = req.get(next_url)
        # Pause after every request so the API is not hit too quickly.
        time.sleep(0.5)

        if request.status_code != 200:
            # Report the failure and drop this URL's pages: the data for a
            # URL is kept only when every one of its pages was retrieved.
            print(request.status_code)
            print(request.text)
            break

        # Parse the response body once and reuse it for both fields.
        payload = request.json()
        url_frames.append(pd.DataFrame(payload['results']))
        next_url = payload['next']
    else:
        # Runs only when the loop finished without a break, i.e. all pages
        # for this URL were fetched.
        page_frames.extend(url_frames)

# pd.concat raises on an empty list, so fall back to an empty dataframe when
# nothing could be fetched.
url_final_dataframe = pd.concat(page_frames) if page_frames else pd.DataFrame()

url_final_dataframe


{"count":1444,"next":"https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory/topics/COVID-19/geography_types/Government%20Office%20Region/geographies/East%20Midlands/metrics/COVID-19_deaths_ONSByDay?page=2","previous":null,"results":[{"theme":"infectious_disease","sub_theme":"respiratory","topic":"COVID-19","geography_type":"Government Office Region","geography":"East Midlands","geography_code":"E12000004","metric":"COVID-19_deaths_ONSByDay","metric_group":"deaths","stratum":"default","sex":"all","age":"all","year":2020,"month":1,"epiweek":5,"date":"2020-01-30","metric_value":0.0,"in_reporting_delay_period":false},{"theme":"infectious_disease","sub_theme":"respiratory","topic":"COVID-19","geography_type":"Government Office Region","geography":"East Midlands","geography_code":"E12000004","metric":"COVID-19_deaths_ONSByDay","metric_group":"deaths","stratum":"default","sex":"all","age":"all","year":2020,"month":1,"epiweek":5,"date":"2020-01-31","metric_va

#### Exploring the final dataframe for that url group:

In [244]:
# url_final_dataframe

Unnamed: 0,theme,sub_theme,topic,geography_type,geography,geography_code,metric,metric_group,stratum,sex,age,year,month,epiweek,date,metric_value,in_reporting_delay_period
0,infectious_disease,respiratory,COVID-19,Government Office Region,East Midlands,E12000004,COVID-19_deaths_ONSByDay,deaths,default,all,all,2020,1,5,2020-01-30,0.0,False
1,infectious_disease,respiratory,COVID-19,Government Office Region,East Midlands,E12000004,COVID-19_deaths_ONSByDay,deaths,default,all,all,2020,1,5,2020-01-31,0.0,False
2,infectious_disease,respiratory,COVID-19,Government Office Region,East Midlands,E12000004,COVID-19_deaths_ONSByDay,deaths,default,all,all,2020,2,5,2020-02-01,0.0,False
3,infectious_disease,respiratory,COVID-19,Government Office Region,East Midlands,E12000004,COVID-19_deaths_ONSByDay,deaths,default,all,all,2020,2,5,2020-02-02,0.0,False
4,infectious_disease,respiratory,COVID-19,Government Office Region,East Midlands,E12000004,COVID-19_deaths_ONSByDay,deaths,default,all,all,2020,2,6,2020-02-03,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,infectious_disease,respiratory,COVID-19,Government Office Region,Yorkshire and The Humber,E12000003,COVID-19_deaths_ONSByDay,deaths,default,all,all,2024,1,2,2024-01-08,1.0,False
0,infectious_disease,respiratory,COVID-19,Government Office Region,Yorkshire and The Humber,E12000003,COVID-19_deaths_ONSByDay,deaths,default,all,all,2024,1,2,2024-01-09,2.0,False
1,infectious_disease,respiratory,COVID-19,Government Office Region,Yorkshire and The Humber,E12000003,COVID-19_deaths_ONSByDay,deaths,default,all,all,2024,1,2,2024-01-10,1.0,False
2,infectious_disease,respiratory,COVID-19,Government Office Region,Yorkshire and The Humber,E12000003,COVID-19_deaths_ONSByDay,deaths,default,all,all,2024,1,2,2024-01-11,0.0,False


#### Writing the dataframe to mysql

In [240]:
# url_final_dataframe.to_sql(name=parameters[0].replace('-', '_'), con=engine, if_exists='replace', index=False)

12996

In [None]:
# Save the combined dataframe as a CSV file; index=False leaves the
# dataframe index out of the file.
url_final_dataframe.to_csv(r'../4_integrated_csv_files/government_office_region_covid_19_deaths_onsbyday.csv', index=False)

## Once I had confirmed that the data had been successfully written to the MySQL database, I produced a script that iterates over the entire list of parameters and creates a table for each parameter within the API. There is so much data that it takes hours to run and retrieve everything from the API.

## The code below was not needed because, based on the analysis, we only want the morbidity data for the region. I therefore save the region data directly to CSV for cleaning. However, I leave it in to show how I would scrape an API to retrieve all of the data and save it to an SQL database.

In [None]:
# for i, parameter in enumerate(parameters):
#     database_name = parameters[i].replace('-', '_')
#     url_final_dataframe = pd.DataFrame()
    
#     for url in urls[i]:
#         request = req.get(url)
#         time.sleep(0.5)
    
#         if request.status_code == 200:
#             url_dataframe = pd.DataFrame(request.json()['results'])
        
#             while request.json()['next'] is not None:
#                 url = request.json()['next']
#                 request = req.get(url)
#                 new_data_frame = pd.DataFrame(request.json()['results'])
#                 url_dataframe = pd.concat([url_dataframe, new_data_frame])
#                 time.sleep(0.5)
            
#             url_final_dataframe = pd.concat([url_final_dataframe, url_dataframe])
            
#         else:
#             print(request.status_code)
#             print(request.text)
        
#     url_final_dataframe.to_sql(name=database_name, con=engine, if_exists='replace', index=False)