### Country Vx Throughput Analysis - Data
 
**Note:** Gets data for the WHO/BMGF/Gavi Vx Throughput task force

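* Source: the `NCOV/VAC_REP_COUNTS` and `NCOV/VAC_REP_COUNTS_EUR` JSON endpoints (URLs are defined in the *Get Data* section)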

* Libraries: 
  - Python (pandas, PySpark, Delta Lake)

* Built by: Jeremy Cooper
* Current owner: Jeremy Cooper
* Initial Build Date: 06/14/2021
* Latest Build Date: 06/14/2021

### Environment Management

In [0]:
# Uncomment and run once to remove every widget currently defined on this notebook.
# dbutils.widgets.removeAll()

In [0]:
# Uncomment and run this cell once to create the notebook widgets. The
# "Initialize File Paths" cell reads Partner, Source and Dataset through
# dbutils.widgets.get(...), so those three must exist before it runs.

# # Dataset name: last segment of the raw/transformed storage paths
# # (originally also intended as the Metastore table name)
# dbutils.widgets.text("Dataset", "dataset_name")

# # Project name: intended as a folder name for transformed outputs
# # (not read by any cell visible in this notebook)
# dbutils.widgets.text("Project", "project_name")

# # Partner name: folder under /mnt/; should be consistent with the Blob Storage container
# dbutils.widgets.text("Partner","partner_name")

# # Source name: folder under raw/ and transformed/ that holds the dataset
# dbutils.widgets.text("Source","data_source")

# # iso_code is not read by any cell visible in this notebook
# dbutils.widgets.text("iso_code", "")

#### Notebook Setup

##### Import any libraries or nested notebooks

In [0]:
from urllib.request import Request, urlopen
import urllib
import json
import pandas as pd

from delta.tables import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

##### Initialize File Paths

In [0]:
# Widget values that decide where this dataset lives under the /mnt mount.
partner_name = dbutils.widgets.get("Partner")
source_name = dbutils.widgets.get("Source")
dataset_name = dbutils.widgets.get("Dataset")

# Common prefix and suffix shared by the raw and transformed locations.
storage_root = f"/mnt/{partner_name}/"
storage_branch = f"/{source_name}/{dataset_name}"

# Spark-style paths, plus their /dbfs-prefixed equivalents.
raw_storage_path = f"{storage_root}raw{storage_branch}"
dbfs_raw_storage_path = f"/dbfs{raw_storage_path}"

transformed_storage_path = f"{storage_root}transformed{storage_branch}"
dbfs_transformed_storage_path = f"/dbfs{transformed_storage_path}"

# Echo the resolved locations so the run log records them.
for resolved_path in (raw_storage_path, transformed_storage_path):
    print(resolved_path)

### Get Data

In [0]:
# Both feeds are served from the same host and differ only in the final path
# segment: url1 is the general feed, url2 the "_EUR" feed.
_VAC_REP_BASE = 'https://frontdoor-l4uikgap6gz3m.azurefd.net/NCOV/'
url1 = _VAC_REP_BASE + 'VAC_REP_COUNTS'
url2 = _VAC_REP_BASE + 'VAC_REP_COUNTS_EUR'

In [0]:
# Two data sources for throughput: one for Europe and one for the rest of the
# world (ROW). Each is read separately, cleaned, and then the two are bound
# together. This cell loads the feed behind url1.

# Close the HTTP response as soon as the payload has been parsed.
with urlopen(url1) as response:
    data_json = json.loads(response.read())

df1 = pd.DataFrame(data_json["value"])
df1 = df1[['COUNTRY_FK', 'AS_OF_DATE', 'DOSES_ADMINISTERED', 'PERSONS_VACCINATED_ONE_PLUS_DOSE', 'PERSONS_VACCINATED_FULL', 'PERSONS_BOOSTER_ADD_DOSE', 'SOURCE']]

# Stringify every value, exactly as the European feed is handled, so that
# missing values become the literal 'None' / 'nan' tokens that the filter below
# and the later cleaning step look for. Without this the 'None' comparison
# cannot match a real null.
df1 = df1.astype(str)

# Drop rows without a country; .copy() so the column assignment below works on
# an independent frame rather than a slice of the original.
df1 = df1[df1['COUNTRY_FK']!='None'].copy()
df1.columns = ['entity', 'date', 'total_doses', 'at_least_one_dose', 'fully_vaccinated', 'persons_booster_add_dose', 'source']

# Title-cased display name derived from the upper-case entity value.
df1['country_name'] = df1['entity'].str.title()
df1 = spark.createDataFrame(df1).drop_duplicates()

In [0]:
from urllib.error import HTTPError

# Load the European feed (url2) and bind it to the feed loaded into df1.
try:
  # Close the HTTP response as soon as the payload has been parsed.
  with urlopen(url2) as response:
    data_json = json.loads(response.read())
except HTTPError as e:
  # The body of the error response is parsed as the payload.
  # NOTE(review): presumably this endpoint can answer with an error status while
  # still returning the JSON document - confirm. A non-JSON error body will
  # raise a JSON decode error here.
  try:
    data_json = json.loads(e.read())
  finally:
    e.close()

df2 = pd.DataFrame(data_json["value"])
# Stringify every value so missing entries become the literal 'None' / 'nan'.
df2 = df2.astype(str)
df2 = df2[['COUNTRY_FK', 'AS_OF_DATE', 'DOSES_ADMINISTERED', 'PERSONS_VACCINATED_ONE_PLUS_DOSE', 'PERSONS_VACCINATED_FULL', 'PERSONS_BOOSTER_ADD_DOSE', 'SOURCE']]

# Drop rows without a country; .copy() so the column assignment below works on
# an independent frame rather than a slice of the original.
df2 = df2[df2['COUNTRY_FK']!='None'].copy()
df2.columns = ['entity', 'date', 'total_doses', 'at_least_one_dose', 'fully_vaccinated', 'persons_booster_add_dose', 'source']

# Title-cased display name derived from the upper-case entity value.
df2['country_name'] = df2['entity'].str.title()
df2 = spark.createDataFrame(df2).drop_duplicates()

# Both frames carry the same columns in the same order, so a positional union
# stacks them correctly.
df3 = df1.union(df2)

### Transformation

In [0]:
# Schema for the combined frame: every column is carried as a string.
# The field names are listed once and turned into StructFields in order.
schema1 = StructType([
  StructField(field_name, StringType())
  for field_name in (
    "entity",
    "date",
    "total_doses",
    "at_least_one_dose",
    "fully_vaccinated",
    "persons_booster_add_dose",
    "source",
    "country_name",
  )
])

In [0]:
# Rebuild the unioned frame against the all-string schema and drop exact duplicates.
df_data = spark.createDataFrame(df3.rdd, schema=schema1) \
  .drop_duplicates()

# Clean every column in one projection: trim whitespace and turn the textual
# missing-value tokens into real nulls. 'None' and 'nan' are what pandas
# produces when a frame is stringified; 'NaN' is how Spark renders a double NaN
# as a string (NOTE(review): confirm whether any such values reach this point).
null_tokens = ['None', 'nan', 'NaN']
col_names = df_data.schema.names
df_data = df_data.select([
    when(~col(name).isin(null_tokens), trim(col(name))).otherwise(None).alias(name)
    for name in col_names
])

# Manually fix known bad report dates: (entity, date as reported, corrected date).
# Entries are evaluated in order and the first match wins.
date_corrections = [
    ('ANGUILLA', '2001-02-26', '2021-02-26'),
    ('TRINIDAD AND TOBAGO', '2001-02-26', '2021-02-26'),
    ('PAKISTAN', '2010-03-05', '2021-03-05'),
    ('CANADA', '2020-01-29', '2021-01-29'),
    ('LIBYA', '1900-05-18', '2021-05-18'),
    ('LIBYA', '1900-05-20', '2021-05-20'),
    ('LIBYA', '1900-05-25', '2021-05-25'),
    ('LIBYA', '1900-05-26', '2021-05-26'),
    ('BRUNEI DARUSSALAM', '2021-03-20', '2021-04-20'),
    ('LIBYA', '2021-01-08', '2022-01-08'),
    ('MOROCCO', '2021-01-04', '2022-01-04'),
    ('MOROCCO', '2021-01-08', '2022-01-08'),
]

# Fold the table into a single when(...).when(...) expression.
corrected_date = None
for entity_name, reported_date, fixed_date in date_corrections:
    is_match = (col('entity') == entity_name) & (col('date') == reported_date)
    if corrected_date is None:
        corrected_date = when(is_match, fixed_date)
    else:
        corrected_date = corrected_date.when(is_match, fixed_date)

# Rows that match no correction keep their original date.
df_data = df_data.withColumn('date', corrected_date.otherwise(col('date')))

# date stamp dataset
df_data = df_data.withColumn("date_accessed", current_date())

display(df_data.orderBy(col('date').asc()))

entity,date,total_doses,at_least_one_dose,fully_vaccinated,persons_booster_add_dose,source,country_name,date_accessed
CHINA,2020-12-15,1500000.0,,,,,China,2022-05-09
SINGAPORE,2021-01-11,3400.0,,,,,Singapore,2022-05-09
MONACO,2021-01-18,2400.0,,,,Other sources,Monaco,2022-05-09
CHILE,2021-01-22,63047.0,54683.0,8364.0,,Ministerio de Salud.,Chile,2022-05-09
UNITED STATES OF AMERICA,2021-01-22,19000046.0,16243093.0,2756953.0,,https://covid.cdc.gov/covid-data-tracker/#vaccinations,United States Of America,2022-05-09
COSTA RICA,2021-01-22,29257.0,29125.0,132.0,,EPI Manager,Costa Rica,2022-05-09
ARGENTINA,2021-01-22,279602.0,254456.0,25146.0,,Ministerio de Salud.,Argentina,2022-05-09
CANADA,2021-01-22,769092.0,695746.0,73346.0,,https://covid19tracker.ca/vaccinationtracker.html,Canada,2022-05-09
BRAZIL,2021-01-22,193699.0,193699.0,,,https://www.gov.br/saude/pt-br https://coronavirusbra1.github.io/ https://graphics.reuters.com/world-coronavirus-tracker-and-maps/countries-and-territories/brazil/,Brazil,2022-05-09
MEXICO,2021-01-22,552335.0,534317.0,18018.0,,https://www.gob.mx/salud/prensa/269-continua-jornada-de-vacunacion-contra-covid-19-en-mexico?idiom=es https://www.gob.mx/salud/prensa/version-estenografica-conferencia-de-prensa-informe-diario-sobre-coronavirus-covid-19-en-mexico-261905,Mexico,2022-05-09


### Save to Azure Storage / Register in Databricks metastore

In [0]:
# The Delta folder sits next to the transformed path, with a ".delta" suffix.
delta_path = f"{transformed_storage_path}.delta"

# To start from an empty folder, remove it first:
# dbutils.fs.rm(delta_path, True)

# Replace the contents of the Delta folder with the current frame.
(
    df_data.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

In [0]:
# Show the Delta folder path built from the transformed storage path.
print(f"{transformed_storage_path}.delta")

In [0]:
# Re-register the metastore table on the Delta folder written above.
# The location comes from delta_path instead of a hardcoded copy of it, so the
# table always points at the folder this run actually wrote to.
analysis_table = "covax_supply_chain_analytics.analysis_vx_throughput_data"

spark.sql(f"DROP TABLE IF EXISTS {analysis_table}")
spark.sql(f"CREATE TABLE {analysis_table} USING DELTA LOCATION '{delta_path}'")

In [0]:
# Read the registered table back through the metastore and show it.
registered_rows = spark.sql("SELECT * FROM covax_supply_chain_analytics.analysis_vx_throughput_data")
display(registered_rows)

entity,date,total_doses,at_least_one_dose,fully_vaccinated,persons_booster_add_dose,source,country_name,date_accessed
SERBIA,2021-03-24,2241510,1338010.0,,,Other sources,Serbia,2022-05-09
NETHERLANDS,2022-01-09,31341750,13338035.0,11405719.0,,Reported to WHO,Netherlands,2022-05-09
KYRGYZSTAN,2022-02-27,2535119,1337815.0,1126345.0,70959.0,Reported to WHO,Kyrgyzstan,2022-05-09
CYPRUS,2022-03-20,1730017,657318.0,656339.0,440227.0,Reported to WHO,Cyprus,2022-05-09
GEORGIA,2022-03-06,2617506,1264008.0,1146994.0,206504.0,Reported to WHO,Georgia,2022-05-09
IRELAND,2021-11-21,7977391,3847062.0,3544763.0,,Reported to WHO,Ireland,2022-05-09
CROATIA,2022-01-02,4733233,2258321.0,1954253.0,,Reported to WHO,Croatia,2022-05-09
NORTH MACEDONIA,2021-02-16,940,940.0,0.0,,Reported to WHO,North Macedonia,2022-05-09
MALTA,2021-05-02,320764,209610.0,111154.0,,Reported to WHO,Malta,2022-05-09
IRELAND,2021-05-16,1970088,1436255.0,533833.0,,Reported to WHO,Ireland,2022-05-09


##### Query Delta Log

In [0]:
# Show the Delta transaction log for the folder this notebook writes to.
# delta_path is used instead of a hardcoded copy so the history always matches
# the location of the write above.
display(
  spark.sql(f"DESCRIBE HISTORY delta.`{delta_path}`")
)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
26,2022-05-09T17:15:13.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,25,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 13158, numOutputBytes -> 381920)",
25,2022-05-03T17:01:27.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,24,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 13000, numOutputBytes -> 376656)",
24,2022-05-03T16:02:37.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,23,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 13000, numOutputBytes -> 375684)",
23,2022-04-26T03:04:12.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,22,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 12606, numOutputBytes -> 364355)",
22,2022-04-18T21:50:05.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,21,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 12493, numOutputBytes -> 359528)",
21,2022-04-11T21:19:56.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,20,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 12310, numOutputBytes -> 352983)",
20,2022-04-04T19:52:05.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,19,WriteSerializable,False,"Map(numFiles -> 1, numOutputBytes -> 342926, numOutputRows -> 12062)",
19,2022-03-28T20:46:26.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,18,WriteSerializable,False,"Map(numFiles -> 1, numOutputBytes -> 343759, numOutputRows -> 12053)",
18,2022-03-21T17:44:30.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,17,WriteSerializable,False,"Map(numFiles -> 1, numOutputBytes -> 332766, numOutputRows -> 11774)",
17,2022-03-14T23:58:45.000+0000,6136552160696939,jeremy.cooper@gatesfoundation.org,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(1884626790114412),1112-212424-shuwbub0,16,WriteSerializable,False,"Map(numFiles -> 1, numOutputBytes -> 320596, numOutputRows -> 11442)",


### Appendix