### Fetch time series data from GitHub and update the BigQuery table
- Github repository https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
- Johns Hopkins dashboard https://www.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6
- Always fetch the latest CSV files (data up to yesterday)

In [None]:
from datetime import timedelta, date, datetime
from collections import Counter
import pandas as pd, numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import storage

### First, set up the BigQuery credentials
https://googleapis.dev/python/bigquery/latest/index.html
- Create a Google Cloud project
- Create a BigQuery dataset in this project
- Create a table called "time_series" in this dataset

In [None]:
# Placeholders: replace these with the values of your own Google Cloud setup.
cred_json = 'Directory of google api credential json'
project_id = 'Google project id'
project_name = 'Google project name'
dataset = 'name of the corresponding Bigquery dataset'

# Load the service-account key file and open a BigQuery client that
# authenticates with it against the given project.
credentials = service_account.Credentials.from_service_account_file(cred_json)
client = bigquery.Client(
    project=project_id,
    credentials=credentials,
)

### Fetch time series data and process it

In [None]:
# Raw-file root of the CSSE COVID-19 time series folder on GitHub.
base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'

# One CSV per case status, each read straight from its URL into a DataFrame.
df_confirmed = pd.read_csv(f'{base_url}time_series_covid19_confirmed_global.csv')
df_death = pd.read_csv(f'{base_url}time_series_covid19_deaths_global.csv')
df_recovered = pd.read_csv(f'{base_url}time_series_covid19_recovered_global.csv')

In [None]:
# Tag each table with its case status, then stack the three tables into one.
df_confirmed['Status'] = 'Confirmed'
df_death['Status'] = 'Deaths'
df_recovered['Status'] = 'Recovered'
df = pd.concat([df_confirmed, df_death, df_recovered], axis=0)

# Check duplicates: each (country, province, status) key should occur exactly
# once, so the two printed numbers are expected to be equal.
# groupby silently drops rows whose key contains NaN, so a missing
# 'Province/State' is replaced by '' first; otherwise those rows would never
# be counted and the numbers could not match.
key_cols = ['Country/Region', 'Province/State', 'Status']
dg = df.fillna({'Province/State': ''}).groupby(key_cols)
print(len(df), len(dg))

### Query the most recent update in the BigQuery table

In [None]:
# Fetch the single most recent row already stored in the time_series table.
# The table is addressed by its fully-qualified `project.dataset.table` id,
# built from project_id and dataset; backticks are needed because project ids
# may contain hyphens.
# Rows are ordered by Date because that is the column the date-range step
# reads from df_last.
# NOTE(review): the original query ordered by Last_Update; confirm against the
# table schema that Date is the intended sort key.
QUERY = (
    f"SELECT * FROM `{project_id}.{dataset}.time_series` "
    "ORDER BY Date DESC LIMIT 1"
)
query_job = client.query(QUERY)
df_last = query_job.to_dataframe()
df_last

In [None]:
# Create list of dates to be updated: every day from the one after the latest
# stored date up to and including yesterday.
start_dt = df_last['Date'].iloc[0] + timedelta(1)
end_dt = date.today() - timedelta(1)

# The labels use month/day without zero padding plus a two-digit year
# (e.g. "3/9/20"), because they are later used to select columns of the
# source table. They are assembled by hand instead of with "%-m/%-d": the
# "-" flag is a platform-specific strftime extension and is rejected on
# Windows.
list_dates = [
    f"{dt.month}/{dt.day}/{dt.strftime('%y')}"
    for dt in pd.date_range(start_dt, end_dt)
]
list_dates

In [None]:
# Reshape the source table from one column per date into one record per
# (location, status, date) for the dates that still have to be loaded.
# Sets `isupdated` (True when there is nothing to load) and, otherwise, `dh`
# with columns: Province/State, Country/Region, Lat, Long, Status, Date, Number.
id_cols = ['Province/State', 'Country/Region', 'Lat', 'Long', 'Status']

# Only dates that exist as columns can be loaded. A requested date may be
# absent when the source files have not been extended to that day yet.
# NOTE(review): assumes missing dates are always the trailing ones, so they
# are picked up on a later run -- confirm.
new_dates = [day for day in list_dates if day in df.columns]
skipped = [day for day in list_dates if day not in df.columns]
if skipped:
    print('No column for these dates in the source data, skipped:', skipped)

isupdated = not new_dates
if new_dates:
    # A single melt replaces building one tiny DataFrame per (row, date).
    dh = df.melt(
        id_vars=id_cols,
        value_vars=new_dates,
        var_name='Date',
        value_name='Number',
    )
    # Parse each date label once and map it onto the whole column.
    parsed = {day: datetime.strptime(day, "%m/%d/%y") for day in new_dates}
    dh['Date'] = dh['Date'].map(parsed)
    # remove empty records
    dh = dh.dropna(subset=['Number'])
elif not list_dates:
    print('Time series table already up to date')

### Append new data to the BigQuery table

In [None]:
# Fully-qualified id of the destination table: <project>.<dataset>.time_series
table_id = f'{project_id}.{dataset}.time_series'
# Look the table up through the client; the result is used when inserting rows.
table = client.get_table(table_id)

In [None]:
# Upload the new records, unless the table was found to be up to date.
if not isupdated:
    # Dates are sent as strings and counts as floats.
    dh = dh.astype({'Date': str, 'Number': float})
    # Anything that is not a string (i.e. a missing province) becomes None.
    dh['Province/State'] = dh['Province/State'].apply(
        lambda x: x if isinstance(x, str) else None
    )
    # One plain tuple per record, in column order.
    rows_to_insert = list(dh.itertuples(index=False, name=None))

    if rows_to_insert:
        errors = client.insert_rows(table, rows_to_insert)  # Make an API request.
        if errors == []:
            print("New rows have been added.")
        else:
            # Report failures instead of dropping them silently.
            print("Encountered errors while inserting rows:", errors)
    else:
        print("No new rows to insert.")