## Overview

#### This note book is a pipline connecting to the git hub repository: https://github.com/dsfsi/covid19africa and extrating the following data related to the corona virus

1. Number of tests performed daily 
2. The Provincial case breakdown for South Africa

### Licence

\https://github.com/dsfsi/covid19za/blob/master/data/LICENSE.md

#### Import Database Structure
This is the SQL database structure used


In [27]:
from database import model as m
from datetime import datetime, timedelta
from sqlalchemy import desc
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
import boto3
import base64
from botocore.exceptions import ClientError
import json

In [11]:
def clean_tests(df):

    def error(x):
        if pd.isnull(x):
            return 0
        else:
            return x
        
    df["date"] = df["date"].apply(lambda x:pd.to_datetime(x,dayfirst= True))
    df["cumulative_tests"] = df.cumulative_tests.apply(lambda x: error(x))
    
    return df

In [12]:
def clean_provincial_data(df):

    def error(x):
        if pd.isnull(x):
            return 0
        else:
            return x

    df = df.melt(id_vars = ["date","YYYYMMDD"])
    df["date"] = df["date"].apply(lambda x: pd.to_datetime(x,dayfirst= True))
    df = df[df.variable != "total"]
    df["value"] = df.value.apply(lambda x: error(x))
    
    return df


In [13]:
def get_max_date(Table):
    
    max_date = session.query(Table).order_by(desc('date')).first()
    session.commit()
    
    return max_date.date

In [14]:
def delete_old_data(max_date,Table):

        session.query(Table).filter(Table.date >=  (max_date  - timedelta(days=4))).delete()
        session.commit()

In [15]:
def filter_new_data(df,max_date):
    
    df =df[df["date"] >= max_date - timedelta(days=4)].reset_index()
    
    return df    

In [16]:
def test_upload(upload_country,df):
    
    
    for i in range(len(df)):
    
        ## Check Country exisits
        Country = session.query(m.Country).filter(m.Country.country == upload_country).first()
        if Country is None:
            Print("Country Not Found")
            


        Tests = m.Tests( 
                            date = df["date"][i],
                            cumulative_tests = int(df["cumulative_tests"][i]),
                            country_id = Country.id
        )
    
        session.add(Tests)

    session.commit()
    session.close()

In [17]:
def provincial_upload(df,upload_country,location_level):
    
    Upload_Country_id = session.query(m.Country).filter(m.Country.country == upload_country).first().id

    for i in range(len(df)):

        ## Check Country exisits
        Location = session.query(m.Location).filter(m.Location.location == df["variable"][i]).first()
        if Location is None:
            Location = m.Location(country_id = Upload_Country_id,
                                  location = df["variable"][i],
                                  location_level = location_level)
            session.add(Location)
            session.commit()


        CasesLocal = m.CasesLocal( 
                            date = df["date"][i],
                            confirmed = int(df["value"][i]),
                            location_id = Location.id
        )

        session.add(CasesLocal)

    session.commit()
    session.close()
    

In [32]:
def get_secret():
    secret_name = "SecretCorona"
    region_name = "eu-west-1"
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )
    # In this sample we only handle the specific exceptions for the 'GetSecretValue' API.
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    # We rethrow the exception by default.
    get_secret_value_response = client.get_secret_value(
        SecretId=secret_name
    )
    return get_secret_value_response

secret = json.loads(get_secret()["SecretString"])

engine = create_engine(
    'mssql+pymssql://' +
    secret['username'] + ':' + secret['password'] + '@' + secret['host'] + ':' +
    str(secret['port']) + '/Corona'

)

session = sessionmaker()(bind=engine)

In [33]:
def main():



    
    
    df_tests = pd.read_csv("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv")
    df_provices = pd.read_csv("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv")
    
    df_tests= clean_tests(df_tests)
    df_provices = clean_provincial_data(df_provices)
    
    max_date_tests = get_max_date(m.Tests)
    max_date_cases_local = get_max_date(m.CasesLocal)
    
    df_provices = filter_new_data(df_provices,max_date_tests)
    df_tests = filter_new_data(df_tests,max_date_tests)
    
    delete_old_data(max_date_tests,m.Tests)
    delete_old_data(max_date_cases_local,m.CasesLocal)
    
    test_upload("South Africa",df_tests)
    provincial_upload(df_provices,"South Africa","Provincial")

    

    
    
    
    
    

In [34]:
main()

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  This is separate from the ipykernel package so we can avoid doing imports until
