## Overview

#### This note book is a pipline connecting to the git hub repository: https://github.com/CSSEGISandData/COVID-19 and extrating the following data related to the corona virus

1. Country Level Confirmed Cases Per Day
2. Country Level Deaths Per Day

### Terms of use:

This GitHub repo and its contents herein, including all data, mapping, and analysis, copyright 2020 Johns Hopkins University, all rights reserved, is provided to the public strictly for educational and academic research purposes. The Website relies upon publicly available data from multiple sources, that do not always agree. The Johns Hopkins University hereby disclaims any and all representations and warranties with respect to the Website, including accuracy, fitness for use, and merchantability. Reliance on the Website for medical guidance or use of the Website in commerce is strictly prohibited.

#### Import Database Structure
This is the SQL database structure used


In [2]:
from database import model as m
from database import model as m
from datetime import datetime, timedelta
from sqlalchemy import desc
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
import boto3
import base64
from botocore.exceptions import ClientError
import json

In [3]:
def get_secret():
    secret_name = "SecretCorona"
    region_name = "eu-west-1"
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )
    # In this sample we only handle the specific exceptions for the 'GetSecretValue' API.
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    # We rethrow the exception by default.
    get_secret_value_response = client.get_secret_value(
        SecretId=secret_name
    )
    return get_secret_value_response

secret = json.loads(get_secret()["SecretString"])

engine = create_engine(
    'mssql+pymssql://' +
    secret['username'] + ':' + secret['password'] + '@' + secret['host'] + ':' +
    str(secret['port']) + '/Corona'

)

session = sessionmaker()(bind=engine)

##### Confirmed_Cases

In [4]:
df_C = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")

##### Deaths

In [5]:
df_D = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")

##### Recovered

In [6]:
df_R = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv")

#### Clean Data

##### Clean Confirmed Cases

In [7]:
df_C = df_C.melt(id_vars = ["Country/Region","Lat","Long","Province/State"])

df_C_country_level = df_C.groupby(["Country/Region","variable"]).agg({"Lat":"mean","Long":"mean","value":"sum"})

df_C_country_level = df_C_country_level.reset_index()

df_C_country_level["date"] = df_C_country_level.variable.apply(lambda x: pd.to_datetime(x))

df_C_country_level = df_C_country_level[df_C_country_level["value"]!=0]

##### Clean Deaths

In [8]:
df_D = df_D.melt(id_vars = ["Country/Region","Lat","Long","Province/State"])

df_D_country_level = df_D.groupby(["Country/Region","variable"]).agg({"value":"sum"})

df_D_country_level = df_D_country_level.reset_index()

df_D_country_level["date"] = df_D_country_level.variable.apply(lambda x: pd.to_datetime(x))

df_D_country_level = df_D_country_level[df_D_country_level["value"]!=0]

##### Clean Recovered

In [9]:
df_R = df_R.melt(id_vars = ["Country/Region","Lat","Long","Province/State"])

df_R_country_level = df_R.groupby(["Country/Region","variable"]).agg({"value":"sum"})

df_R_country_level = df_R_country_level.reset_index()

df_R_country_level["date"] = df_R_country_level.variable.apply(lambda x: pd.to_datetime(x))

df_R_country_level = df_R_country_level[df_R_country_level["value"]!=0]

#### Merge deaths with confirmed cases with recovered

In [10]:
df_merge = df_C_country_level.merge(df_D_country_level ,on = ["Country/Region","date"],how = "left").\
merge(df_R_country_level ,on = ["Country/Region","date"],how = "left")

#### Cleaning Merged Data

In [11]:
def error(x):
    if pd.isnull(x):
        return 0
    else:
        return x
    
df_merge["deaths"] = df_merge.value_y.apply(lambda x: error(x))
df_merge["recovered"] = df_merge.value.apply(lambda x: error(x))

#### Get Max date

In [12]:
def get_max_date(Table):
    
    max_date = session.query(Table).order_by(desc('date')).first()
    session.commit()
    
    return max_date.date

max_date = get_max_date(m.CasesGlobal)

#### Delete Old Data

In [13]:
def delete_old_data(max_date,Table):

        session.query(Table).filter(Table.date >=  (max_date  - timedelta(days=3))).delete()
        session.commit()
        
delete_old_data(max_date,m.CasesGlobal)

#### Filter New data

In [14]:
def filter_new_data(df,max_date):
    
    df =df[df["date"] >= max_date - timedelta(days=3)].reset_index()
    
    return df    

df_merge = filter_new_data(df_merge,max_date)

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
df_merge

Unnamed: 0,index,Country/Region,variable_x,Lat,Long,value_x,date,variable_y,value_y,variable,value,deaths,recovered
0,38,Afghanistan,4/10/20,33.000000,65.000000,521,2020-04-10,4/10/20,15.0,4/10/20,32.0,15.0,32.0
1,39,Afghanistan,4/11/20,33.000000,65.000000,555,2020-04-11,4/11/20,18.0,4/11/20,32.0,18.0,32.0
2,40,Afghanistan,4/12/20,33.000000,65.000000,607,2020-04-12,4/12/20,18.0,4/12/20,32.0,18.0,32.0
3,41,Afghanistan,4/13/20,33.000000,65.000000,665,2020-04-13,4/13/20,21.0,4/13/20,32.0,21.0,32.0
4,42,Afghanistan,4/14/20,33.000000,65.000000,714,2020-04-14,4/14/20,23.0,4/14/20,40.0,23.0,40.0
5,43,Afghanistan,4/15/20,33.000000,65.000000,784,2020-04-15,4/15/20,25.0,4/15/20,43.0,25.0,43.0
6,44,Afghanistan,4/2/20,33.000000,65.000000,273,2020-04-02,4/2/20,6.0,4/2/20,10.0,6.0,10.0
7,45,Afghanistan,4/3/20,33.000000,65.000000,281,2020-04-03,4/3/20,6.0,4/3/20,10.0,6.0,10.0
8,46,Afghanistan,4/4/20,33.000000,65.000000,299,2020-04-04,4/4/20,7.0,4/4/20,10.0,7.0,10.0
9,47,Afghanistan,4/5/20,33.000000,65.000000,349,2020-04-05,4/5/20,7.0,4/5/20,15.0,7.0,15.0


#### Upload data to SQL

In [16]:
for i in range(len(df_merge)):
    
    ## Check Country exisits
    Country = session.query(m.Country).filter(m.Country.country == df_merge["Country/Region"][i]).first()
    if Country is None:
        Country = m.Country(country = df_merge["Country/Region"][i],
                            lat = df_merge["Lat"][i],
                            long = df_merge["Long"][i]
                           )
        session.add(Country)
        session.commit()
    
    CasesGlobal = m.CasesGlobal( 
                        date = df_merge["date"][i],
                        confirmed = int(df_merge["value_x"][i]),
                        deaths = int(df_merge["deaths"][i]),
                        recovered = int(df_merge["recovered"][i]),
                        country_id = Country.id)
    
    
    session.add(CasesGlobal)
    print(CasesGlobal.date)
    
session.commit()
session.close()

2020-04-10 00:00:00
2020-04-11 00:00:00
2020-04-12 00:00:00
2020-04-13 00:00:00
2020-04-14 00:00:00
2020-04-15 00:00:00
2020-04-02 00:00:00
2020-04-03 00:00:00
2020-04-04 00:00:00
2020-04-05 00:00:00
2020-04-06 00:00:00
2020-04-07 00:00:00
2020-04-08 00:00:00
2020-04-09 00:00:00
2020-04-10 00:00:00
2020-04-11 00:00:00
2020-04-12 00:00:00
2020-04-13 00:00:00
2020-04-14 00:00:00
2020-04-15 00:00:00
2020-04-02 00:00:00
2020-04-03 00:00:00
2020-04-04 00:00:00
2020-04-05 00:00:00
2020-04-06 00:00:00
2020-04-07 00:00:00
2020-04-08 00:00:00
2020-04-09 00:00:00
2020-04-10 00:00:00
2020-04-11 00:00:00
2020-04-12 00:00:00
2020-04-13 00:00:00
2020-04-14 00:00:00
2020-04-15 00:00:00
2020-04-02 00:00:00
2020-04-03 00:00:00
2020-04-04 00:00:00
2020-04-05 00:00:00
2020-04-06 00:00:00
2020-04-07 00:00:00
2020-04-08 00:00:00
2020-04-09 00:00:00
2020-04-10 00:00:00
2020-04-11 00:00:00
2020-04-12 00:00:00
2020-04-13 00:00:00
2020-04-14 00:00:00
2020-04-15 00:00:00
2020-04-02 00:00:00
2020-04-03 00:00:00
