In [1]:
import requests
import json 
import pandas as pd

# Stage 1 : R&D & Data Verification

# Stage 2 : R&D (loops and cleaning)

# Stage 2 : Functionalize and Unit Tests 

In [2]:
# define our variables before running the function
countries = ['brunei','cambodia','indonesia','laos','malaysia','myanmar','philippines','singapore','thailand','vietnam']
required_cols =['Country','Cases','Date']
url_pattern = 'https://api.covid19api.com/total/country/{}/status/confirmed'
rename_list = ['brunei','laos','vietnam']

rename_country ={
'Brunei Darussalam':'Brunei',
'Lao PDR':'Laos',
'Viet Nam':'Vietnam'
}

In [3]:
def extract_API(countries,rename_country,url_pattern,required_cols):
    '''
    this function helps us to extract from the covid api
    
    ##inputs##
    countries a list of countries
    rename_country dictionary of old(key) and new(value)
    url_pattern string in this format 'https://api.covid19api.com/total/country/{}/status/confirmed'
    required_cols a list
    
    '''
    
    #unit test for input
    assert isinstance(countries, list),'tauke,needs to be a list lah'
    assert isinstance(required_cols, list),'ah bang,needs to be a list lah'
    assert isinstance(rename_country, dict),'aunty,needs to be a dictionary mai luan luan lai'
    assert isinstance(url_pattern, str),"chio bu, something like this https://api.covid19api.com/total/country/{}/status/confirmed"
    
    df = pd.DataFrame() #instantiate an empty DF
    for country in countries:
        
    # scrape data from api based on countries and concat accordingly to form a dataset
        url = url_pattern.format(country)
        response = requests.request("GET", url)
        parsed_data = json.loads(response.text)
        temp = pd.DataFrame.from_dict(parsed_data)

        # we only want columns that we cared for
        temp=temp[required_cols] 

        # since the data that we scraped is cumulative we need to take the difference
        # meaning if say each day has 1 case, the third day value is 1 and not 3
        temp['Cases'] = temp['Cases'].diff()
        temp.Cases.fillna(0) #since the first row is null as it's shifted by 'diff'

        if country in rename_list: # cleaning data by changing names
            temp['Country'] = temp['Country'].map(rename_country)
            df = pd.concat([df,temp])
    
    return df

df_from_ai = extract_API(countries,rename_country,url_pattern,required_cols)

In [4]:
def transform_api_data(df):
    '''
    input df_from_ai is a df that we want to transform
    
    this function helps to increase the resolution of the date field
    '''
    check_columns = ['Country', 'Cases', 'Date']
    assert df_from_ai.columns.to_list() == check_columns , "df columns with ['Country', 'Cases', 'Date']"
    
    #extract more resolution from date_time
    df['year'] = pd.DatetimeIndex(df['Date']).year
    df['month'] = pd.DatetimeIndex(df['Date']).month
    df['day'] = pd.DatetimeIndex(df['Date']).day
    df['quarter'] = pd.DatetimeIndex(df['Date']).quarter
    df['date'] = pd.DatetimeIndex(df['Date'])
    df['year_month'] = pd.DatetimeIndex(df['Date']).strftime('%Y-%m')
    
    # delete the not so useful column
    del df['Date']    
    return df

df = transform_api_data(df_from_ai)

## Simple checks

In [5]:
# samples 10 rows of data
df.sample(10)

Unnamed: 0,Country,Cases,year,month,day,quarter,date,year_month
388,Vietnam,53.0,2021,2,13,1,2021-02-13 00:00:00+00:00,2021-02
41,Brunei,0.0,2020,3,3,1,2020-03-03 00:00:00+00:00,2020-03
56,Vietnam,9.0,2020,3,18,1,2020-03-18 00:00:00+00:00,2020-03
28,Laos,0.0,2020,2,19,1,2020-02-19 00:00:00+00:00,2020-02
247,Vietnam,0.0,2020,9,25,3,2020-09-25 00:00:00+00:00,2020-09
188,Vietnam,15.0,2020,7,28,3,2020-07-28 00:00:00+00:00,2020-07
109,Vietnam,0.0,2020,5,10,2,2020-05-10 00:00:00+00:00,2020-05
490,Vietnam,155.0,2021,5,26,2,2021-05-26 00:00:00+00:00,2021-05
310,Laos,0.0,2020,11,27,4,2020-11-27 00:00:00+00:00,2020-11
224,Brunei,0.0,2020,9,2,3,2020-09-02 00:00:00+00:00,2020-09


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1668 entries, 0 to 555
Data columns (total 8 columns):
Country       1668 non-null object
Cases         1665 non-null float64
year          1668 non-null int64
month         1668 non-null int64
day           1668 non-null int64
quarter       1668 non-null int64
date          1668 non-null datetime64[ns, UTC]
year_month    1668 non-null object
dtypes: datetime64[ns, UTC](1), float64(1), int64(4), object(2)
memory usage: 117.3+ KB


In [7]:
# export extracted and cleaned content to csv
df.to_csv('covid.csv',index=False)