In [72]:
import pandas as pd
import requests
import io
from plotnine import ggplot, aes, geom_line

# Raw-file root of the time-series folder in the CSSEGISandData/COVID-19 GitHub repository.
jh_covid19_base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series'
# File names under that root. The order matters: the download step unpacks
# the results positionally (confirmed/deaths global, then confirmed/deaths US).
jh_covid19_datasets = [
    'time_series_covid19_confirmed_global.csv',
    'time_series_covid19_deaths_global.csv',
    'time_series_covid19_confirmed_US.csv',
    'time_series_covid19_deaths_US.csv',
]

def df_between_dates(df: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
    """Keep the rows whose ``date`` lies in the window ``(start_date, end_date]``.

    Args:
        df: Frame to filter. If it has no ``date`` column it is handed back
            as-is (the very same object, not a copy).
        start_date: Lower bound, exclusive -- a row dated exactly
            ``start_date`` is dropped.
        end_date: Upper bound, inclusive.

    Returns:
        The matching rows, in their original order and with their original
        index labels.
    """
    has_date_column = 'date' in list(df.columns)
    if not has_date_column:
        return df
    dates = df['date']
    after_start = dates > start_date
    up_to_end = dates <= end_date
    return df[after_start & up_to_end]

def tweak_cigar(df: pd.DataFrame) -> pd.DataFrame:
    """Tidy the raw cigarette-consumption table.

    Renames ``Country`` -> ``country`` and ``Cigarette Consumption`` ->
    ``consumption``, casts ``country`` to a categorical and ``consumption``
    to float, orders the rows from highest to lowest consumption and drops
    every row that contains a missing value.

    Args:
        df: Raw table with ``Country`` and ``Cigarette Consumption`` columns.

    Returns:
        A new frame; the input is not modified.

    Raises:
        KeyError: If either expected column is missing (raised by ``astype``).
    """
    # The return annotation must be spelled pd.DataFrame: only `pd` is imported,
    # so a bare `DataFrame` raises NameError as soon as the def is executed.
    return df.rename(columns={
        'Country': 'country',
        'Cigarette Consumption': 'consumption'
    }).astype({
        'country': 'category',
        # float64 instead of float16: half precision has an 11-bit significand,
        # so values above 2048 would be silently rounded to even integers.
        'consumption': 'float64'
    }).sort_values(
        'consumption', ascending=False
    ).dropna()

def tweak_covid19_dataset(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Reshape a wide per-region time-series table into per-country totals.

    The input must carry ``Province/State``, ``Country/Region``, ``Lat`` and
    ``Long`` columns; every remaining column is treated as one date's worth
    of values. Regions of the same country are summed together.

    Args:
        df: Wide table, one row per region and one column per date.
        name: Suffix for the value column, which is named ``total_<name>``.

    Returns:
        A long frame with columns ``date``, ``total_<name>``, ``country`` (in
        that order), one row per (country, date) pair, sorted by those keys.
        ``date`` holds the original column labels unchanged -- they are not
        parsed here, so string labels sort lexicographically.

    Raises:
        KeyError: If ``Province/State``, ``Lat``, ``Long`` or
            ``Country/Region`` is absent from ``df``.
    """
    value_col = f'total_{name}'
    wide = df.drop(
        columns=['Province/State', 'Lat', 'Long']
    ).rename(columns={
        'Country/Region': 'country'
    })
    # Leaving value_vars unset melts every non-id column, so the result no
    # longer depends on `country` happening to be the first column.
    long_form = pd.melt(
        wide,
        id_vars=['country'],
        var_name='date',
        value_name=value_col
    )
    # Sum only the value column; the grouping keys come back as ordinary
    # columns because of as_index=False, so `country` needs no aggregation.
    totals = long_form.groupby(
        ['country', 'date'], as_index=False
    )[value_col].sum()
    # Column order is fixed explicitly to match what this function has
    # historically returned (date, value, country).
    return totals[['date', value_col, 'country']].fillna({value_col: 0})

def merge_dfs(df_a: pd.DataFrame, df_b: pd.DataFrame, _on: list, _how: str='inner') -> pd.DataFrame:
    """Join two frames on shared key columns via ``pd.merge``.

    The left frame comes first so the function can be used with
    ``DataFrame.pipe``.

    Args:
        df_a: Left frame.
        df_b: Right frame.
        _on: Column names to join on; forwarded as ``on``.
        _how: Join type forwarded as ``how``; defaults to ``'inner'``.

    Returns:
        The merged frame.
    """
    join_spec = {'on': _on, 'how': _how}
    return pd.merge(df_a, df_b, **join_spec)

def get_csv(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a CSV over HTTP(S) and parse it into a DataFrame.

    Args:
        url: Address of the CSV file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed table, with the response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A browser-like User-Agent is sent with every request -- presumably because
    # some hosts reject the default client string; TODO confirm it is still needed.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an HTML error page to read_csv.
    response.raise_for_status()
    content = io.StringIO(response.content.decode('utf-8'))
    return pd.read_csv(content)
    
# Download the four time-series tables; the unpacking relies on the order of
# the file names in jh_covid19_datasets.
confirmed_global, deaths_global, confirmed_US, deaths_US = (
    get_csv(f'{jh_covid19_base_url}/{data_set}') for data_set in jh_covid19_datasets
)

# Per-country covariates: cigarette consumption (remote CSV) and weekly
# exercise hours (local CSV).
avg_daily_cigar_consumption = tweak_cigar(get_csv("https://tobaccoatlas.org/wp-content/uploads/2022/05/data-z9rR9.csv"))
weekly_excercise_hours_mean = pd.read_csv('./data/ipsos_global_views_exercise.csv')

# Reshape the global tables to one row per (country, date).
confirmed_global = tweak_covid19_dataset(confirmed_global, 'cases')
deaths_global = tweak_covid19_dataset(deaths_global, 'deaths')

# Combine cases and deaths, attach the covariates and keep dates after
# 2020-01-01 up to and including 2020-12-31. Every join is an inner join,
# so only countries present in all sources survive.
covid_stats = pd.merge(confirmed_global, deaths_global, how='inner', on=['date', 'country']).astype({
    # The unit must be explicit: pandas 2.x rejects the unit-less 'datetime64'.
    'date':'datetime64[ns]',
    'total_deaths':'int64',
    'total_cases':'int64',
    'country':'category'
}).pipe(
    merge_dfs, df_b=avg_daily_cigar_consumption, _on=['country']
).pipe(
    merge_dfs, df_b=weekly_excercise_hours_mean, _on=['country']
).pipe(df_between_dates, '2020-01-01', '2020-12-31')

In [71]:
# Display the merged table; the rendered output elides the middle rows.
covid_stats

Unnamed: 0,date,total_cases,country,total_deaths,consumption,weekly_excercise_hours_mean
28,2020-01-22,0,Argentina,0,978.5,4.4
31,2020-01-23,0,Argentina,0,978.5,4.4
34,2020-01-24,0,Argentina,0,978.5,4.4
37,2020-01-25,0,Argentina,0,978.5,4.4
40,2020-01-26,0,Argentina,0,978.5,4.4
...,...,...,...,...,...,...
26583,2020-09-05,278228,Turkey,6620,1542.0,5.8
26586,2020-09-06,279806,Turkey,6673,1542.0,5.8
26589,2020-09-07,281509,Turkey,6730,1542.0,5.8
26592,2020-09-08,283270,Turkey,6782,1542.0,5.8
