In [1]:
import gc
import os
from pathlib import Path
import random
import sys
from tqdm import tqdm
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# --- setup ---
# Use the fully-qualified option key: the short pattern 'max_columns' is resolved by
# regex and is ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 50)

  import pandas.util.testing as tm


### 載入資料

使用到的資料集: [COVID-19/csse_covid_19_data/csse_covid_19_time_series/](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series)

- 全球
    - `confirmed_global_df` : dataframe，儲存自 2020/1/22 起各國的累積確診人數（本次執行的資料至 2020/6/12）

    - `deaths_global_df` : dataframe，儲存自 2020/1/22 起各國的累積死亡人數（本次執行的資料至 2020/6/12）

    - `recovered_global_df` : dataframe，儲存自 2020/1/22 起各國的累積康復人數（本次執行的資料至 2020/6/12）
- 美國
    - `confirmed_us_df` : dataframe，儲存自 2020/1/22 起美國的累積確診人數（本次執行的資料至 2020/6/12）
    
    - `deaths_us_df` : dataframe，儲存自 2020/1/22 起美國的累積死亡人數（本次執行的資料至 2020/6/12）

In [2]:
import requests

# One-off download of the JHU CSSE time-series CSV files into the working directory.
# Kept commented out so that re-running the notebook reads the local copies below
# instead of hitting the network; uncomment to refresh the data.
# for filename in ['time_series_covid19_confirmed_global.csv',
#                  'time_series_covid19_deaths_global.csv',
#                  'time_series_covid19_recovered_global.csv',
#                  'time_series_covid19_confirmed_US.csv',
#                  'time_series_covid19_deaths_US.csv']:
#     print(f'Downloading {filename}')
#     url = f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/{filename}'
#     myfile = requests.get(url)
#     open(filename, 'wb').write(myfile.content)

# Read the three global tables from local CSV files (paths are relative to the
# notebook's working directory). The US tables are read in a later cell.
confirmed_global_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_global_df = pd.read_csv('time_series_covid19_deaths_global.csv')
recovered_global_df = pd.read_csv('time_series_covid19_recovered_global.csv')

### 資料前處理

變更日期欄位名稱的格式，由 mm/dd/yy 改成 yyyy-mm-dd（例如 `1/22/20` → `2020-01-22`）

In [3]:
def _convert_date_str(df):
    try:
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]
    except:
        print('_convert_date_str failed with %y, try %Y')
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%Y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]

In [4]:
# Normalise the date column labels of all three global tables (mutates each frame in place).
for _global_frame in (confirmed_global_df, deaths_global_df, recovered_global_df):
    _convert_date_str(_global_frame)

將鑽石公主號的資料移除，約旦河西岸和加薩走廊的資料包含負值，所以一併移除

In [5]:
# Filter out problematic data points (The West Bank and Gaza had a negative value,
# cruise ships were associated with Canada, etc.). US rows are dropped here as well.
# Both patterns are regular expressions applied with str.match, i.e. anchored at the
# start of the value only.
# NOTE(review): the country list printed later in this notebook still contains
# 'Diamond Princess' as a Country_Region value, so only province-level cruise-ship rows
# are removed here -- confirm that this is intended.
# NOTE(review): verify that 'The West Bank and Gaza' is spelled exactly as in the
# dataset; if the label differs, that alternative never matches.
removed_states = "Recovered|Grand Princess|Diamond Princess"
removed_countries = "US|The West Bank and Gaza"

# Use identifier-friendly column names (no '/').
confirmed_global_df = confirmed_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})
deaths_global_df = deaths_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})
recovered_global_df = recovered_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})

# Keep a row only when neither its province nor its country matches an exclusion pattern.
# Missing values are replaced by the string "nan" first so that str.match yields booleans.
confirmed_global_df = confirmed_global_df[
    ~(
        confirmed_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)
        | confirmed_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)
    )
]
deaths_global_df = deaths_global_df[
    ~(
        deaths_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)
        | deaths_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)
    )
]
recovered_global_df = recovered_global_df[
    ~(
        recovered_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)
        | recovered_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)
    )
]

將所有日期合併到同一欄位，該日期的累積人數合併到另一欄位

In [6]:
# Reshape each wide table (one column per date) into long form: one row per
# location and date, with the cumulative count in a single value column.
confirmed_global_melt_df = confirmed_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'],
    value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='ConfirmedCases')

# Each table supplies its own date columns instead of borrowing the confirmed table's.
deaths_global_melt_df = deaths_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'],
    value_vars=deaths_global_df.columns[4:], var_name='Date', value_name='Deaths')

# Bug fix: the recovered series was previously melted from deaths_global_df, so the
# 'Recovered' column was just a copy of 'Deaths'. Melt the recovered table itself.
_recovered_long_df = recovered_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'],
    value_vars=recovered_global_df.columns[4:], var_name='Date', value_name='Recovered')

# The recovered table is a separate file and is not guaranteed to list the same
# locations/coordinates as the confirmed table. Align it to the confirmed key set
# (Lat/Long taken from the confirmed table, unmatched locations get NaN) so that a
# later inner merge on all five key columns does not silently drop locations.
recovered_global_melt_df = confirmed_global_melt_df[
    ['Country_Region', 'Province_State', 'Lat', 'Long', 'Date']
].merge(
    _recovered_long_df[['Country_Region', 'Province_State', 'Date', 'Recovered']]
    .drop_duplicates(subset=['Country_Region', 'Province_State', 'Date']),
    on=['Country_Region', 'Province_State', 'Date'],
    how='left',
)

recovered_global_melt_df.head()

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,Recovered
0,Afghanistan,,33.0,65.0,2020-01-22,0
1,Albania,,41.1533,20.1683,2020-01-22,0
2,Algeria,,28.0339,1.6596,2020-01-22,0
3,Andorra,,42.5063,1.5218,2020-01-22,0
4,Angola,,-11.2027,17.8739,2020-01-22,0


In [7]:
# Combine confirmed, deaths and recovered counts into one long table; both joins are
# inner joins keyed on location (country, province, coordinates) and date.
train = (
    confirmed_global_melt_df
    .merge(deaths_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
    .merge(recovered_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
)

In [8]:
# --- US ---
# Read the US series, drop the columns that are not needed, and rename Long_ to Long.
# _convert_date_str treats the first four columns as identifiers, so this assumes four
# non-date columns remain after the drop -- TODO confirm against the CSV header.
confirmed_us_df = (
    pd.read_csv('time_series_covid19_confirmed_US.csv')
    .drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'])
    .rename(columns={'Long_': 'Long'})
)
deaths_us_df = (
    pd.read_csv('time_series_covid19_deaths_US.csv')
    .drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key', 'Population'])
    .rename(columns={'Long_': 'Long'})
)

# Change the date labels from mm/dd/yy to yyyy-mm-dd (in place).
_convert_date_str(confirmed_us_df)
_convert_date_str(deaths_us_df)

# Clean: drop cruise ships, the 'Recovered' pseudo-state and two territories, then
# aggregate the remaining rows by state and discard the (summed, meaningless) coordinates.
confirmed_us_df = (
    confirmed_us_df[~confirmed_us_df['Province_State'].str.match("Diamond Princess|Grand Princess|Recovered|Northern Mariana Islands|American Samoa")]
    .groupby(['Country_Region', 'Province_State'])
    .sum()
    .reset_index()
    .drop(columns=['Lat', 'Long'])
)
deaths_us_df = (
    deaths_us_df[~deaths_us_df['Province_State'].str.match("Diamond Princess|Grand Princess|Recovered|Northern Mariana Islands|American Samoa")]
    .groupby(['Country_Region', 'Province_State'])
    .sum()
    .reset_index()
    .drop(columns=['Lat', 'Long'])
)

# Wide -> long: gather all date columns (everything after the two key columns).
confirmed_us_melt_df = pd.melt(
    confirmed_us_df,
    id_vars=['Country_Region', 'Province_State'],
    value_vars=confirmed_us_df.columns[2:],
    var_name='Date',
    value_name='ConfirmedCases',
)
deaths_us_melt_df = pd.melt(
    deaths_us_df,
    id_vars=['Country_Region', 'Province_State'],
    value_vars=deaths_us_df.columns[2:],
    var_name='Date',
    value_name='Deaths',
)

# Join US confirmed and death counts per state and date.
train_us = pd.merge(confirmed_us_melt_df, deaths_us_melt_df, on=['Country_Region', 'Province_State', 'Date'])

In [9]:
# Append the US state-level rows to the global table. train_us has no Lat/Long/Recovered
# columns, so those are NaN for the US rows.
# NOTE(review): this cell is not idempotent -- running it twice appends the US rows again.
train = pd.concat((train, train_us), sort=False)

# Give train_us the short lower-case column names and a combined "country/province" key.
train_us = train_us.rename(
    columns={'Country_Region': 'country', 'Province_State': 'province', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities'}
)
train_us['country_province'] = train_us['country'].fillna('') + '/' + train_us['province'].fillna('')

In [10]:
# Preview the combined table (global rows plus US state rows) before its columns are renamed.
train.head()

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,ConfirmedCases,Deaths,Recovered
0,Afghanistan,,33.0,65.0,2020-01-22,0,0,0.0
1,Albania,,41.1533,20.1683,2020-01-22,0,0,0.0
2,Algeria,,28.0339,1.6596,2020-01-22,0,0,0.0
3,Andorra,,42.5063,1.5218,2020-01-22,0,0,0.0
4,Angola,,-11.2027,17.8739,2020-01-22,0,0,0.0


In [11]:
# Switch to short lower-case column names ('Id' is listed but simply ignored when absent)
# and add a combined "country/province" key; a missing province yields e.g. "Albania/".
train = train.rename(
    columns={'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities', 'Recovered': 'recovered'}
)
train['country_province'] = train['country'].fillna('') + '/' + train['province'].fillna('')

# test.rename({'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Fatalities': 'fatalities'}, axis=1, inplace=True)
# test['country_province'] = test['country'].fillna('') + '/' + test['province'].fillna('')
train.head()

Unnamed: 0,country,province,Lat,Long,date,confirmed,fatalities,recovered,country_province
0,Afghanistan,,33.0,65.0,2020-01-22,0,0,0.0,Afghanistan/
1,Albania,,41.1533,20.1683,2020-01-22,0,0,0.0,Albania/
2,Algeria,,28.0339,1.6596,2020-01-22,0,0,0.0,Algeria/
3,Andorra,,42.5063,1.5218,2020-01-22,0,0,0.0,Andorra/
4,Angola,,-11.2027,17.8739,2020-01-22,0,0,0.0,Angola/


## 綜觀全球疫情走勢

觀察 `ww_df` 的各個屬性

In [12]:
# Worldwide daily totals, plus two derived series:
#   new_case      - day-over-day increase of the cumulative confirmed count
#   growth_factor - today's new cases divided by yesterday's new cases
ww_df = (
    train.groupby('date')[['confirmed', 'fatalities']]
    .sum()
    .reset_index()
    .assign(new_case=lambda daily: daily['confirmed'].diff())
    .assign(growth_factor=lambda daily: daily['new_case'] / daily['new_case'].shift(1))
)
ww_df.tail()

Unnamed: 0,date,confirmed,fatalities,new_case,growth_factor
138,2020-06-08,7119002,406568,103776.0,0.914713
139,2020-06-09,7242328,411452,123326.0,1.188387
140,2020-06-10,7376138,417063,133810.0,1.08501
141,2020-06-11,7514529,421455,138391.0,1.034235
142,2020-06-12,7632607,425388,118078.0,0.85322


觀察 `ww_melt_df` 的各個屬性

In [13]:
# Long form (date, variable, value) so that one plotly call can draw one line per metric.
ww_melt_df = ww_df.melt(
    id_vars=['date'],
    value_vars=['confirmed', 'fatalities', 'new_case'],
)
ww_melt_df

Unnamed: 0,date,variable,value
0,2020-01-22,confirmed,555.0
1,2020-01-23,confirmed,654.0
2,2020-01-24,confirmed,941.0
3,2020-01-25,confirmed,1434.0
4,2020-01-26,confirmed,2118.0
...,...,...,...
424,2020-06-08,new_case,103776.0
425,2020-06-09,new_case,123326.0
426,2020-06-10,new_case,133810.0
427,2020-06-11,new_case,138391.0


### 全球確診/死亡案例 (折線圖)

- 2020/4/2 確診人數突破 1M，且死亡人數為 52K
- 2020/5/1 確診人數突破 3.3M，且死亡人數為 238K
- **好消息! 每日新增確診案例曲線從 2020/4/4 開始趨於平緩至今**

In [14]:
# One line per metric (confirmed / fatalities / new_case) on a linear y axis.
fig = px.line(
    ww_melt_df,
    x="date",
    y="value",
    color='variable',
    title="Worldwide Confirmed/Death Cases Over Time",
)
fig.show()

### 全球確診/死亡案例 (折線圖) (取log)

- 比較 2020/3 初和 2020/3 底，確診案例成長率的上升速度略為增加

In [15]:
# Same three series as the linear chart, drawn on a logarithmic y axis.
fig = px.line(
    ww_melt_df,
    x="date",
    y="value",
    color='variable',
    title="Worldwide Confirmed/Death Cases Over Time (Log scale)",
    log_y=True,
)
fig.show()

### 全球死亡率 (折線圖)

- 可以明顯看到，死亡率在 2020/5 開始下降

In [16]:
# Mortality here is cumulative fatalities divided by cumulative confirmed cases per date.
ww_df['mortality'] = ww_df['fatalities'].div(ww_df['confirmed'])

fig = px.line(
    ww_df,
    x="date",
    y="mortality",
    title="Worldwide Mortality Rate Over Time",
)
fig.show()

列出有多少國家位於何種確診案例的數量等級

In [17]:
# Per-country daily totals (provinces/states are summed into their country).
country_df = (
    train.groupby(['date', 'country'])[['confirmed', 'fatalities']]
    .sum()
    .reset_index()
)
# Most recent date in the data; ISO date strings sort chronologically.
target_date = country_df['date'].max()

print('Date: ', target_date)
# For each threshold, count the countries strictly above it on the most recent date.
for i in (1, 10, 100, 1000, 10000):
    n_countries = country_df.query('(date == @target_date) & confirmed > @i').shape[0]
    print(f'{n_countries} countries have more than {i} confirmed cases')

Date:  2020-06-12
188 countries have more than 1 confirmed cases
184 countries have more than 10 confirmed cases
163 countries have more than 100 confirmed cases
120 countries have more than 1000 confirmed cases
59 countries have more than 10000 confirmed cases


列出所有國家

In [18]:
# Distinct country names, in order of first appearance in country_df.
countries = pd.unique(country_df['country'])
print(f'{len(countries)} countries are in dataset:\n{countries}')

188 countries are in dataset:
['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Burundi'
 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada' 'Central African Republic'
 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo (Brazzaville)'
 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus'
 'Czechia' 'Denmark' 'Diamond Princess' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji' 'Finland' 'France'
 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel'

### 目前確診案例前 30 國家 (折線圖)

In [19]:
# Recompute the per-country daily totals, then keep the latest-date rows of countries
# with more than 1000 confirmed cases, largest first.
# NOTE(review): the section heading says "top 30 countries", but this selects every
# country above 1000 cases -- confirm whether a later cell truncates to 30.
country_df = (
    train.groupby(['date', 'country'])[['confirmed', 'fatalities']]
    .sum()
    .reset_index()
)
top_country_df = (
    country_df
    .query('(date == @target_date) & (confirmed > 1000)')
    .sort_values(by='confirmed', ascending=False)
)

In [20]:
# Three vertical line shapes at x = 20, 40 and 60 (from y = 0 to y = 1 in data
# coordinates), collected into a plotly layout.
shapes = [
    {'type': 'line', 'xref': 'x', 'yref': 'y', 'x0': x_pos, 'y0': 0, 'x1': x_pos, 'y1': 1}
    for x_pos in (20, 40, 60)
]
layout = go.Layout(shapes=shapes)

# begin -------------------------------------------------------

# 台灣

In [86]:
from datetime import timedelta

# --- shared helpers for the per-country plots ---
def _country_series(country_str):
    """Return the rows of the global ``country_df`` for one country, re-indexed 0..n-1.

    Uses a boolean mask instead of appending matching rows one at a time
    (``DataFrame.append`` is quadratic in a loop and was removed in pandas 2.0).

    Raises
    ------
    ValueError
        If ``country_df`` has no row for ``country_str``.
    """
    country_data = country_df[country_df['country'] == country_str].reset_index(drop=True)
    if country_data.empty:
        raise ValueError(f'No data found for country {country_str!r}')
    return country_data


def _add_event_markers(fig, events, line_top, anno_y, anno_angle):
    """Draw one dashed vertical line and one rotated label per event on ``fig``.

    Each event is a sequence ``[month_day, label, color]`` where ``month_day`` is
    ``'MM-DD'`` (the year 2020 is prepended). The line runs from y=0 to ``line_top``;
    the label is placed at y=``anno_y`` one day before the event so that it sits
    next to the line instead of on top of it.
    """
    for event in events:
        month_day, label, color = event[0], event[1], event[2]
        event_date = '2020-' + month_day
        fig.add_trace(go.Scatter(x=[event_date, event_date], y=[0, line_top], name=label,
                                 mode='lines', line=dict(dash='dash', color=color)))
        day_before = (datetime.strptime(event_date, "%Y-%m-%d").date() - timedelta(days=1)).strftime("%Y-%m-%d")
        fig.add_annotation(x=day_before, y=anno_y, text=month_day + ' ' + label,
                           showarrow=False, textangle=anno_angle, bgcolor='#101010')


# plot function
def plot_by_country(country_str, events, anno_pos = 0.5, anno_angle = -90):
    """Plot cumulative confirmed cases of one country with event markers.

    Parameters
    ----------
    country_str : str
        Value of the ``country`` column in the global ``country_df``.
    events : sequence of [month_day, label, color]
        Events to mark; see ``_add_event_markers``.
    anno_pos : float
        Vertical position of the labels as a fraction of the last confirmed value.
    anno_angle : float
        Rotation of the labels in degrees.
    """
    country_data = _country_series(country_str)
    fig = px.line(country_data,
                  x='date', y='confirmed', color='country',
                  title=f'Confirmed Cases of {country_str} till {target_date}',
                  height=500)
    # The last row holds the latest cumulative count; it sets the marker height.
    ymax = country_data['confirmed'].iloc[-1]
    _add_event_markers(fig, events, line_top=ymax, anno_y=ymax * anno_pos, anno_angle=anno_angle)
    fig.show()


# plot function (log)
def plot_by_country2(country_str, events, anno_pos = 0.5, anno_angle = -90):
    """Same as ``plot_by_country`` but plots ``log10(confirmed + 1)``.

    The ``+ 1`` keeps days with zero cases finite. Parameters are as for
    ``plot_by_country``; ``anno_pos`` is a fraction of the last log value.
    """
    country_data = _country_series(country_str)
    country_data = country_data.assign(confirmed=np.log10(country_data['confirmed'] + 1))
    fig = px.line(country_data,
                  x='date', y='confirmed', color='country',
                  title=f'Confirmed Cases of {country_str} till {target_date} (log)',
                  height=500)
    ymax = country_data['confirmed'].iloc[-1]
    _add_event_markers(fig, events, line_top=ymax, anno_y=ymax * anno_pos, anno_angle=anno_angle)
    fig.show()


# plot function (log slope)
def plot_by_country3(country_str, events, anno_pos = 0.5, anno_angle = -90):
    """Plot the day-over-day change of ``log10(confirmed + 1)`` with event markers.

    The slope of the first day is defined as 0. Event lines run from 0 to 0.5, or to
    ``0.5 * anno_pos`` when ``anno_pos > 1``; labels are placed at ``0.5 * anno_pos``.
    Other parameters are as for ``plot_by_country``.
    """
    country_data = _country_series(country_str)
    log_confirmed = np.log10(country_data['confirmed'] + 1)
    slope = log_confirmed.diff()
    slope.iloc[0] = 0  # there is no previous day for the first row
    country_data = country_data.assign(confirmed=log_confirmed, slope=slope)
    fig = px.line(country_data,
                  x='date', y='slope', color='country',
                  title=f'Confirmed Cases of {country_str} till {target_date} (log slope)',
                  height=500)
    line_y = anno_pos * 0.5 if anno_pos > 1 else 0.5
    _add_event_markers(fig, events, line_top=line_y, anno_y=float(0.5 * anno_pos), anno_angle=anno_angle)
    fig.show()

# Events to mark on the Taiwan charts: [month-day, label, line colour].
events = [
    ['02-11', '事件1', '#ffff00'],
    ['03-22', '事件2', 'rgb(200, 200, 200)'],
]
plot_by_country('Taiwan*', events)
plot_by_country2('Taiwan*', events, anno_pos=0.4, anno_angle=-60)
plot_by_country3('Taiwan*', events, anno_pos=0.75, anno_angle=-75)

# 美國

In [76]:
# Single event marker for the US charts: [month-day, label, line colour].
events = [
    ['02-11', '事件', 'rgb(200, 200, 200)'],
]
plot_by_country('US', events=events)
plot_by_country2('US', events=events)
plot_by_country3('US', events=events)

# 俄羅斯

In [77]:
# Single event marker for the Russia charts: [month-day, label, line colour].
events = [
    ['02-11', '事件', 'rgb(200, 200, 200)'],
]
plot_by_country('Russia', events=events)
plot_by_country2('Russia', events=events)
plot_by_country3('Russia', events=events)

# 南韓

In [81]:
# Single event marker for the South Korea charts: [month-day, label, line colour].
events = [
    ['02-11', '事件', 'rgb(200, 200, 200)'],
]
plot_by_country('Korea, South', events=events)
plot_by_country2('Korea, South', events=events)
plot_by_country3('Korea, South', events, anno_pos=1.2, anno_angle=-60)