<a href="https://colab.research.google.com/github/carlosperezm/covid_time_series/blob/main/covid_time_series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import packages

- Data exploration
  - Plots
    - Stationary / non-stationary
  - Outliers
- Split the dataset into training and test sets
- ARIMA
  - Characterization
   - 


In [22]:
import pandas as pd
import numpy as np
import requests
import altair as alt

In [23]:
# URL fragments; load_data_covid concatenates BASE + DISEASE + HISTORICAL
# (plus an optional country name) to build the request URL.
BASE = "https://disease.sh/"
DISEASE = "v3/covid-19/"
HISTORICAL = "historical/"
# Country names. NOTE(review): not referenced by the cells visible here;
# presumably intended as arguments to load_data_covid — confirm or remove.
COLOMBIA = "Colombia"
SPAIN = "Spain"
FRANCE = "France"

Fetch the data from the API and load it into a pandas DataFrame

In [39]:
def load_data_covid(country=None, timeout=30):
    """Download the historical COVID-19 timeline and return it as a DataFrame.

    Parameters
    ----------
    country : str, optional
        Name appended to the historical endpoint URL. Any non-string value
        (including the default ``None``) requests the endpoint without a
        country suffix.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot
        block the notebook forever.

    Returns
    -------
    pandas.DataFrame or None
        One row per (country, date). The ``cases``/``deaths``/``recovered``
        timeline fields are renamed to ``total_cases``/``total_deaths``/
        ``total_recoveries`` and ``date`` is parsed to datetime.
        ``None`` is returned (after printing the error) when the request or
        the JSON decoding fails.
    """
    url = f"{BASE}{DISEASE}{HISTORICAL}"
    if isinstance(country, str):
        url += country

    try:
        r = requests.get(url, params={"lastdays": "all"}, timeout=timeout)
        r.raise_for_status()
        payload = r.json()  # parsed once and reused below
    except (requests.exceptions.RequestException, ValueError) as err:
        # RequestException also covers connection errors and timeouts, not only
        # HTTP status errors; ValueError covers an undecodable body.
        print(err)
        return None

    # Normalise both payload shapes (a single entry or a list of entries) so
    # one code path builds the frame.
    records = payload if isinstance(payload, list) else [payload]

    # NOTE(review): entries are keyed by their 'country' value, so if the
    # payload holds several entries for the same country only the last one
    # survives — confirm against the API response.
    df = pd.concat(
        {c['country']: pd.DataFrame.from_dict(c["timeline"]) for c in records}
    ).reset_index()

    cols = {
        'level_0': 'country',
        'level_1': 'date',
        'cases': 'total_cases',
        'deaths': 'total_deaths',
        'recovered': 'total_recoveries',
    }
    df = df.rename(columns=cols)
    df["date"] = pd.to_datetime(df["date"])
    return df

In [25]:
def process_total_errors(full_data):
    """Repair day-over-day drops in the cumulative total columns.

    For each of ``total_cases``, ``total_deaths`` and ``total_recoveries`` a
    value that is lower than the previous row *of the same country* is
    replaced by the last value kept for that country. Only the offending row
    is replaced; later rows are compared against the raw previous value.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Frame with a ``country`` column and the three total columns, with
        each country's rows in chronological order.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the input frame is left untouched. Missing values in
        the total columns are forward filled within the country and any that
        remain are set to 0.
    """
    cols = ['total_cases', 'total_deaths', 'total_recoveries']
    # Work on a copy so the caller's frame is not modified as a side effect.
    cleaned = full_data.copy()
    for col in cols:
        # Differencing inside each country means the first row of a country is
        # never compared with the last row of the previous one.
        dropped = cleaned.groupby('country', sort=False)[col].diff() < 0
        cleaned[col] = (
            cleaned[col]
            .mask(dropped)
            # Forward fill per country so a value cannot leak across countries.
            .groupby(cleaned['country'], sort=False)
            .ffill()
            .fillna(0)
        )
    return cleaned

In [26]:
def add_daily_data(df):
    """Append daily increments derived from the cumulative totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``country``, ``date``, ``total_cases``, ``total_deaths``
        and ``total_recoveries`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus ``new_cases``, ``new_deaths`` and ``new_recoveries``,
        passed through ``convert_dtypes()``. The first date of every country
        gets 0, and negative increments are clipped to 0.
    """
    total_cols = ['total_cases', 'total_deaths', 'total_recoveries']
    cols = ['new_cases', 'new_deaths', 'new_recoveries']

    # One row per (country, date), sorted by both keys by the groupby.
    totals = df.groupby(['country', 'date'])[total_cols].sum()

    # Difference within each country so the first row of a country is not
    # subtracted from the last row of the previous country.
    daily = totals.groupby(level='country').diff()
    # The first date of each country has no predecessor -> 0 new events;
    # any remaining negative step (a decreasing total) is clipped to 0.
    daily = daily.fillna(0).clip(lower=0)
    daily.columns = cols

    # merge new cases with the original dataframe
    full_grouped = pd.merge(df, daily.reset_index(), on=['country', 'date'])

    return full_grouped.convert_dtypes()

Load and prepare the data for plotting

In [27]:
# Run the pipeline stage by stage: download -> repair totals -> add daily columns.
covid_raw = load_data_covid()
covid_clean = process_total_errors(covid_raw)
full_data = add_daily_data(covid_clean)

# Show a single country as a quick sanity check of the result.
full_data.loc[full_data["country"].eq("Yemen")]

Unnamed: 0,country,date,total_cases,total_deaths,total_recoveries,new_cases,new_deaths,new_recoveries
60536,Yemen,2020-01-22,0,0,0,0,0,0
60537,Yemen,2020-01-23,0,0,0,0,0,0
60538,Yemen,2020-01-24,0,0,0,0,0,0
60539,Yemen,2020-01-25,0,0,0,0,0,0
60540,Yemen,2020-01-26,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
60853,Yemen,2020-12-04,2267,627,1534,28,3,9
60854,Yemen,2020-12-05,2304,633,1547,37,6,13
60855,Yemen,2020-12-06,2337,639,1549,33,6,2
60856,Yemen,2020-12-07,2383,649,1554,46,10,5


Plot with Altair

In [28]:
# Countries compared in the charts below.
countries = ['US', 'Italy', 'Spain', 'Germany', 'Iran', 'UK', 'Colombia', 'France']

# Keep only the rows that belong to one of those countries.
in_selection = full_data['country'].isin(countries)
selected_countries = full_data[in_selection]

Check that the plot is interactive

In [29]:
# Numeric series drawn in every per-country chart. Listing them explicitly
# keeps the string `country` column out of the melted `value` field, which is
# encoded as quantitative.
metrics = [
    "total_cases", "total_deaths", "total_recoveries",
    "new_cases", "new_deaths", "new_recoveries",
]


def country_chart(country):
    """Return an interactive line chart with one line per metric for `country`."""
    long_form = (
        full_data.loc[full_data["country"].eq(country)]
        .melt(id_vars="date", value_vars=metrics)
        # Plain floats avoid depending on how the installed Altair serialises
        # pandas nullable dtypes.
        .astype({"value": "float64"})
    )
    return (
        alt.Chart(long_form)
        .mark_line()
        .encode(x="date:T", y="value:Q", color="variable")
        .properties(title=f"Covid cases {country}")
        .interactive()
    )


# One chart per selected country, keyed by country name.
plots = {c: country_chart(c) for c in countries}

In [30]:
# Lay the chosen per-country charts out left to right in a single row.
row_order = ["Iran", "Colombia", "France", "Germany", "Spain"]
alt.hconcat(*(plots[name] for name in row_order))

In [31]:
# Bubble size encodes the number of new cases reported on each day.
bubble_size = alt.Size(
    'new_cases:Q',
    scale=alt.Scale(range=[0, 1000]),
    legend=alt.Legend(title='Daily new cases'),
)

# One row of bubbles per country along a shared time axis.
(
    alt.Chart(selected_countries)
    .mark_circle()
    .encode(x='date:T', y='country', color='country', size=bubble_size)
    .properties(width=800, height=400, title="Daily cases")
    .interactive()
)