In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
%matplotlib inline

In [2]:
covid_url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import json
import urllib

In [3]:
covid_json_unformated = urllib.request.urlopen(covid_url).read().decode("utf-8")
covid_json = json.loads(covid_json_unformated)
cdf = pd.DataFrame(covid_json['records'])

In [4]:
# Renaming columns

cdf = cdf.rename(columns={
    'dateRep': 'date_reported',
    'year_week': 'year_and_week',
    'cases_weekly': 'cases_per_week',
    'deaths_weekly': 'deaths_per_week',
    'countriesAndTerritories': 'country',
    'geoId': 'iso_3166_1_alpha2',
    'countryterritoryCode': 'iso_3166_1_alpha3',
    'popData2019': 'population_2019',
    'continentExp': 'continent',
    'notification_rate_per_100000_population_14-days': 'incidence_14_days'
    })

In [5]:
cdf.date_reported = pd.to_datetime(cdf['date_reported'], format='%d/%m/%Y', errors='raise')
cdf['date_reported'].dt.day.head()

0     8
1     1
2    25
3    18
4    11
Name: date_reported, dtype: int64

In [6]:
cdf.dtypes

date_reported        datetime64[ns]
year_and_week                object
cases_per_week                int64
deaths_per_week               int64
country                      object
iso_3166_1_alpha2            object
iso_3166_1_alpha3            object
population_2019             float64
continent                    object
incidence_14_days            object
dtype: object

In [7]:
# Create column 'deltaTime_since_start_of_recording'

cdf[['deltaTime_since_start_of_recording']] = cdf.date_reported - cdf.date_reported.min()

In [8]:
# clean/fix data

cdf.cases_per_week.mask(cdf.cases_per_week < 0, pd.NA, inplace=True)
cdf.deaths_per_week.mask(cdf.deaths_per_week < 0, pd.NA, inplace=True)
cdf.incidence_14_days = cdf.incidence_14_days.fillna(0).replace("", 0).astype(float)
cdf.incidence_14_days.mask(cdf.incidence_14_days < 0, pd.NA, inplace=True)
cdf.population_2019.mask(cdf.population_2019 != 0)

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
10642   NaN
10643   NaN
10644   NaN
10645   NaN
10646   NaN
Name: population_2019, Length: 10647, dtype: float64

In [14]:
# create a dataframe weekly_reports, which only contains the relevant information for further analysis
# weekly_reports shows for each week of a year the corresponding values for each country
# (not neccesary, but its nice to have)

weekly_reports = cdf[['year_and_week', 'country', 'cases_per_week', 'deaths_per_week', 
                      'incidence_14_days', 'date_reported', 
                      'deltaTime_since_start_of_recording', 'continent', 'population_2019']] \
    .set_index(['year_and_week', 'country']) \
    .groupby(level=[0, 1]) \
    .first() \
    .reset_index()

weekly_reports[['year', 'week_of_year']] = weekly_reports.year_and_week.str.split('-', expand=True).astype('int')

weekly_reports.drop(columns='year_and_week', inplace=True)
weekly_reports.set_index(['year', 'week_of_year', 'country'], inplace=True)

weekly_reports

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cases_per_week,deaths_per_week,incidence_14_days,date_reported,deltaTime_since_start_of_recording,continent,population_2019
year,week_of_year,country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020,1,Afghanistan,0.0,0.0,0.00,2020-01-06,0 days,Asia,38041757.0
2020,1,Algeria,0.0,0.0,0.00,2020-01-06,0 days,Africa,43053054.0
2020,1,Armenia,0.0,0.0,0.00,2020-01-06,0 days,Europe,2957728.0
2020,1,Australia,0.0,0.0,0.00,2020-01-06,0 days,Oceania,25203200.0
2020,1,Austria,0.0,0.0,0.00,2020-01-06,0 days,Europe,8858775.0
...,...,...,...,...,...,...,...,...,...
2021,5,Wallis_and_Futuna,4.0,0.0,0.00,2021-02-08,399 days,Oceania,
2021,5,Western_Sahara,0.0,0.0,0.00,2021-02-08,399 days,Africa,582458.0
2021,5,Yemen,6.0,0.0,0.03,2021-02-08,399 days,Asia,29161922.0
2021,5,Zambia,7591.0,73.0,92.31,2021-02-08,399 days,Africa,17861034.0


In [9]:
# add column which contains the difference of the 14-days incidence

cdf_incidence_diff = pd.DataFrame.copy(cdf)
grp = cdf.groupby(['country'])
for country in cdf['country'].unique():
    grp_sort_time = grp.get_group(country).sort_values('deltaTime_since_start_of_recording')
    for pos, incidence in enumerate(grp_sort_time['incidence_14_days']):
        if pos == 0:
            cdf_incidence_diff.loc[grp_sort_time.index[pos], 'incidence_14_days_diff'] = float('NaN')
        else:
            cdf_incidence_diff.loc[grp_sort_time.index[pos], 'incidence_14_days_diff'] = (grp_sort_time.iloc[pos]['incidence_14_days'] - 
                                                                                          grp_sort_time.iloc[pos-1]['incidence_14_days'])

In [10]:
# countries (grouped by continent) with the most drastic increase of the 14-days incidence

def top_x(df, x=1, col='incidence_14_days_diff'):
    return df.sort_values(col, ascending=False).head(x)

cdf_highest_incidence_diff = cdf_incidence_diff[['continent', 'country','incidence_14_days_diff']] \
                             .groupby(['continent']).apply(top_x)
cdf_highest_incidence_diff

Unnamed: 0_level_0,Unnamed: 1_level_0,continent,country,incidence_14_days_diff
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,8550,Africa,Seychelles,227.13
America,485,America,Aruba,504.18
Asia,4931,Asia,Israel,296.37
Europe,4403,Europe,Holy_See,1717.79
Oceania,3567,Oceania,French_Polynesia,455.45
Other,1876,Other,Cases_on_an_international_conveyance_Japan,0.0


In [12]:
# countries (grouped by continent) with the most drastic decrease of the 14-days incidence

def bottom_x(df, x=1, col='incidence_14_days_diff'):
    return df.sort_values(col, ascending=True).head(x)
cdf_lowest_incidence_diff = cdf_incidence_diff[['continent', 'country','incidence_14_days_diff']] \
                            .groupby(['continent']).apply(bottom_x)
cdf_lowest_incidence_diff

Unnamed: 0_level_0,Unnamed: 1_level_0,continent,country,incidence_14_days_diff
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,3201,Africa,Eswatini,-153.99
America,1005,America,Belize,-301.27
Asia,4943,Asia,Israel,-294.99
Europe,4401,Europe,Holy_See,-1717.79
Oceania,3563,Oceania,French_Polynesia,-523.13
Other,1876,Other,Cases_on_an_international_conveyance_Japan,0.0


In [13]:
# Identify the country with the highest fluctuation in 14-days incidence

fluctuation = cdf_highest_incidence_diff.nlargest(1, ['incidence_14_days_diff'])
fluctuation

Unnamed: 0_level_0,Unnamed: 1_level_0,continent,country,incidence_14_days_diff
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Europe,4403,Europe,Holy_See,1717.79


In [24]:
cdf.head(10)

Unnamed: 0,date_reported,year_and_week,cases_per_week,deaths_per_week,country,iso_3166_1_alpha2,iso_3166_1_alpha3,population_2019,continent,incidence_14_days,deltaTime_since_start_of_recording
0,2021-02-08,2021-05,238.0,8.0,Afghanistan,AF,AFG,38041757.0,Asia,1.33,399 days
1,2021-02-01,2021-04,267.0,16.0,Afghanistan,AF,AFG,38041757.0,Asia,2.58,392 days
2,2021-01-25,2021-03,713.0,43.0,Afghanistan,AF,AFG,38041757.0,Asia,3.34,385 days
3,2021-01-18,2021-02,557.0,45.0,Afghanistan,AF,AFG,38041757.0,Asia,3.24,378 days
4,2021-01-11,2021-01,675.0,71.0,Afghanistan,AF,AFG,38041757.0,Asia,4.15,371 days
5,2021-01-04,2020-53,902.0,60.0,Afghanistan,AF,AFG,38041757.0,Asia,7.61,364 days
6,2020-12-28,2020-52,1994.0,88.0,Afghanistan,AF,AFG,38041757.0,Asia,7.19,357 days
7,2020-12-21,2020-51,740.0,111.0,Afghanistan,AF,AFG,38041757.0,Asia,6.56,350 days
8,2020-12-14,2020-50,1757.0,71.0,Afghanistan,AF,AFG,38041757.0,Asia,9.01,343 days
9,2020-12-07,2020-49,1672.0,137.0,Afghanistan,AF,AFG,38041757.0,Asia,7.22,336 days


In [29]:
# only keep the information, that is relevant for the next tasks

cdf_important = cdf[['date_reported', 'country', 'deaths_per_week', 'cases_per_week', 'incidence_14_days', 
                     'population_2019', 'continent', 'deltaTime_since_start_of_recording' ]]

In [26]:
# line plot, that shows the 14-days incidence for all european countries

df_plot_Europe = cdf_important[['continent', 'country', 'incidence_14_days', 'date_reported']].groupby('continent')
df_plot_Europe = df_plot_Europe.get_group('Europe')
df_plot_Europe = df_plot_Europe.sort_values(['country', 'date_reported'])

fig = px.line(df_plot_Europe, 
              x='date_reported', y='incidence_14_days', 
              labels={'date_reported': 'date reported',
                     'incidence_14_days': '14-days incidence'},
              title = '14-days incidence of Covid-19 cases in different european countries', 
              color='country', template='plotly_dark')

# fig.show()

In [31]:
# add a column that contains the smoothed 14-days incidence

df_Europe_smoothed = cdf_important.groupby(['continent']) \
                                  .get_group('Europe').copy() 
df_Europe_smoothed.sort_values(['country', 'date_reported'], inplace=True)

df_Europe_smoothed_grouped = df_Europe_smoothed[['incidence_14_days', 'date_reported', 'country']] \
                             .groupby(['country'])
df_Europe_smoothed['incidence_14_days_smoothed'] = df_Europe_smoothed_grouped['incidence_14_days'] \
                                                   .rolling(window=13, center=True) \
                                                   .mean().values
df_Europe_smoothed.head(10)

Unnamed: 0,date_reported,country,deaths_per_week,cases_per_week,incidence_14_days,population_2019,continent,deltaTime_since_start_of_recording,incidence_14_days_smoothed
106,2020-03-09,Albania,0.0,2.0,0.0,2862427.0,Europe,63 days,
105,2020-03-16,Albania,1.0,40.0,1.47,2862427.0,Europe,70 days,
104,2020-03-23,Albania,1.0,47.0,3.04,2862427.0,Europe,77 days,
103,2020-03-30,Albania,8.0,123.0,5.94,2862427.0,Europe,84 days,
102,2020-04-06,Albania,11.0,149.0,9.5,2862427.0,Europe,91 days,
101,2020-04-13,Albania,2.0,85.0,8.17,2862427.0,Europe,98 days,
100,2020-04-20,Albania,3.0,116.0,7.02,2862427.0,Europe,105 days,5.731538
99,2020-04-27,Albania,2.0,164.0,9.78,2862427.0,Europe,112 days,6.397692
98,2020-05-04,Albania,3.0,69.0,8.14,2862427.0,Europe,119 days,7.316923
97,2020-05-11,Albania,0.0,73.0,4.96,2862427.0,Europe,126 days,8.913077


In [32]:
plot_data_Europe = df_Europe_smoothed.groupby(['country'])
fig = go.Figure()
for country in df_Europe_smoothed['country'].unique():
    country_group = plot_data_Europe.get_group(country).sort_values('deltaTime_since_start_of_recording')
    fig.add_trace(go.Scatter(x=country_group['date_reported'],
                             y=country_group['incidence_14_days_smoothed'],
                             mode='lines',
                             name=country))
    fig.update_layout(template="plotly_dark",
                      title="14-days incidence of Covid-19 cases in different european countries (averaged by 3 months)",
                      xaxis=dict(title='date reported'),
                      yaxis=dict(title='smoothed 14-days incidence'))
# fig.show()

In [33]:
# radial plot that shows the death rate per 100,000 people in Germany, Italy, Sweden and Greece 

mask_country = (cdf_important['country']=='Germany') | (cdf_important['country']=='Italy') | \
               (cdf_important['country']=='Sweden') | (cdf_important['country']=='Greece')
cdf_country = cdf_important[mask_country].copy()
grp_country = cdf_country[['deaths_per_week', 'population_2019', 'date_reported','country']].groupby(['country'])

radial_plot_data = []
for country, df in grp_country:
    day_in_year = df['date_reported'] - pd.to_datetime(2020, format='%Y')
    
    radial_plot_data.append(go.Scatterpolar(r=(df['deaths_per_week']*100000)/df['population_2019'],
                                               theta=day_in_year.dt.days * 360/365,
                                               name=country))
    
    layout = {'template' : 'plotly_dark',
              'title': {'text':'Death rate of Covid-19 cases for different european countries since 2020'},
              'polar': {'angularaxis': {'tickmode': 'array',
                                        'tickvals': [0, 72, 144, 216, 288],
                                        'ticktext': ['Day 0', 'Day 73', 'Day 146', 'Day 219', 'Day 292']
                                        },
              'radialaxis': {'dtick': 2,} 
                        }
             }
    
    fig = go.Figure(data=radial_plot_data, layout=layout)
# fig.show()