In [1]:
import copy
import json
import math
import random
from datetime import datetime
from os import getcwd
from os.path import basename, dirname, join
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Timestamp
%matplotlib inline


# 1. Motivation
-------
The reporting of COVID deaths may not be accurate for the following reasons:
- Most COVID deaths are associated with other illnesses, so it is hard to say whether a death was really caused by COVID.
- Not every death is diagnosed for its real cause.
    - Some families of the deceased do not want the test to be done.
    - Testing has an additional cost.  This cost is not always justified, especially when medical resources are short during a serious pandemic or where medical resources are very poor.
    
Another way to determine COVID deaths is to find the excess deaths over the regular deaths in the time before COVID pandemic.   Therefore, we need some kind of data covering a few years before COVID and years in COVID.  The data is available in https://github.com/akarlinsky/world_mortality/blob/main/world_mortality.csv

# 2. Understanding of the data

In [None]:
# https://github.com/akarlinsky/world_mortality/blob/main/world_mortality.csv
# https://github.com/akarlinsky/world_mortality
path = join(getcwd().rstrip('src'), 'data/world_mortality.csv').replace('\\', '/')
DF = pd.read_csv(path)
# DF = pd.read_csv('~/AI/DATA/BigData/DeathBirthRate/world_mortality2015-20220214.csv')
DF.rename(columns = {'country_name':'country'}, inplace=True)
print(DF.head(10))


: 

: 

## 2.1 find out how many countries are included

In [None]:
AllCountries = set(DF.country)
print("# of countries included in the data is ", len(AllCountries))

: 

: 

In [None]:
print(DF.year.min(), DF.year.max())

: 

: 

In [None]:
# # of countries included in 2015
len(set(DF[DF.year == 2015].country))

: 

: 

In [None]:
# # of countries included in 2015
len(set(DF[DF.year == 2015].country))

: 

: 

In [None]:
# # of countries included in 2022
len(set(DF[DF.year == 2022].country))

: 

: 

In [None]:
# # of countries included in 2021
len(set(DF[DF.year == 2021].country))

: 

: 

In [None]:
# # of countries included in 2020
len(set(DF[DF.year == 2020].country))

: 

: 

## 2.2 Find out how many different time_unit
---
We have seen "monthly" time_unit.  Let us find out if there is any other time_unit.

In [None]:
# # of countries included in 2020
set(DF.time_unit)

: 

: 

## 2.3 We have seen only two time_unit.  Let us find out if the values are complete for each country

In [None]:
set(DF[DF.year==2020].groupby(['country', 'time_unit']).year.count())
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.count.html
# can distinguish nan and non-nan

: 

: 

### Since the above returns 8, 12 and 53 for the year 2020, it is probable that some countries with a monthly time_unit do not have complete values for all months in 2020.

In [None]:
DF[DF.year==2020].groupby(['country', 'time_unit']).count().head()

: 

: 

In [None]:
tmp = DF[DF.year==2020].groupby(['country', 'time_unit']).count()
tmp = tmp.reset_index()
tmp.head()

: 

: 

In [None]:
# The following shows that there is only one country does not have complete list for the year of 2020
tmp[tmp.year == 8]

: 

: 

# 3. Find average annual excess deaths for the years of 2020 and 2021
------
Since 2022 is not finished yet and only 90 countries have values in 2022 (and even those are not complete yet), let us focus on the average annual excess deaths for 2020 and 2021.
The simplest way to find the excess deaths is:
1. Find the average annual deaths before 2020, i.e. for the years 2015 to 2019.  These are the regular deaths.
- Find the average annual deaths for 2020 and 2021.  These are the deaths from all causes.
- The difference between the deaths from all causes and the regular deaths is the excess deaths.
- If there is no major event causing large-scale deaths, then the excess deaths should be COVID deaths.
- Examples of events causing large-scale deaths are big natural disasters such as earthquakes and tsunamis, pandemics, and wars.  There were no such incidents in 2020 and 2021 other than COVID.  Therefore, these excess deaths must be caused by the COVID pandemic.  The Ukraine war occurred in 2022.

In [None]:
DF_2015_2019 = DF[DF.year < 2020]
DF_2020_2021 = DF[(DF.year == 2020) | (DF.year==2021)]


: 

: 

In [None]:
Regular = DF_2015_2019.groupby('country').mean().reset_index()
print(Regular.head(30))

: 

: 

In [None]:
DF[(DF.country == 'Australia') & (DF.year < 2020)]

: 

: 

In [None]:
DF[(DF.country == 'Australia') & (DF.year < 2022) & (DF.time >= 52)]

: 

: 

### It looks like that:
- When the average of time is 6.5, the country has complete years from month 1 to month 12 (the mean of 1..12 is 6.5).  The annual average deaths should be 12X this average.
- When the average of time is 26.60, the country has complete years from week 1 to week 52 or 53.  If every year had 52 weeks, this value would be 26.5.  Let us assume there are always 52 weeks for simplicity.  The annual average deaths should be very close to 52X this average.

Let us add one column "AverageAnnualUnitCount" to represent the actual number of values in the data for the years we are working on.  The value helps us to know how many values of time_unit we have in the data.  Note that the annual average can still be obtained by multiplying the average in the specified time_unit by 12 or 52, even if some countries do not have complete values for a whole year.


In [None]:
Regular['AverageAnnualUnitCount'] = round(Regular.time * 2 - 1, 0)

: 

: 

In [None]:
Regular.head()

: 

: 

## The above Regular DataFrame does not have "time_unit" because it is not numerical and is dropped when we compute the mean or count.  However, it is still better to know which time_unit each country is reporting.  This time_unit can be inserted back into Regular with the merge method in pandas.

In [None]:
# obtain time_unit for each country.  We can obtain this value just once by specifying 

import copy as copy 
tmp = copy.copy(DF)
# tmp['time_unit'] = [ 12 for x in tmp.time_unit if x == 'monthly' else 52 ]
time_unit_dict = {'monthly': 12, 'weekly': 52}
tmp['AnnualUnitCount'] = [time_unit_dict[x] for x in tmp.time_unit ]
tmp.head()

: 

: 

In [None]:
# We only need one value of AnnualUnitCount for each country.  A quick way to get it is 
tmp = tmp.groupby('country').mean().reset_index()[['country', 'AnnualUnitCount']]
tmp.head()

: 

: 

In [None]:
Regular.head()

: 

: 

In [None]:
Regular = pd.merge(Regular, tmp, on='country')
Regular.head()

: 

: 

In [None]:
# change deaths to annual deaths
Regular['deaths'] = Regular.deaths * Regular.AnnualUnitCount
Regular.head()

: 

: 

In [None]:
# List the column labels currently held by Regular.
Regular.columns

: 

: 

In [None]:
# we don't need everything.  Just select some columns
Regular = Regular[['country', 'deaths', 'AverageAnnualUnitCount']]

: 

: 

In [None]:
Regular.head()

: 

: 

# 4. Now let us do the same process on DF_2020_2021  for AllCauses

In [None]:
Irregular = DF_2020_2021.groupby('country').mean().reset_index()
Irregular['AverageAnnualUnitCount'] = round(Irregular.time * 2 - 1, 0)
Irregular = pd.merge(Irregular, tmp, on='country')
Irregular['deaths'] = Irregular.deaths * Irregular.AnnualUnitCount
Irregular = Irregular[['country', 'deaths', 'AverageAnnualUnitCount']]
Irregular.head()


: 

: 

# 5. Combine Regular and AllCauses
### The first deaths and AverageAnnualUnitCount are for Regular, the 2nd is for AllCauses


In [None]:
newDF = pd.concat([Regular, Irregular[['deaths', 'AverageAnnualUnitCount']]], axis=1)
newDF.columns = ['country', 'RegularDeaths', 'RegularAverageAnnualUnitCount', 'IrregularDeaths', 'IrregularAverageAnnualUnitCount']
newDF.head()

: 

: 

# 6. Get excess deaths


In [None]:
newDF = pd.concat([newDF, (newDF.IrregularDeaths - newDF.RegularDeaths)], axis=1)
newDF = pd.concat([newDF, (newDF.IrregularDeaths - newDF.RegularDeaths) / newDF.RegularDeaths], axis=1)
newDF.columns = ['country', 'RegularDeaths', 'RegularAverageAnnualUnitCount', 'IrregularDeaths', 'IrregularAverageAnnualUnitCount', 'ExcessDeaths', 'ExcessDeathRate']
newDF.head()

: 

: 

# 7. Get population so that we can calculate deaths per million population from OWID dataset

In [None]:
path = join(getcwd().rstrip('src'),
            'data/owid-covid-data.csv').replace('\\', '/')
data = pd.read_csv(path)

: 

: 

In [None]:
data = data[['location', 'population']]
data.rename(columns = {'location':'country'}, inplace=True)
data.head()


: 

: 

In [None]:
data.groupby('country').count().sort_values(by='population', ascending=False)
data.drop_duplicates(subset=['country'], inplace=True)
data.reset_index(drop=True, inplace=True)
print(len(data))
data.head()

: 

: 

# 8. merge population

In [None]:
con1 = list(data.country)
con2 = list(newDF.country)
pop = []
for c in con2:
    if c in con1:
        pop.append(data[data.country == c].population.values[0])
    else:
        pop.append(0)
pop = pd.Series(pop, name='population')
newDF = pd.concat([newDF, pop], axis=1)
newDF.head()

: 

: 

# 9. calculate deaths per million population and sort by it

: 

: 

: 

: 