In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 21 11:33:12 2021

@author: BRSch
"""

import pandas as pd
import numpy as np
from tabulate import tabulate

# Columns kept for the analysis. "population" is deliberately last: later code
# reads it positionally as the final column.
columns_of_interest = [
    "iso_code", "location", "date",
    "total_cases", "new_cases",
    "total_deaths", "new_deaths", "new_deaths_smoothed",
    "total_cases_per_million", "new_cases_per_million",
    "new_cases_smoothed_per_million",
    "total_deaths_per_million", "new_deaths_per_million",
    "new_deaths_smoothed_per_million",
    "reproduction_rate",
    "icu_patients", "icu_patients_per_million",
    "hosp_patients", "hosp_patients_per_million",
    "weekly_icu_admissions", "weekly_icu_admissions_per_million",
    "weekly_hosp_admissions", "weekly_hosp_admissions_per_million",
    "new_tests", "total_tests",
    "total_tests_per_thousand", "new_tests_per_thousand",
    "new_tests_smoothed", "new_tests_smoothed_per_thousand",
    "positive_rate",
    "total_vaccinations", "people_vaccinated", "people_fully_vaccinated",
    "new_vaccinations", "new_vaccinations_smoothed",
    "total_vaccinations_per_hundred", "people_vaccinated_per_hundred",
    "people_fully_vaccinated_per_hundred",
    "new_vaccinations_smoothed_per_million",
    "stringency_index", "population",
]

data = pd.read_csv("owid-covid-data.csv")

# Keep only the rows for Poland (POL) and the USA, restricted to the columns
# listed above.
data = data.loc[data["iso_code"].isin(["POL", "USA"]), columns_of_interest]

# =============================================================================
# Vaccination data preparation
# =============================================================================

def _fill_gaps_with_neighbour_mean(values):
    """Return *values* as a float array with every NaN gap filled in.

    Each missing entry is replaced by the mean of the entry directly before
    it (which may itself have been filled a moment earlier) and the next
    observed entry after it. This assumes the vaccination coverage grows
    roughly steadily over short time spans.

    Boundaries: a gap with no earlier entry takes the next observed value, a
    gap with no later observation takes the previous value. If nothing at all
    is observed the data is returned unchanged.

    Missing entries are detected as NaN, so a genuine 0.0 reading is kept
    as-is rather than being mistaken for a gap.
    """
    filled = np.array(values, dtype=float)
    observed = np.flatnonzero(~np.isnan(filled))
    if observed.size == 0:
        return filled

    # Gaps are visited in ascending order, so filled[i - 1] is always a real
    # number (observed or already filled) by the time it is read.
    for i in np.flatnonzero(np.isnan(filled)):
        neighbours = []
        if i > 0:
            neighbours.append(filled[i - 1])
        # Position (within `observed`) of the first observation after row i.
        next_observed = np.searchsorted(observed, i)
        if next_observed < observed.size:
            neighbours.append(filled[observed[next_observed]])
        filled[i] = np.mean(neighbours)
    return filled


def _prepare_vaccination_data(country_data, first_row):
    """Build the gap-free (date, fully-vaccinated-per-hundred) table.

    country_data: rows of a single country.
    first_row:    positional offset of the first row to keep; everything
                  before it is dropped.

    Returns a new DataFrame with a fresh 0-based index.
    """
    column = 'people_fully_vaccinated_per_hundred'
    prepared = country_data[['date', column]].iloc[first_row:]
    prepared = prepared.reset_index(drop=True)
    prepared[column] = _fill_gaps_with_neighbour_mean(prepared[column])
    return prepared


# Start each dataset at the first non-NaN value, assuming this is the point
# where the country started vaccinating.
# NOTE(review): these offsets are hard-coded row positions; they presumably
# match the CSV snapshot used for the analysis - confirm when the data file
# is refreshed.
USA_FIRST_VACCINATION_ROW = 358
POL_FIRST_VACCINATION_ROW = 320

USA_data = data[data['iso_code'] == 'USA']
USA_vaccination_data = _prepare_vaccination_data(
    USA_data, USA_FIRST_VACCINATION_ROW)

POL_data = data[data['iso_code'] == 'POL']
POL_vaccination_data = _prepare_vaccination_data(
    POL_data, POL_FIRST_VACCINATION_ROW)
        
# =============================================================================
# Cases data preparation
# =============================================================================

# Daily new cases for the USA on a fresh 0-based index.
USA_cases_data = USA_data.loc[:, ['date', 'new_cases']].reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column_name in USA_cases_data.columns:
    is_missing = USA_cases_data[column_name].isna()
    print(USA_cases_data[is_missing])

# Only the very first row (start of the measurements) is NaN, so it is
# treated as zero new cases.
USA_cases_data = USA_cases_data.fillna(0)

# Normalise to cases per million inhabitants for easier interpretation.
# The population is taken from the first USA row.
population_USA = USA_data['population'].iloc[0]
USA_cases_data['new_cases_per_million'] = (
    USA_cases_data['new_cases'] / population_USA * 1000000
)

# Attach the vaccination coverage; only dates present in both tables remain.
USA_cases_data = pd.merge(USA_cases_data, USA_vaccination_data, on='date')


# Daily new cases for Poland on a fresh 0-based index.
POL_cases_data = POL_data.loc[:, ['date', 'new_cases']].reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column_name in POL_cases_data.columns:
    is_missing = POL_cases_data[column_name].isna()
    print(POL_cases_data[is_missing])

# Normalise to cases per million inhabitants for easier interpretation.
# The population is taken from the first Polish row.
population_POL = POL_data['population'].iloc[0]
POL_cases_data = POL_cases_data.assign(
    new_cases_per_million=POL_cases_data['new_cases'] / population_POL * 1000000
)

# Attach the vaccination coverage; only dates present in both tables remain.
POL_cases_data = pd.merge(POL_cases_data, POL_vaccination_data, on='date')

# =============================================================================
# Deaths data preparation
# =============================================================================

# Cumulative and daily deaths for the USA on a fresh 0-based index.
death_columns = ['date', 'total_deaths', 'new_deaths']
USA_deaths_data = USA_data.loc[:, death_columns].reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column_name in USA_deaths_data.columns:
    is_missing = USA_deaths_data[column_name].isna()
    print(USA_deaths_data[is_missing])

# Drop the leading rows without death figures: deaths before the first
# reported value cannot be known.
first_death = 38
USA_deaths_data = USA_deaths_data.iloc[first_death:].reset_index(drop=True)

# Normalised death variables, relative to the USA population:
# cumulative deaths per hundred inhabitants ...
USA_deaths_data['total_deaths_per_hundred'] = (
    USA_deaths_data['total_deaths'] / population_USA * 100
)
# ... and daily deaths per million inhabitants.
USA_deaths_data['new_deaths_per_million'] = (
    USA_deaths_data['new_deaths'] / population_USA * 1000000
)

# Attach the vaccination coverage; only dates present in both tables remain.
USA_deaths_data = pd.merge(USA_deaths_data, USA_vaccination_data, on='date')


# Cumulative and daily deaths for Poland on a fresh 0-based index.
POL_deaths_data = POL_data[['date', 'total_deaths', 'new_deaths']]
POL_deaths_data = POL_deaths_data.reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column in POL_deaths_data.columns:
    print(POL_deaths_data[POL_deaths_data[column].isna()])

# Start the dataset at the first non-NaN value, since we cannot know the
# deaths before this point.
first_death = 8
# reset_index() materialises the slice as its own DataFrame (as the USA
# deaths preparation does), so the column assignments below do not write
# into a slice of another frame (SettingWithCopyWarning).
POL_deaths_data = POL_deaths_data.iloc[first_death:].reset_index(drop=True)

# Normalised death variables, relative to the Polish population:
# cumulative deaths per hundred inhabitants ...
POL_deaths_data['total_deaths_per_hundred'] = (
    POL_deaths_data['total_deaths'] / population_POL * 100
)
# ... and daily deaths per million inhabitants.
POL_deaths_data['new_deaths_per_million'] = (
    POL_deaths_data['new_deaths'] / population_POL * 1000000
)

# Attach the vaccination coverage; only dates present in both tables remain.
POL_deaths_data = POL_deaths_data.merge(POL_vaccination_data, on='date')

# =============================================================================
# ICU patients
# =============================================================================

# ICU occupancy for the USA on a fresh 0-based index.
USA_icu_data = USA_data[['date', 'icu_patients']]
USA_icu_data = USA_icu_data.reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column in USA_icu_data.columns:
    print(USA_icu_data[USA_icu_data[column].isna()])

# Start the dataset at the first non-NaN value, since we cannot know the ICU
# patients before this point.
first_icu = 175
# End the dataset at the last non-NaN value, since we cannot know the ICU
# patients after this point.
# NOTE(review): iloc's stop is exclusive, so row 508 itself is dropped -
# confirm that 508 is one past the last reported row.
last_icu = 508
# reset_index() materialises the slice as its own DataFrame, so the column
# assignment below does not write into a slice of another frame
# (SettingWithCopyWarning).
USA_icu_data = USA_icu_data.iloc[first_icu:last_icu].reset_index(drop=True)

# Normalised ICU variable: ICU patients per million inhabitants, for easy
# interpretation.
USA_icu_data['total_icu_per_million'] = (
    USA_icu_data['icu_patients'] / population_USA * 1000000
)

# Attach the vaccination coverage; only dates present in both tables remain.
USA_icu_data = USA_icu_data.merge(USA_vaccination_data, on='date')

# ICU occupancy for Poland on a fresh 0-based index.
POL_icu_data = POL_data[['date', 'icu_patients']]
POL_icu_data = POL_icu_data.reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column in POL_icu_data.columns:
    print(POL_icu_data[POL_icu_data[column].isna()])

# Start the dataset at the first non-NaN value, since we cannot know the ICU
# patients before this point.
first_icu = 174
# End the dataset at the last non-NaN value, since we cannot know the ICU
# patients after this point.
# NOTE(review): iloc's stop is exclusive, so row 508 itself is dropped -
# confirm that 508 is one past the last reported row.
last_icu = 508
# reset_index() materialises the slice as its own DataFrame, so the column
# assignment below does not write into a slice of another frame
# (SettingWithCopyWarning).
POL_icu_data = POL_icu_data.iloc[first_icu:last_icu].reset_index(drop=True)

# Normalised ICU variable: ICU patients per million inhabitants, for easy
# interpretation.
POL_icu_data['total_icu_per_million'] = (
    POL_icu_data['icu_patients'] / population_POL * 1000000
)

# Attach the vaccination coverage; only dates present in both tables remain.
POL_icu_data = POL_icu_data.merge(POL_vaccination_data, on='date')

# =============================================================================
# Reproduction rate
# =============================================================================

# Reproduction rate for the USA on a fresh 0-based index.
USA_R_data = USA_data.loc[:, ['date', 'reproduction_rate']].reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column_name in USA_R_data.columns:
    is_missing = USA_R_data[column_name].isna()
    print(USA_R_data[is_missing])

# Keep only the window in which a reproduction rate is reported: the rate
# cannot be known before the first or after the last reported value.
first_R, last_R = 43, 510
reported_window = USA_R_data.iloc[first_R:last_R]

# Attach the vaccination coverage; only dates present in both tables remain.
USA_R_data = pd.merge(reported_window, USA_vaccination_data, on='date')


# Reproduction rate for Poland on a fresh 0-based index.
POL_R_data = POL_data.loc[:, ['date', 'reproduction_rate']].reset_index(drop=True)

# Diagnostic: for each column, print the rows where that column is missing.
for column_name in POL_R_data.columns:
    is_missing = POL_R_data[column_name].isna()
    print(POL_R_data[is_missing])

# Keep only the window in which a reproduction rate is reported: the rate
# cannot be known before the first or after the last reported value.
first_R, last_R = 11, 469
reported_window = POL_R_data.iloc[first_R:last_R]

# Attach the vaccination coverage; only dates present in both tables remain.
POL_R_data = pd.merge(reported_window, POL_vaccination_data, on='date')

# Render the prepared USA vaccination table as a Markdown table and print it.
usa_vaccination_markdown = USA_vaccination_data.to_markdown()
print(usa_vaccination_markdown)


Empty DataFrame
Columns: [date, new_cases]
Index: []
         date  new_cases
0  2020-01-22        NaN
Empty DataFrame
Columns: [date, new_cases]
Index: []
Empty DataFrame
Columns: [date, new_cases]
Index: []
Empty DataFrame
Columns: [date, total_deaths, new_deaths]
Index: []
          date  total_deaths  new_deaths
0   2020-01-22           NaN         NaN
1   2020-01-23           NaN         NaN
2   2020-01-24           NaN         NaN
3   2020-01-25           NaN         NaN
4   2020-01-26           NaN         NaN
5   2020-01-27           NaN         NaN
6   2020-01-28           NaN         NaN
7   2020-01-29           NaN         NaN
8   2020-01-30           NaN         NaN
9   2020-01-31           NaN         NaN
10  2020-02-01           NaN         NaN
11  2020-02-02           NaN         NaN
12  2020-02-03           NaN         NaN
13  2020-02-04           NaN         NaN
14  2020-02-05           NaN         NaN
15  2020-02-06           NaN         NaN
16  2020-02-07           N