# Covid 19 Curve Slope 
- **Created by Andrés Segura Tinoco**
- **Created on May 19, 2020**

## 1. Read C19 data by country

In [1]:
# # Import libraries
import pandas as pd
import numpy as np
import datetime

In [2]:
# Reading historical data
dataURL = "../data/historical_data.csv"
column_list = ["country", "region", "subregion", "date", "total_cases", "total_deaths", "active_cases", "total_tests"]
raw_data = pd.read_csv(dataURL, usecols = lambda column : column in column_list)
raw_data

Unnamed: 0,country,region,subregion,date,total_cases,total_deaths,active_cases,total_tests
0,China,Asia,Eastern Asia,01/22/2020,571,17,554,0
1,Japan,Asia,Eastern Asia,01/22/2020,2,0,2,0
2,China,Asia,Eastern Asia,01/23/2020,830,25,771,0
3,Japan,Asia,Eastern Asia,01/23/2020,2,0,2,0
4,China,Asia,Eastern Asia,01/24/2020,1287,41,1208,0
...,...,...,...,...,...,...,...,...
20423,Vietnam,Asia,South-Eastern Asia,05/20/2020,324,0,61,275000
20424,Western Sahara,Africa,Northern Africa,05/20/2020,6,0,0,0
20425,Yemen,Asia,Western Asia,05/20/2020,167,28,134,120
20426,Zambia,Africa,Eastern Africa,05/20/2020,832,7,628,19014


In [3]:
# Apply data type quality
raw_data["date"] = pd.to_datetime(raw_data["date"])
raw_data.dtypes

country                 object
region                  object
subregion               object
date            datetime64[ns]
total_cases              int64
total_deaths             int64
active_cases             int64
total_tests              int64
dtype: object

## 2. Get Country List

In [4]:
today = pd.Timestamp('today').floor('D')
min_total_cases = 1000
min_deaths = 50
country_data = raw_data[(raw_data["date"] >= today) 
                        & (raw_data["total_cases"] >= min_total_cases)
                        & (raw_data["total_deaths"] >= min_deaths)]
country_dict = dict(zip(country_data.country, country_data.region))
len(country_dict)

77

## 3. Calculate Curve Slope by Country

In [5]:
# Filtering data
x_var_name = "total_cases"
y_var_name = "total_deaths"
last_days = 7
top_date = datetime.datetime.today() - datetime.timedelta(days=last_days)
top_date

datetime.datetime(2020, 5, 13, 9, 56, 57, 983450)

In [6]:
# Calculate the curve slope of each country
def calc_curve_slope(raw_data, country_list, var_name, top_date):
    curve_slope = {}

    for country in country_list:
        
        # Filter data by country
        country_data = raw_data[(raw_data["country"] == country) & (raw_data["date"] >= top_date)]

        # Get x and y values
        y_values = np.array(country_data[var_name])
        x_values = np.arange(0, len(y_values))

        # Calculate curve slope
        X = x_values - x_values.mean()
        Y = y_values - y_values.mean()
        slope = (X.dot(Y)) / (X.dot(X))

        curve_slope[country] = slope
    
    return curve_slope

In [7]:
# Calculate X and Y slope by country
x_data_slope = calc_curve_slope(raw_data, country_dict.keys(), x_var_name, top_date)
y_data_slope = calc_curve_slope(raw_data, country_dict.keys(), y_var_name, top_date)

## 4. Select Top N by Variable

In [8]:
# Filter X variable
top_country = 10
output = sorted(x_data_slope, key=x_data_slope.get, reverse=True)[:top_country]

In [9]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, x_data_slope[country]))

country, curve_slope
USA, 19952.85714285714
Brazil, 11828.75
Russia, 9364.67857142857
India, 4617.785714285715
Peru, 3321.6071428571427
UK, 2766.4285714285716
Saudi Arabia, 2632.6071428571427
Chile, 2225.785714285714
Mexico, 2084.6428571428573
Iran, 2045.857142857143


In [10]:
# Filter Y variable
output = sorted(y_data_slope, key=y_data_slope.get, reverse=True)[:top_country]

In [11]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, y_data_slope[country]))

country, curve_slope
USA, 1141.642857142857
Brazil, 685.1785714285714
UK, 292.75
Mexico, 201.85714285714286
Italy, 134.46428571428572
India, 121.25
France, 121.10714285714286
Peru, 116.10714285714286
Russia, 108.0
Canada, 77.96428571428571


## 5. Select Top N by Quadrant

In [12]:
# Quadrant variables
top_country = 10
x_median = np.median(list(x_data_slope.values()))
y_median = np.median(list(y_data_slope.values()))
x_median, y_median

(187.85714285714286, 5.678571428571429)

In [13]:
# Showing data
for country, region in country_dict.items():
    x = x_data_slope[country]
    y = y_data_slope[country]
    if x > 0 and y > 0:
        print("%s,%s,%.2f,%.2f" % (country, region, x, y))

Afghanistan,Asia,406.71,7.43
Algeria,Africa,167.18,5.68
Argentina,Americas,294.68,7.71
Armenia,Asia,241.68,3.00
Australia,Oceania,13.64,0.39
Austria,Europe,49.18,1.04
Bangladesh,Asia,1307.57,17.43
Belarus,Europe,943.18,4.75
Belgium,Europe,283.89,39.79
Bolivia,Americas,223.00,7.14
Bosnia and Herzegovina,Europe,20.25,2.07
Brazil,Americas,11828.75,685.18
Bulgaria,Europe,31.36,2.71
Cameroon,Africa,107.04,0.11
Canada,Americas,1012.11,77.96
Chile,Americas,2225.79,25.36
China,Asia,5.61,0.21
Colombia,Americas,598.89,15.29
Croatia,Europe,2.25,0.29
Czechia,Europe,57.46,1.61
Denmark,Europe,65.29,3.00
Dominican Republic,Americas,331.86,3.46
DRC,Africa,79.00,1.96
Ecuador,Americas,611.93,75.14
Egypt,Africa,482.93,15.39
Estonia,Europe,6.14,0.32
Finland,Europe,47.50,2.50
France,Europe,320.89,121.11
Germany,Europe,512.14,47.25
Greece,Europe,10.25,1.43
Honduras,Americas,118.68,3.79
Hungary,Europe,35.50,5.93
India,Asia,4617.79,121.25
Indonesia,Asia,519.07,35.32
Iran,Asia,2045.86,55.04
Iraq,Asia,90.50,2.9

<hr>
<p><a href="https://github.com/ansegura7/WebScraping_Covid19">« Home</a></p>