# Similarity of the Curve Slopes
- **Created by: Andrés Segura Tinoco**
- **Created on: May 19, 2020**
- **Data: Covid 19**

## 1. Read C19 data by country

In [1]:
# # Import libraries
import pandas as pd
import numpy as np
import datetime

In [2]:
# Reading historical data
dataURL = "../data/historical_data.csv"
column_list = ["country", "region", "subregion", "date", "total_cases", "total_deaths", "diff_total_cases", "diff_total_deaths"]
raw_data = pd.read_csv(dataURL, usecols = lambda column : column in column_list)
raw_data

Unnamed: 0,country,region,subregion,date,total_cases,total_deaths,diff_total_cases,diff_total_deaths
0,China,Asia,Eastern Asia,01/22/2020,571,17,0,0
1,Japan,Asia,Eastern Asia,01/22/2020,2,0,0,0
2,China,Asia,Eastern Asia,01/23/2020,830,25,259,8
3,Japan,Asia,Eastern Asia,01/23/2020,2,0,0,0
4,China,Asia,Eastern Asia,01/24/2020,1287,41,457,16
...,...,...,...,...,...,...,...,...
22127,Vietnam,Asia,South-Eastern Asia,05/28/2020,327,0,0,0
22128,Western Sahara,Africa,Northern Africa,05/28/2020,9,1,0,0
22129,Yemen,Asia,Western Asia,05/28/2020,256,53,0,0
22130,Zambia,Africa,Eastern Africa,05/28/2020,1057,7,0,0


In [3]:
# Apply data type quality
raw_data["date"] = pd.to_datetime(raw_data["date"])
raw_data.dtypes

country                      object
region                       object
subregion                    object
date                 datetime64[ns]
total_cases                   int64
total_deaths                  int64
diff_total_cases              int64
diff_total_deaths             int64
dtype: object

## 2. Get Country List

In [4]:
today = pd.Timestamp('today').floor('D')
min_total_cases = 1000
min_deaths = 50
country_data = raw_data[(raw_data["date"] >= today) &
                        (raw_data["total_cases"] >= min_total_cases) &
                        (raw_data["total_deaths"] >= min_deaths)]
country_dict = dict(zip(country_data.country, country_data.region))
len(country_dict)

80

## 3. Calculate Curve Slope by Country

In [5]:
# Calculate the curve slope of each country
def calc_curve_slope(raw_data, country_list, var_name, top_date, norm=False):
    curve_slope = {}

    for country in country_list:
        
        # Filter data by country
        country_fulldata = raw_data[raw_data["country"] == country]
        country_data = country_fulldata[country_fulldata["date"] >= top_date]
        
        # Get x and y values
        y_values = np.array(country_data[var_name])
        x_values = np.arange(0, len(y_values))
        
        # Normalize curves
        if norm:
            y_max = max(np.array(country_fulldata[var_name]))
            if y_max > 0:
                y_values = y_values / y_max
            else:
                print('Error with country %s, max value is zero for %s.' % (country, var_name))
        
        # Calculate curve slope
        X = x_values - x_values.mean()
        Y = y_values - y_values.mean()
        slope = (X.dot(Y)) / (X.dot(X))
        
        curve_slope[country] = slope
    
    return curve_slope

In [6]:
# Filtering data
x_var_name = "total_cases"
y_var_name = "total_deaths"
norm = False
last_days = 7
top_date = datetime.datetime.today() - datetime.timedelta(days=last_days)
top_date

datetime.datetime(2020, 5, 21, 10, 19, 4, 846649)

In [7]:
# Calculate X and Y slope by country
x_data_slope = calc_curve_slope(raw_data, country_dict.keys(), x_var_name, top_date, norm)
y_data_slope = calc_curve_slope(raw_data, country_dict.keys(), y_var_name, top_date, norm)

## 4. Select Top N by Variable

In [8]:
# Filter X variable
top_country = 10
output = sorted(x_data_slope, key=x_data_slope.get, reverse=True)[:top_country]

In [9]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, x_data_slope[country]))

country, curve_slope
USA, 18238.0
Brazil, 14543.82142857143
Russia, 8759.5
India, 6089.464285714285
Peru, 4382.678571428572
Chile, 3712.3928571428573
Mexico, 2741.5
UK, 2320.535714285714
Saudi Arabia, 2083.0
Iran, 2019.3214285714287


In [10]:
# Filter Y variable
output = sorted(y_data_slope, key=y_data_slope.get, reverse=True)[:top_country]

In [11]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, y_data_slope[country]))

country, curve_slope
Brazil, 809.0714285714286
USA, 782.9642857142857
Mexico, 300.0
UK, 179.5
India, 148.35714285714286
Russia, 146.60714285714286
Peru, 134.60714285714286
Canada, 95.78571428571429
Italy, 79.0
France, 57.57142857142857


## 5. Select Top N by Quadrant

In [12]:
# Quadrant variables
top_country = 10
x_median = np.median(list(x_data_slope.values()))
y_median = np.median(list(y_data_slope.values()))
x_median, y_median

(208.48214285714283, 5.839285714285714)

In [13]:
# Showing data
for country, region in country_dict.items():
    x = x_data_slope[country]
    y = y_data_slope[country]
    if norm or (x > 0 and y > 0):
        print("%s,%s,%.4f,%.4f" % (country, region, x, y))

Afghanistan,Asia,629.4643,4.0714
Algeria,Africa,167.7143,7.2143
Argentina,Americas,577.2857,12.2500
Armenia,Asia,376.7500,6.0357
Australia,Oceania,7.9643,0.2143
Austria,Europe,30.0000,4.0714
Azerbaijan,Asia,128.2857,1.3214
Bangladesh,Asia,1639.8929,21.6786
Belarus,Europe,926.7857,4.8571
Belgium,Europe,212.1786,29.8571
Bolivia,Americas,398.0714,9.0714
Bosnia and Herzegovina,Europe,13.3214,2.1786
Brazil,Americas,14543.8214,809.0714
Bulgaria,Europe,15.5357,1.4643
Cameroon,Africa,204.5000,3.2143
Canada,Americas,928.9286,95.7857
Chile,Americas,3712.3929,37.7500
Colombia,Americas,878.6071,21.7143
Croatia,Europe,0.2857,0.5357
Cuba,Americas,11.0357,0.1786
Czechia,Europe,48.4643,0.8214
Denmark,Europe,46.2857,1.0714
Dominican Republic,Americas,295.2500,3.4286
DRC,Africa,123.1786,1.1786
Ecuador,Americas,396.9286,39.6429
Egypt,Africa,694.1786,18.6429
Estonia,Europe,6.4643,0.3929
Finland,Europe,32.6786,1.4286
France,Europe,111.0000,57.5714
Germany,Europe,431.7500,37.4643
Greece,Europe,5.5357,0.6429

<hr>
<p><a href="https://github.com/ansegura7/WebScraping_Covid19">« Home</a></p>