# Similarity of the Curve Slopes
- **Created by: Andrés Segura Tinoco**
- **Created on: May 19, 2020**
- **Data: Covid 19**

## 1. Read C19 data by country

In [1]:
# # Import libraries
import pandas as pd
import numpy as np
import datetime

In [2]:
# Reading historical data
dataURL = "../data/historical_data.csv"
column_list = ["country", "region", "subregion", "date", "total_cases", "total_deaths", "diff_total_cases", "diff_total_deaths"]
raw_data = pd.read_csv(dataURL, usecols = lambda column : column in column_list)
raw_data

Unnamed: 0,country,region,subregion,date,total_cases,total_deaths,diff_total_cases,diff_total_deaths
0,China,Asia,Eastern Asia,01/22/2020,571,17,0,0
1,Japan,Asia,Eastern Asia,01/22/2020,2,0,0,0
2,China,Asia,Eastern Asia,01/23/2020,830,25,259,8
3,Japan,Asia,Eastern Asia,01/23/2020,2,0,0,0
4,China,Asia,Eastern Asia,01/24/2020,1287,41,457,16
...,...,...,...,...,...,...,...,...
20636,Vietnam,Asia,South-Eastern Asia,05/21/2020,324,0,0,0
20637,Western Sahara,Africa,Northern Africa,05/21/2020,6,0,0,0
20638,Yemen,Asia,Western Asia,05/21/2020,184,30,0,0
20639,Zambia,Africa,Eastern Africa,05/21/2020,832,7,0,0


In [3]:
# Apply data type quality
raw_data["date"] = pd.to_datetime(raw_data["date"])
raw_data.dtypes

country                      object
region                       object
subregion                    object
date                 datetime64[ns]
total_cases                   int64
total_deaths                  int64
diff_total_cases              int64
diff_total_deaths             int64
dtype: object

## 2. Get Country List

In [4]:
today = pd.Timestamp('today').floor('D')
min_total_cases = 1000
min_deaths = 50
country_data = raw_data[(raw_data["date"] >= today) 
                        & (raw_data["total_cases"] >= min_total_cases)
                        & (raw_data["total_deaths"] >= min_deaths)]
country_dict = dict(zip(country_data.country, country_data.region))
len(country_dict)

77

## 3. Calculate Curve Slope by Country

In [5]:
# Filtering data
x_var_name = "total_cases"
y_var_name = "total_deaths"
last_days = 7
top_date = datetime.datetime.today() - datetime.timedelta(days=last_days)
top_date

datetime.datetime(2020, 5, 14, 9, 33, 48, 683704)

In [6]:
# Calculate the curve slope of each country
def calc_curve_slope(raw_data, country_list, var_name, top_date):
    curve_slope = {}

    for country in country_list:
        
        # Filter data by country
        country_data = raw_data[(raw_data["country"] == country) & (raw_data["date"] >= top_date)]

        # Get x and y values
        y_values = np.array(country_data[var_name])
        x_values = np.arange(0, len(y_values))

        # Calculate curve slope
        X = x_values - x_values.mean()
        Y = y_values - y_values.mean()
        slope = (X.dot(Y)) / (X.dot(X))

        curve_slope[country] = slope
    
    return curve_slope

In [7]:
# Calculate X and Y slope by country
x_data_slope = calc_curve_slope(raw_data, country_dict.keys(), x_var_name, top_date)
y_data_slope = calc_curve_slope(raw_data, country_dict.keys(), y_var_name, top_date)

## 4. Select Top N by Variable

In [8]:
# Filter X variable
top_country = 10
output = sorted(x_data_slope, key=x_data_slope.get, reverse=True)[:top_country]

In [9]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, x_data_slope[country]))

country, curve_slope
USA, 19088.321428571428
Brazil, 13203.392857142857
Russia, 8182.107142857143
India, 4697.071428571428
Peru, 3455.1071428571427
Chile, 2585.75
Saudi Arabia, 2366.6785714285716
Mexico, 2096.8928571428573
UK, 2004.75
Iran, 1873.607142857143


In [10]:
# Filter Y variable
output = sorted(y_data_slope, key=y_data_slope.get, reverse=True)[:top_country]

In [11]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, y_data_slope[country]))

country, curve_slope
USA, 1134.9285714285713
Brazil, 718.75
UK, 296.39285714285717
Mexico, 233.85714285714286
Italy, 126.96428571428571
India, 123.03571428571429
Peru, 113.0
Russia, 97.78571428571429
France, 97.75
Canada, 80.03571428571429


## 5. Select Top N by Quadrant

In [12]:
# Quadrant variables
top_country = 10
x_median = np.median(list(x_data_slope.values()))
y_median = np.median(list(y_data_slope.values()))
x_median, y_median

(162.10714285714286, 5.142857142857143)

In [13]:
# Showing data
for country, region in country_dict.items():
    x = x_data_slope[country]
    y = y_data_slope[country]
    if x > 0 and y > 0:
        print("%s,%s,%.2f,%.2f" % (country, region, x, y))

Afghanistan,Asia,383.96,5.32
Algeria,Africa,162.11,5.75
Argentina,Americas,325.32,8.61
Armenia,Asia,222.36,2.61
Australia,Oceania,8.46,0.39
Austria,Europe,39.82,0.93
Bangladesh,Asia,1227.07,16.07
Belarus,Europe,837.00,4.18
Belgium,Europe,232.71,32.82
Bolivia,Americas,235.89,6.89
Bosnia and Herzegovina,Europe,17.11,1.39
Brazil,Americas,13203.39,718.75
Bulgaria,Europe,26.57,2.43
Cameroon,Africa,127.29,1.07
Canada,Americas,973.32,80.04
Chile,Americas,2585.75,26.96
China,Asia,4.61,0.18
Colombia,Americas,616.79,15.25
Croatia,Europe,2.21,0.21
Czechia,Europe,58.89,1.68
Denmark,Europe,57.61,2.75
Dominican Republic,Americas,316.32,4.11
DRC,Africa,72.32,1.18
Ecuador,Americas,546.86,49.46
Egypt,Africa,545.64,15.32
Estonia,Europe,5.32,0.21
Finland,Europe,36.11,1.79
France,Europe,423.82,97.75
Germany,Europe,508.57,51.32
Greece,Europe,6.71,1.00
Honduras,Americas,91.93,2.21
Hungary,Europe,30.00,5.14
India,Asia,4697.07,123.04
Indonesia,Asia,478.18,31.32
Iran,Asia,1873.61,52.36
Iraq,Asia,97.43,3.04
Ire

<hr>
<p><a href="https://github.com/ansegura7/WebScraping_Covid19">« Home</a></p>