### Import libraries

In [None]:
import pandas as pd
import numpy as np

### Load csv data from git repo

Clone the official Johns Hopkins University git repository with data from here: https://github.com/CSSEGISandData/COVID-19 	
**Pay attention to where your Johns Hopkins data is located and modify the path accordingly.**

In [None]:
# Confirmed-cases time series from the cloned Johns Hopkins repository.
# The path is relative to the notebook's working directory.
confirmed_csv_path = "johnshopkins_data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
df = pd.read_csv(confirmed_csv_path)
df.head()

### Import plot libs and chart Germany and Italy cases starting from 1/22/2020

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set()

# One curve per country, labelled with the country name so the legend can
# tell them apart. `.iloc[0, 4:]` takes the first matching row and skips the
# four leading non-date columns.
for country in ('Italy', 'Germany'):
    plt.plot(df[df['Country/Region'] == country].iloc[0, 4:], '-', label=country)

plt.legend()


### Select Europe by coordinates, plot all European countries from 1/22/2020

In [None]:
# Rough bounding box for Europe: latitude above 30, longitude between -10 and 40.
in_europe = (df['Lat'] > 30) & (df['Long'] > -10) & (df['Long'] < 40)

# Collapse all rows of a country into one by summing them.
# numeric_only=True keeps text columns out of the aggregation, so the layout
# after reset_index is always: Country/Region, Lat, Long, then the date columns.
# Without it, recent pandas releases keep such columns and the positional
# slices below would no longer start at the first date.
# Note: Lat and Long are summed as well, so they are not real coordinates afterwards.
europe = df[in_europe].groupby(['Country/Region']).sum(numeric_only=True)
n_countries = len(europe)

europe.reset_index(inplace=True)

plt.figure(figsize=(10,5))

# Everything after Country/Region, Lat and Long is a daily case count.
date_columns = europe.columns[3:]
daily_cases = europe[date_columns]  # sliced once instead of once per country
x = range(0, len(date_columns))

for i in range(n_countries):
    plt.plot(x, daily_cases.loc[i])


plt.title('COVID-19')
plt.xlabel('Days from 1/22/20')
plt.ylabel('Cases')


### Plot all European countries on logarithmic scale

In [None]:
# Same per-country curves as the linear chart, drawn on a logarithmic y axis.
fig = plt.figure(figsize=(15, 25))
myplot = fig.add_subplot(211)  # first slot of a 2-row, 1-column grid
myplot.set_yscale('log')

# Columns from position 3 onwards are plotted; x is their running number.
first_row_counts = europe.iloc[0, 3:]
x = range(0, len(first_row_counts))
plotted_columns = europe[europe.columns[3:]]

for row_label in range(n_countries):
    myplot.plot(x, plotted_columns.loc[row_label])

myplot.set_title('COVID-19')
myplot.set_xlabel('Days from 1/22/20')
myplot.set_ylabel('Cases')


### Transpose the matrix

In [None]:
# Flip rows and columns: the former column labels become the row labels.
europeT = europe.transpose()

In [None]:
# Show the first 8 rows of the transposed frame.
europeT.head(8)

### Drop coordinates

In [None]:
# Build the coordinate-free frame directly from `europe` instead of mutating
# europeT in place: an in-place drop raises KeyError when this cell is run a
# second time, whereas this form gives the same result on every run.
europeT = europe.T.drop(index=['Lat', 'Long'])


In [None]:
# Preview europeT after the 'Lat' and 'Long' rows were dropped.
europeT.head()

### Reset index

In [None]:
# Move the current row labels into a regular column and number the rows from 0.
europeT = europeT.reset_index()

In [None]:
# Preview europeT after reset_index moved the old row labels into a column.
europeT.head()

### Set the first row with countries as columns' names

In [None]:
# Row 0 holds the header values; use them as the column names.
header_row = europeT.loc[0]
europeT.columns = header_row

In [None]:
# Preview europeT with the column names taken from row 0.
europeT.head()

### Drop the first row with countries

In [None]:
# Row 0 duplicates the column names now, so remove it.
europeT = europeT.drop(index=0)

In [None]:
# Preview europeT after row 0 was dropped.
europeT.head()

### Drop 'Country/Region' column

In [None]:
# Remove the 'Country/Region' column, keeping only the per-country columns.
europeT = europeT.drop(columns='Country/Region')

In [None]:
# Preview europeT after the 'Country/Region' column was dropped.
europeT.head()

### Generate 30% growth trendline
We do this to have a reference 30% growth trendline in the chart. We will use it for visual reference when analyzing cases by country.

In [None]:
# Reference curve: begins at `start` and grows by `growth_rate` percent per
# step until it has as many points as europeT has rows (always at least one).
start = 50
growth_rate = 30 # %
trendline = [start]
while len(trendline) < len(europeT):
    previous_value = trendline[-1]
    trendline.append(previous_value * (100 + growth_rate)/100)

### Plot all European countries on logarithmic scale, marking Germany with circles and Italy with dots. 0 on the X axis corresponds to the first day with more than 50 recorded cases in the country.

In [None]:
fig = plt.figure(figsize=(15,25))
myplot = fig.add_subplot(2, 1, 1)
myplot.set_yscale('log')

min_reported_cases = 50

x = range(0, len(europeT))

# Plot styles for the highlighted countries; every other country is a plain line.
highlight_styles = {'Germany': 'o', 'Italy': '.'}

for country in europeT:
    cases = pd.to_numeric(europeT[country])
    # Keep only the days above the threshold and renumber them from 0, so each
    # curve starts at that country's own "day 0".
    country_surged = cases[cases > min_reported_cases].reset_index(drop=True)
    # Pad the tail with NaN so every curve has one value per entry of x.
    # This replaces Series.append, which no longer exists in pandas 2.x.
    padded = country_surged.reindex(x)
    myplot.plot(x, padded, highlight_styles.get(country, '-'), label=country)

myplot.plot(trendline, 'x', label="30% trendline")

plt.title('COVID-19')
plt.xlabel(f'Days from the first {min_reported_cases}')
plt.ylabel('Cases')
plt.legend(loc='lower right')

### Plot case growth in Germany from the first case

In [None]:
fig = plt.figure(figsize=(15,25))
myplot = fig.add_subplot(2, 1, 1)
myplot.set_yscale('log')

x = range(0, len(europeT))

# Only Germany is drawn here, so select its column directly instead of
# looping over every country and skipping all but one.
country = 'Germany'
cases = pd.to_numeric(europeT[country])
# Keep the days with at least one case and renumber them from 0.
country_surged = cases[cases > 0].reset_index(drop=True)
# Pad the tail with NaN so the curve has one value per entry of x.
# This replaces Series.append, which no longer exists in pandas 2.x.
myplot.plot(x, country_surged.reindex(x), 'o', label=country)

myplot.plot(trendline, 'x', label="30% trendline")

plt.title('COVID-19')
plt.xlabel('Days from first registered case')
plt.ylabel('Cases')
plt.legend(loc='lower right')

## Lockdown forecast based on ratio of population to reported cases

For the next steps we need population data for countries worldwide. Download it from the UN website using the following link:
https://population.un.org/wpp/Download/Standard/CSV/ 	
We need "Total Population All variants (CSV, 21.35 MB)" file. Direct link to file: https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_TotalPopulationBySex.csv 	
Modify the path to the file accordingly.

In [None]:
# UN CSV path, relative to the notebook's working directory.
population_csv_path = "WPP2019_TotalPopulationBySex.csv"
df_pop = pd.read_csv(population_csv_path)
df_pop.head()

### Data cleaning	
We drop everything except the latest population.

In [None]:
# Columns that play no part in the rest of the analysis.
unused_columns = ['VarID', 'PopFemale', 'PopMale', 'Variant', 'LocID', 'PopDensity', 'MidPeriod']
df_pop = df_pop.drop(columns=unused_columns)
df_pop.head()

In [None]:
# Keep only the rows whose Time value is 2020.
is_2020 = df_pop['Time'] == 2020
df_pop = df_pop[is_2020]

We aggregate different regions of the same country and different estimations to one. Our goal is: One country - One number. We use Median as an aggregation function.

In [None]:
# One row per Location: the median over all of that location's rows.
rows_by_location = df_pop.groupby('Location')
df_pop = rows_by_location.median()

In [None]:
# Every remaining row is for 2020, so the Time column carries no information.
df_pop = df_pop.drop(columns='Time')

In [None]:
# Preview the per-location table after the 'Time' column was dropped.
df_pop.head()

### Define baseline when a total country lockdown (curfew) was implemented in Italy.	
This will give us a ratio of registered COVID-19 cases to population in the country.

In [None]:
from datetime import datetime, timedelta
# 2/20/2020 - 50+ cases
# 3/10/2020 - lockdown nationwide, with 10149 cases

# Reported cases in Italy on the day the nationwide lockdown started.
baseline_cases = 10149
baseline_date = datetime.strptime('3/10/2020', '%m/%d/%Y')

# Cases per unit of population in Italy on the lockdown day.
# df_pop.loc['Italy'] is a label-indexed row, so the first value is taken
# with .iloc[0]; a bare [0] is a positional lookup on a labelled Series,
# which recent pandas releases deprecate.
baseline_ratio = (baseline_cases / df_pop.loc['Italy']).iloc[0]

Assuming that governments across the world behave similarly (sometimes they don't), we can estimate when another country will get similar restrictions as in Italy.	
	
Take the latest known case count for the country as `today`:
```python
today = europeT[country_name].iloc[-1]
```
Identify the current ratio of registered cases to population. If you get an error, it is probably because the country you specified is named differently in the UN dataset. For example, China could be called People's Republic of China.
```python
ratio = (today / df_pop.loc[country_name])[0]
```
We take the last known date from the dataset for the selected country
```python
lockdown_date = datetime.strptime(df.columns[-1], '%m/%d/%y')
```
Then we keep updating the variables for as long as the ratio stays below Italy's ratio on the day of its lockdown. On each iteration ```today``` is extrapolated one day forward assuming 30% daily growth.

In [None]:
country_name = 'Germany'

# Latest known case count for the country (last row of its column).
today = europeT[country_name].iloc[-1]

# The population does not change during the extrapolation, so look it up once.
# .iloc[0] takes the first value of the label-indexed row; a bare [0] is a
# positional lookup that recent pandas releases deprecate.
population = df_pop.loc[country_name].iloc[0]
ratio = today / population

# The last column of the case table is the most recent date in the dataset.
lockdown_date = datetime.strptime(df.columns[-1], '%m/%d/%y')

# Multiplying zero cases by a growth factor never reaches the baseline,
# so the loop below would run forever.
if today <= 0:
    raise ValueError(f'No reported cases for {country_name}; cannot extrapolate growth')

# Advance one day at a time, growing the case count by growth_rate percent,
# until the cases-to-population ratio reaches the Italian baseline.
while ratio < baseline_ratio:
    today = today * (100 + growth_rate)/100
    ratio = today / population
    lockdown_date += timedelta(days=1)
    print(lockdown_date.date(), today)


Our result. If the loop above printed nothing, or the date below is simply the last date in the dataset, the country has probably already reached Italy's lockdown ratio, i.e. the estimated lockdown date is in the past. Check the output of the loop above.

In [None]:
# Report the date reached by the extrapolation loop for the chosen country.
print(f'Lockdown date for {country_name}', lockdown_date.date())