In [25]:
import os
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import sys

# Upload Data

In [26]:
sys.path.append('./')

In [37]:
PATH_ROOT_INPUT_DATA = r'data\input\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports'
PATH_ROOT_INTERIM_DATA = 'data\\interim'
files = []
for r, d, f in os.walk(PATH_ROOT_INPUT_DATA):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))

dfs_list = []
dfs_after_list = []
for file in files:
    date = file.split("\\")[-1]
    df = pd.read_csv(os.path.join(file), sep=",")
    df["Date"] = date.split(".")[0]
    if "Country_Region" in df.keys():
        dfs_after_list.append(df)
    else:
        dfs_list.append(df)
        
covid19_data_type_1 = pd.concat(dfs_list, axis=0, ignore_index=True, sort=True)
covid19_data_type_2 = pd.concat(dfs_after_list, axis=0, ignore_index=True, sort=True)

# Type 1 Data

In [38]:
covid19_data_type_1.sort_values("Last Update").head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,Mainland China,01-22-2020,,1/22/2020 17:00,,,Anhui,
21,1.0,Mainland China,01-22-2020,,1/22/2020 17:00,,,Ningxia,
22,,Mainland China,01-22-2020,,1/22/2020 17:00,,,Qinghai,
23,,Mainland China,01-22-2020,,1/22/2020 17:00,,,Shaanxi,
24,2.0,Mainland China,01-22-2020,,1/22/2020 17:00,,,Shandong,


In [39]:
covid19_data_type_1.sort_values("Last Update").tail()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
7361,525.0,US,03-21-2020,1.0,2020-03-21T23:13:18,42.2302,-71.5301,Massachusetts,0.0
7352,659.0,US,03-21-2020,13.0,2020-03-21T23:13:18,27.7663,-81.6868,Florida,0.0
7584,1.0,Cape Verde,03-21-2020,0.0,2020-03-21T23:43:02,15.1111,-23.6167,,0.0
7601,1.0,Uganda,03-21-2020,0.0,2020-03-21T23:43:02,1.0,32.0,,0.0
7597,1.0,Papua New Guinea,03-21-2020,0.0,2020-03-21T23:43:02,-6.315,143.9555,,0.0


In [40]:
covid19_data_type_1["Last Update"] = pd.to_datetime(covid19_data_type_1["Last Update"])

In [41]:
covid19_data_type_1[covid19_data_type_1["Country/Region"] == "Mainland China"].sort_values("Date")

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,Mainland China,01-22-2020,,2020-01-22 17:00:00,,,Anhui,
34,10.0,Mainland China,01-22-2020,,2020-01-22 17:00:00,,,Zhejiang,
33,1.0,Mainland China,01-22-2020,,2020-01-22 17:00:00,,,Yunnan,
32,,Mainland China,01-22-2020,,2020-01-22 17:00:00,,,Xinjiang,
30,,Mainland China,01-22-2020,,2020-01-22 17:00:00,,,Tibet,
...,...,...,...,...,...,...,...,...,...
4525,935.0,Mainland China,03-10-2020,1.0,2020-03-10 01:33:02,27.6140,115.7221,Jiangxi,927.0
4925,0.0,Mainland China,03-11-2020,0.0,2020-03-11 02:18:28,36.0611,103.8343,Gansu,0.0
4926,0.0,Mainland China,03-11-2020,0.0,2020-03-11 02:18:29,38.0428,114.5149,Hebei,0.0
5146,0.0,Mainland China,03-12-2020,0.0,2020-03-11 02:18:28,36.0611,103.8343,Gansu,0.0


In [42]:
covid19_data_type_1[covid19_data_type_1["Country/Region"] == "China"].sort_values("Date")

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
4719,67773.0,China,03-11-2020,3046.0,2020-03-11 10:53:02,30.9756,112.2707,Hubei,49134.0
4902,1.0,China,03-11-2020,0.0,2020-03-11 02:18:14,31.6927,88.0924,Tibet,1.0
4834,10.0,China,03-11-2020,0.0,2020-03-11 18:52:03,22.1667,113.5500,Macau,10.0
4819,18.0,China,03-11-2020,0.0,2020-03-11 02:18:14,35.7452,95.9956,Qinghai,18.0
4776,75.0,China,03-11-2020,0.0,2020-03-11 09:33:12,37.2692,106.1655,Ningxia,72.0
...,...,...,...,...,...,...,...,...,...
7365,504.0,China,03-21-2020,8.0,2020-03-21 05:13:04,40.1824,116.4142,Beijing,396.0
7366,484.0,China,03-21-2020,13.0,2020-03-20 02:13:46,47.8620,127.7615,Heilongjiang,463.0
7378,380.0,China,03-21-2020,3.0,2020-03-21 04:43:06,31.2020,121.4491,Shanghai,327.0
7342,935.0,China,03-21-2020,1.0,2020-03-12 02:13:04,27.6140,115.7221,Jiangxi,934.0


## they are the same, but they changed the name

In [43]:
covid19_data_type_1.replace({"Mainland China": "China"}, inplace=True)

In [44]:
covid19_data_type_1["Confirmed"].fillna(0, inplace=True)
covid19_data_type_1["Deaths"].fillna(0, inplace=True)
covid19_data_type_1["Recovered"].fillna(0, inplace=True)

In [45]:
covid19_data_type_1.head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Anhui,0.0
1,14.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Beijing,0.0
2,6.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Chongqing,0.0
3,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Fujian,0.0
4,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Gansu,0.0


In [46]:
covid19_data_type_1.sort_values("Date")

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Anhui,0.0
21,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Ningxia,0.0
22,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Qinghai,0.0
23,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Shaanxi,0.0
24,2.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Shandong,0.0
...,...,...,...,...,...,...,...,...,...
7410,178.0,Slovakia,03-21-2020,1.0,2020-03-21 20:43:02,48.6690,19.6990,,0.0
7409,181.0,Canada,03-21-2020,5.0,2020-03-21 21:13:30,52.9399,-73.5491,Quebec,0.0
7408,187.0,Lebanon,03-21-2020,4.0,2020-03-21 12:13:19,33.8547,35.8623,,4.0
7415,168.0,China,03-21-2020,6.0,2020-03-16 14:38:45,19.1959,109.7453,Hainan,161.0


# Type 2 Data

In [47]:
covid19_data_type_2.sort_values("Date")

Unnamed: 0,Active,Admin2,Combined_Key,Confirmed,Country_Region,Date,Deaths,FIPS,Last_Update,Lat,Long_,Province_State,Recovered
0,0,New York City,"New York City, New York, US",9654,US,03-22-2020,63,36061.0,3/22/20 23:45,40.767273,-73.971526,New York,0
2272,0,Otoe,"Otoe, Nebraska, US",0,US,03-22-2020,0,31131.0,3/22/20 23:45,40.648436,-96.133741,Nebraska,0
2273,0,Pawnee,"Pawnee, Nebraska, US",0,US,03-22-2020,0,31133.0,3/22/20 23:45,40.131411,-96.237054,Nebraska,0
2274,0,Perkins,"Perkins, Nebraska, US",0,US,03-22-2020,0,31135.0,3/22/20 23:45,40.850825,-101.650294,Nebraska,0
2275,0,Phelps,"Phelps, Nebraska, US",0,US,03-22-2020,0,31137.0,3/22/20 23:45,40.511560,-99.414617,Nebraska,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47316,0,Hancock,"Hancock, Mississippi, US",30,US,04-06-2020,1,28045.0,2020-04-06 23:22:15,30.418302,-89.488510,Mississippi,0
47317,0,Hancock,"Hancock, Ohio, US",14,US,04-06-2020,0,39063.0,2020-04-06 23:22:15,41.002505,-83.668389,Ohio,0
47318,0,Hancock,"Hancock, Tennessee, US",0,US,04-06-2020,0,47067.0,2020-04-06 23:22:15,36.526929,-83.223758,Tennessee,0
47310,0,Hampton,"Hampton, South Carolina, US",3,US,04-06-2020,0,45049.0,2020-04-06 23:22:15,32.774196,-81.138456,South Carolina,0


In [48]:
covid19_data_type_2.head()

Unnamed: 0,Active,Admin2,Combined_Key,Confirmed,Country_Region,Date,Deaths,FIPS,Last_Update,Lat,Long_,Province_State,Recovered
0,0,New York City,"New York City, New York, US",9654,US,03-22-2020,63,36061.0,3/22/20 23:45,40.767273,-73.971526,New York,0
1,0,Nassau,"Nassau, New York, US",1900,US,03-22-2020,4,36059.0,3/22/20 23:45,40.740665,-73.589419,New York,0
2,0,Westchester,"Westchester, New York, US",1873,US,03-22-2020,0,36119.0,3/22/20 23:45,41.162784,-73.757417,New York,0
3,0,Suffolk,"Suffolk, New York, US",1034,US,03-22-2020,9,36103.0,3/22/20 23:45,40.883201,-72.801217,New York,0
4,0,Rockland,"Rockland, New York, US",455,US,03-22-2020,1,36087.0,3/22/20 23:45,41.150279,-74.025605,New York,0


In [49]:
covid19_data_type_2.drop(['FIPS', 'Admin2', 'Combined_Key', 'Active'], inplace=True, axis=1)

In [50]:
covid19_data_type_2.rename(columns={"Country_Region":"Country/Region",
                                    "Province_State":"Province/State",
                                    "Lat":"Latitude", "Long_":"Longitude",
                                    "Last_Update": "Last Update"},
                           inplace=True)

In [51]:
covid19_data_type_2.head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,9654,US,03-22-2020,63,3/22/20 23:45,40.767273,-73.971526,New York,0
1,1900,US,03-22-2020,4,3/22/20 23:45,40.740665,-73.589419,New York,0
2,1873,US,03-22-2020,0,3/22/20 23:45,41.162784,-73.757417,New York,0
3,1034,US,03-22-2020,9,3/22/20 23:45,40.883201,-72.801217,New York,0
4,455,US,03-22-2020,1,3/22/20 23:45,41.150279,-74.025605,New York,0


In [52]:
covid19_data_type_2.tail()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
49182,254,West Bank and Gaza,04-06-2020,1,2020-04-06 23:21:55,31.9522,35.2332,,24
49183,4,Western Sahara,04-06-2020,0,2020-04-06 23:21:55,24.2155,-12.8858,,0
49184,39,Zambia,04-06-2020,1,2020-04-06 23:21:55,-13.133897,27.849332,,5
49185,10,Zimbabwe,04-06-2020,1,2020-04-06 23:21:55,-19.015438,29.154857,,0
49186,0,US,04-06-2020,0,2020-04-06 23:22:15,,,Wyoming,0


In [53]:
covid19_data_type_2["Date"] = pd.to_datetime(covid19_data_type_2["Date"])

In [54]:
covid19_data_type_2.sort_values("Date").head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,9654,US,2020-03-22,63,3/22/20 23:45,40.767273,-73.971526,New York,0
2272,0,US,2020-03-22,0,3/22/20 23:45,40.648436,-96.133741,Nebraska,0
2273,0,US,2020-03-22,0,3/22/20 23:45,40.131411,-96.237054,Nebraska,0
2274,0,US,2020-03-22,0,3/22/20 23:45,40.850825,-101.650294,Nebraska,0
2275,0,US,2020-03-22,0,3/22/20 23:45,40.51156,-99.414617,Nebraska,0


# Starting Concat

In [55]:
covid19_data_type_1.sort_values("Date")

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Anhui,0.0
21,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Ningxia,0.0
22,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Qinghai,0.0
23,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Shaanxi,0.0
24,2.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Shandong,0.0
...,...,...,...,...,...,...,...,...,...
7410,178.0,Slovakia,03-21-2020,1.0,2020-03-21 20:43:02,48.6690,19.6990,,0.0
7409,181.0,Canada,03-21-2020,5.0,2020-03-21 21:13:30,52.9399,-73.5491,Quebec,0.0
7408,187.0,Lebanon,03-21-2020,4.0,2020-03-21 12:13:19,33.8547,35.8623,,4.0
7415,168.0,China,03-21-2020,6.0,2020-03-16 14:38:45,19.1959,109.7453,Hainan,161.0


In [56]:
covid19_data_type_2.sort_values("Date")

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,9654,US,2020-03-22,63,3/22/20 23:45,40.767273,-73.971526,New York,0
2272,0,US,2020-03-22,0,3/22/20 23:45,40.648436,-96.133741,Nebraska,0
2273,0,US,2020-03-22,0,3/22/20 23:45,40.131411,-96.237054,Nebraska,0
2274,0,US,2020-03-22,0,3/22/20 23:45,40.850825,-101.650294,Nebraska,0
2275,0,US,2020-03-22,0,3/22/20 23:45,40.511560,-99.414617,Nebraska,0
...,...,...,...,...,...,...,...,...,...
47316,30,US,2020-04-06,1,2020-04-06 23:22:15,30.418302,-89.488510,Mississippi,0
47317,14,US,2020-04-06,0,2020-04-06 23:22:15,41.002505,-83.668389,Ohio,0
47318,0,US,2020-04-06,0,2020-04-06 23:22:15,36.526929,-83.223758,Tennessee,0
47310,3,US,2020-04-06,0,2020-04-06 23:22:15,32.774196,-81.138456,South Carolina,0


In [57]:
full_data = pd.concat([covid19_data_type_1, covid19_data_type_2], axis=0, ignore_index=True, sort=True)

In [58]:
full_data.head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Anhui,0.0
1,14.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Beijing,0.0
2,6.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Chongqing,0.0
3,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Fujian,0.0
4,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Gansu,0.0


In [59]:
full_data = full_data.rename(columns={"Date":"DateRep"})

In [60]:
#Making sure there are no remaining "Mainland China"
full_data.replace({"Mainland China": "China"}, inplace=True)

In [61]:
full_data.head()

Unnamed: 0,Confirmed,Country/Region,DateRep,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Anhui,0.0
1,14.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Beijing,0.0
2,6.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Chongqing,0.0
3,1.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Fujian,0.0
4,0.0,China,01-22-2020,0.0,2020-01-22 17:00:00,,,Gansu,0.0


In [62]:
full_data.DateRep = pd.to_datetime(full_data.DateRep)

In [63]:
full_data.sort_values("DateRep", inplace=True)

In [64]:
full_data.DateRep = pd.to_datetime(full_data.DateRep).dt.date

In [65]:
full_data.head()

Unnamed: 0,Confirmed,Country/Region,DateRep,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered
0,1.0,China,2020-01-22,0.0,2020-01-22 17:00:00,,,Anhui,0.0
21,1.0,China,2020-01-22,0.0,2020-01-22 17:00:00,,,Ningxia,0.0
22,0.0,China,2020-01-22,0.0,2020-01-22 17:00:00,,,Qinghai,0.0
23,0.0,China,2020-01-22,0.0,2020-01-22 17:00:00,,,Shaanxi,0.0
24,2.0,China,2020-01-22,0.0,2020-01-22 17:00:00,,,Shandong,0.0


In [66]:
full_data.drop(["Latitude", "Longitude", "Last Update"], axis=1, inplace=True)

In [67]:
full_data.head()

Unnamed: 0,Confirmed,Country/Region,DateRep,Deaths,Province/State,Recovered
0,1.0,China,2020-01-22,0.0,Anhui,0.0
21,1.0,China,2020-01-22,0.0,Ningxia,0.0
22,0.0,China,2020-01-22,0.0,Qinghai,0.0
23,0.0,China,2020-01-22,0.0,Shaanxi,0.0
24,2.0,China,2020-01-22,0.0,Shandong,0.0


In [68]:
full_data = full_data.groupby(["Country/Region","DateRep"]).sum().reset_index()

In [69]:
full_data["Day of supose contagios"] = full_data["DateRep"] - dt.timedelta(days=15)

In [70]:
full_data

Unnamed: 0,Country/Region,DateRep,Confirmed,Deaths,Recovered,Day of supose contagios
0,Azerbaijan,2020-02-28,1.0,0.0,0.0,2020-02-13
1,Afghanistan,2020-02-24,1.0,0.0,0.0,2020-02-09
2,Afghanistan,2020-02-25,1.0,0.0,0.0,2020-02-10
3,Afghanistan,2020-02-26,1.0,0.0,0.0,2020-02-11
4,Afghanistan,2020-02-27,1.0,0.0,0.0,2020-02-12
...,...,...,...,...,...,...
6557,occupied Palestinian territory,2020-03-12,0.0,0.0,0.0,2020-02-26
6558,occupied Palestinian territory,2020-03-14,0.0,0.0,0.0,2020-02-28
6559,occupied Palestinian territory,2020-03-15,0.0,0.0,0.0,2020-02-29
6560,occupied Palestinian territory,2020-03-16,0.0,0.0,0.0,2020-03-01


In [71]:
full_data.set_index("DateRep", inplace=True)

In [72]:
full_data = full_data.groupby(["Country/Region", "DateRep"]).sum().reset_index().set_index("DateRep").sort_values(["Country/Region","DateRep"])

In [73]:
full_data.head()

Unnamed: 0_level_0,Country/Region,Confirmed,Deaths,Recovered
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-28,Azerbaijan,1.0,0.0,0.0
2020-02-24,Afghanistan,1.0,0.0,0.0
2020-02-25,Afghanistan,1.0,0.0,0.0
2020-02-26,Afghanistan,1.0,0.0,0.0
2020-02-27,Afghanistan,1.0,0.0,0.0


In [74]:
full_data2 = full_data.copy()
full_data2.rename(columns={"Confirmed":"NewConfCases_cum", "Deaths":"NewDeaths_cum", "Recovered":"NewRecovered_cum"},
                   inplace=True)

In [75]:
full_data2.head()

Unnamed: 0_level_0,Country/Region,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-28,Azerbaijan,1.0,0.0,0.0
2020-02-24,Afghanistan,1.0,0.0,0.0
2020-02-25,Afghanistan,1.0,0.0,0.0
2020-02-26,Afghanistan,1.0,0.0,0.0
2020-02-27,Afghanistan,1.0,0.0,0.0


In [76]:
full_data2[full_data2["Country/Region"] == "China"].head()

Unnamed: 0_level_0,Country/Region,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-22,China,547.0,17.0,28.0
2020-01-23,China,639.0,18.0,30.0
2020-01-24,China,916.0,26.0,36.0
2020-01-25,China,1399.0,42.0,39.0
2020-01-26,China,2062.0,56.0,49.0


In [77]:
full_data2["NewConfCases"] = full_data2["NewConfCases_cum"] - full_data2["NewConfCases_cum"].shift(1).where(
    cond=(full_data2.loc[:,'Country/Region'] == full_data2.shift(1).loc[:,'Country/Region'])).fillna(0)

full_data2["NewDeaths"] = full_data2["NewDeaths_cum"] - full_data2["NewDeaths_cum"].shift(1).where(
    cond=(full_data2.loc[:,'Country/Region'] == full_data2.shift(1).loc[:,'Country/Region'])).fillna(0)

full_data2["NewRecovered"] = full_data2["NewRecovered_cum"] - full_data2["NewRecovered_cum"].shift(1).where(
    cond=(full_data2.loc[:,'Country/Region'] == full_data2.shift(1).loc[:,'Country/Region'])).fillna(0)

In [78]:
full_data2.head()

Unnamed: 0_level_0,Country/Region,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum,NewConfCases,NewDeaths,NewRecovered
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-28,Azerbaijan,1.0,0.0,0.0,1.0,0.0,0.0
2020-02-24,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0
2020-02-25,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0
2020-02-26,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0
2020-02-27,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0


In [79]:
full_data2[full_data2["Country/Region"] == "China"].head()

Unnamed: 0_level_0,Country/Region,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum,NewConfCases,NewDeaths,NewRecovered
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-22,China,547.0,17.0,28.0,547.0,17.0,28.0
2020-01-23,China,639.0,18.0,30.0,92.0,1.0,2.0
2020-01-24,China,916.0,26.0,36.0,277.0,8.0,6.0
2020-01-25,China,1399.0,42.0,39.0,483.0,16.0,3.0
2020-01-26,China,2062.0,56.0,49.0,663.0,14.0,10.0


In [80]:
full_data2.sort_values(["Country/Region", "DateRep"], inplace=True)

In [81]:
full_data2["activos"] = full_data2["NewConfCases_cum"] -full_data2["NewDeaths_cum"] - full_data2["NewRecovered_cum"]

In [82]:
pruebas = full_data2.copy()

In [83]:
pruebas[pruebas["Country/Region"] == "China"].tail(30)

Unnamed: 0_level_0,Country/Region,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum,NewConfCases,NewDeaths,NewRecovered,activos
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-08,China,80699.0,3097.0,57320.0,47.0,27.0,1842.0,20282.0
2020-03-09,China,80735.0,3120.0,58735.0,36.0,23.0,1415.0,18880.0
2020-03-10,China,80757.0,3136.0,60106.0,22.0,16.0,1371.0,17515.0
2020-03-11,China,80921.0,3161.0,61644.0,164.0,25.0,1538.0,16116.0
2020-03-12,China,80932.0,3172.0,62901.0,11.0,11.0,1257.0,14859.0
2020-03-13,China,80945.0,3180.0,64196.0,13.0,8.0,1295.0,13569.0
2020-03-14,China,80977.0,3193.0,65660.0,32.0,13.0,1464.0,12124.0
2020-03-15,China,81003.0,3203.0,67017.0,26.0,10.0,1357.0,10783.0
2020-03-16,China,81033.0,3217.0,67910.0,30.0,14.0,893.0,9906.0
2020-03-17,China,81058.0,3230.0,68798.0,25.0,13.0,888.0,9030.0


In [84]:
full_data2.rename(columns={"Country/Region":"CountryExp"}, inplace=True)

In [85]:
full_data2["CountryExp"].unique()

array([' Azerbaijan', 'Afghanistan', 'Albania', 'Algeria', 'Andorra',
       'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahamas, The',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burma', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad',
       'Channel Islands', 'Chile', 'China', 'Colombia',
       'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cruise Ship', 'Cuba', 'Curacao',
       'Cyprus', 'Czech Republic', 'Czechia', 'Denmark',
       'Diamond Princess', 'Djibouti', 'Dominica', 'Dominican Republic',
       'East Timor', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Esw

In [86]:
full_data2.index.min()

datetime.date(2020, 1, 22)

In [87]:
full_data2.index.max()

datetime.date(2020, 4, 6)

In [88]:
full_data2.head()

Unnamed: 0_level_0,CountryExp,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum,NewConfCases,NewDeaths,NewRecovered,activos
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-02-28,Azerbaijan,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2020-02-24,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2020-02-25,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2020-02-26,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2020-02-27,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [89]:
full_data2[(full_data2.CountryExp =="China") & (full_data2.NewConfCases < 0)]

Unnamed: 0_level_0,CountryExp,NewConfCases_cum,NewDeaths_cum,NewRecovered_cum,NewConfCases,NewDeaths,NewRecovered,activos
DateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


# Export

In [90]:
import pickle
file_name = r"data_V2_for_change_in_exponential_growth.pkl"
with open(os.path.join(PATH_ROOT_INTERIM_DATA,file_name), 'wb') as f:
    pickle.dump(full_data2, f)