<a href="https://colab.research.google.com/github/andrewcgaitskell/voila/blob/master/notebooks/ReshapeTimeSeries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import datetime
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)
import wget
from sqlalchemy import create_engine
import psycopg2

In [2]:
cwd = os.getcwd()
cwd

'/home/andrewcgaitskell/Documents/Code/virus-data-voila/notebooks/Get'

In [3]:
import seaborn as sns
sns.set()

In [4]:
urlconfirmed = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
urldeaths = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
urlrecovered = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

In [5]:
filename = wget.download(urlconfirmed)
filename

100% [............................................................................] 708270 / 708270

'time_series_covid19_confirmed_global.csv'

In [6]:
confirmed = pd.read_csv(filename)

In [7]:
os.remove(filename) 

In [8]:
filename = wget.download(urldeaths)
filename

100% [............................................................................] 491810 / 491810

'time_series_covid19_deaths_global.csv'

In [9]:
deaths = pd.read_csv(filename)

In [10]:
os.remove(filename) 

In [11]:
filename = wget.download(urlrecovered)
filename

100% [............................................................................] 640087 / 640087

'time_series_covid19_recovered_global.csv'

In [12]:
recovered = pd.read_csv(filename)

In [13]:
os.remove(filename) 

In [14]:
confirmed['source'] = 'confirmed'

In [15]:
deaths['source'] = 'deaths'

In [16]:
recovered['source'] = 'recovered'

In [17]:
frames = [confirmed, deaths, recovered]
combined = pd.concat(frames)

In [18]:
#combined country and state columns
combined['state_length'] = combined['Province/State'].str.len().fillna(0)
combined['CountryName'] = np.where(combined['state_length']>0, combined['Province/State'], combined['Country/Region'])
combined['RegionName'] = 'N/A'
#combined['CountryName'] = combined['Country/Region']
combined.fillna({'RegionName':'N/A'},inplace=True)
combined.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/29/21,6/30/21,7/1/21,7/2/21,7/3/21,7/4/21,source,state_length,CountryName,RegionName
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,117158,118659,120216,120216,120216,120216,confirmed,0.0,Afghanistan,
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,132514,132521,132523,132526,132534,132535,confirmed,0.0,Albania,
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,139229,139626,140075,140550,141007,141471,confirmed,0.0,Algeria,
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,13900,13911,13918,13918,13918,13918,confirmed,0.0,Andorra,
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,38682,38849,38965,39089,39172,39230,confirmed,0.0,Angola,


In [19]:
# time series data needs to be melted to then allow dates to be read in correctly
df_cols_index = combined.columns
df_cols_list = df_cols_index.tolist()
#remove non date columns - this allows dynamic update and melting of data
not_dates = ['Province/State','Country/Region','Lat','Long','state_length','CountryName','RegionName', 'source']
date_list = []
for dates in df_cols_list:
    if dates not in not_dates :
        date_list.append(dates)

melted = pd.melt(combined, id_vars=['CountryName','RegionName','source'], value_vars=date_list)
melted['Date'] = pd.to_datetime(melted['variable'], infer_datetime_format=True)

In [20]:
anchordate = datetime.datetime(2020, 1, 1)

#filter out any 0's before any discovery
all_data = melted[melted['value']>0]
earliest_date = all_data.groupby(["CountryName","RegionName"])[['Date']].min().reset_index()
earliest_date.rename(columns={'Date':'EarliestConfirmedDate'}, inplace=True)
#join with main discovered dataset
data_withearliest = all_data.merge(earliest_date,how = 'inner',  left_on=['CountryName','RegionName'], right_on=['CountryName','RegionName'], suffixes = ['_l','_r'])
data_withearliest['DaysFromStart'] = (data_withearliest['Date'] - data_withearliest['EarliestConfirmedDate']).dt.days
data_withearliest['DaysFromAnchor'] = (data_withearliest['Date'] - anchordate).dt.days
data_withearliest.head()

Unnamed: 0,CountryName,RegionName,source,variable,value,Date,EarliestConfirmedDate,DaysFromStart,DaysFromAnchor
0,Anhui,,confirmed,1/22/20,1,2020-01-22,2020-01-22,0,21
1,Anhui,,confirmed,1/23/20,9,2020-01-23,2020-01-22,1,22
2,Anhui,,confirmed,1/24/20,15,2020-01-24,2020-01-22,2,23
3,Anhui,,confirmed,1/25/20,39,2020-01-25,2020-01-22,3,24
4,Anhui,,confirmed,1/26/20,60,2020-01-26,2020-01-22,4,25


In [21]:
all_data

Unnamed: 0,CountryName,RegionName,source,variable,value,Date
58,Anhui,,confirmed,1/22/20,1,2020-01-22
59,Beijing,,confirmed,1/22/20,14,2020-01-22
60,Chongqing,,confirmed,1/22/20,6,2020-01-22
61,Fujian,,confirmed,1/22/20,1,2020-01-22
63,Guangdong,,confirmed,1/22/20,26,2020-01-22
...,...,...,...,...,...,...
435655,Vietnam,,recovered,7/4/21,7819,2021-07-04
435656,West Bank and Gaza,,recovered,7/4/21,308774,2021-07-04
435657,Yemen,,recovered,7/4/21,4104,2021-07-04
435658,Zambia,,recovered,7/4/21,143128,2021-07-04


In [22]:
# plotting all countries is impossible, so wanted to rank coutries based on Total discovered
max_source = all_data.groupby(['CountryName','RegionName','source'])[['value']].max().reset_index()
#sum_discovered = all_discovered.groupby(['Country_State'])[['value']].sum().reset_index()
max_source.rename(columns = {'value':'Max Source'}, inplace = True) 
#sum_discovered

In [23]:
data_withmax = data_withearliest.merge(max_source,how = 'inner',  left_on=['CountryName','RegionName','source'],
                                       right_on=['CountryName','RegionName','source'], suffixes = ['_l','_r'])
data_withmax['TrendValue'] = data_withmax['value']/data_withmax['Max Source']

#sd = sum_discovered.sort_values('value',ascending=False)
#sd.head(25)

In [24]:
max_confirmed = max_source[max_source['source']=='confirmed'].copy()
max_confirmed['Region_Rank'] = max_confirmed['Max Source'].rank()

In [25]:
md = max_confirmed[['Region_Rank','CountryName','RegionName']]
# add rank data to main dataset
data_withrank = data_withmax.merge(md, how = 'inner',  left_on=['CountryName','RegionName'], right_on=['CountryName','RegionName'], suffixes = ['_l','_r'])
data_withrank

Unnamed: 0,CountryName,RegionName,source,variable,value,Date,EarliestConfirmedDate,DaysFromStart,DaysFromAnchor,Max Source,TrendValue,Region_Rank
0,Anhui,,confirmed,1/22/20,1,2020-01-22,2020-01-22,0,21,1008,0.000992,61.0
1,Anhui,,confirmed,1/23/20,9,2020-01-23,2020-01-22,1,22,1008,0.008929,61.0
2,Anhui,,confirmed,1/24/20,15,2020-01-24,2020-01-22,2,23,1008,0.014881,61.0
3,Anhui,,confirmed,1/25/20,39,2020-01-25,2020-01-22,3,24,1008,0.038690,61.0
4,Anhui,,confirmed,1/26/20,60,2020-01-26,2020-01-22,4,25,1008,0.059524,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...
362167,Summer Olympics 2020,,confirmed,6/30/21,1,2021-06-30,2021-06-19,11,546,6,0.166667,10.0
362168,Summer Olympics 2020,,confirmed,7/1/21,1,2021-07-01,2021-06-19,12,547,6,0.166667,10.0
362169,Summer Olympics 2020,,confirmed,7/2/21,1,2021-07-02,2021-06-19,13,548,6,0.166667,10.0
362170,Summer Olympics 2020,,confirmed,7/3/21,1,2021-07-03,2021-06-19,14,549,6,0.166667,10.0


In [26]:
# more control over sub plots
import matplotlib.pyplot as plt

In [27]:
# looking at how to split up the countries
#fig, ax = plt.subplots()
#fig = plt.figure()
regionranksbuckets = data_withrank.groupby(['CountryName','RegionName'])[['Region_Rank']].max().reset_index()
BucketSize = 20
regionranksbuckets['Bucket'] = regionranksbuckets['Region_Rank']/BucketSize

#countryranksbuckets['Rounded_Bucket'] = countryranksbuckets.round({countryranksbuckets['Bucket']:0})
newdf = regionranksbuckets.round({"Bucket":0})
newdf['RankMin'] = (newdf['Bucket']*BucketSize) - BucketSize
newdf['RankMax'] = (newdf['Bucket']*BucketSize)
#newdf.groupby(['Country/Region'])[['Country_Rank']].max().reset_index()
newdf1 = newdf.groupby(['Bucket','RankMin','RankMax']).count().reset_index()
newdf1['CumSum'] = newdf1['Region_Rank'].cumsum(axis = 0) 
newdf1

Unnamed: 0,Bucket,RankMin,RankMax,CountryName,RegionName,Region_Rank,CumSum
0,0.0,-20.0,0.0,10,10,10,10
1,1.0,0.0,20.0,19,19,19,29
2,2.0,20.0,40.0,21,21,21,50
3,3.0,40.0,60.0,19,19,19,69
4,4.0,60.0,80.0,21,21,21,90
5,5.0,80.0,100.0,19,19,19,109
6,6.0,100.0,120.0,21,21,21,130
7,7.0,120.0,140.0,19,19,19,149
8,8.0,140.0,160.0,21,21,21,170
9,9.0,160.0,180.0,19,19,19,189


In [28]:
#datatoplot = data_withrank[(data_withrank['Country_Rank']>240) & (data_withrank['Country/Region'] != 'China') ]
#figuredata = datatoplot[datatoplot['Country_State'] == 'China_Hubei']
#figuredata

In [29]:
engine = create_engine('postgresql://pythonuser:pythonuser@localhost:5432/data')
#data_withrank.to_sql('tbl_CovidCDRTrends', engine, if_exists='replace')

In [30]:
data_withrank.columns

Index(['CountryName', 'RegionName', 'source', 'variable', 'value', 'Date',
       'EarliestConfirmedDate', 'DaysFromStart', 'DaysFromAnchor',
       'Max Source', 'TrendValue', 'Region_Rank'],
      dtype='object')

In [31]:
cwd = os.getcwd()
path2file =  cwd + '/' + 'data.csv'
data_withrank.to_csv(path2file)

In [32]:
import os
cwd = os.getcwd()
sqlfilename = cwd + '/CreateCDRTrendsTable.sql'
f = open(sqlfilename, "r")
fs = f.read()

In [33]:
connection = engine.raw_connection()
try:
    cursor = connection.cursor()
    cursor.execute(fs)
finally:
    connection.commit()
    connection.close()

In [34]:
sqlcmnd = 'COPY "raw_CDRTrends" FROM \''+ path2file + '\' DELIMITER \',\' CSV;'
sqlcmnd

'COPY "raw_CDRTrends" FROM \'/home/andrewcgaitskell/Documents/Code/virus-data-voila/notebooks/Get/data.csv\' DELIMITER \',\' CSV;'

In [35]:
with engine.connect().execution_options(autocommit=True) as con:

    con.execute(sqlcmnd)

In [36]:
sqlfilename = cwd + '/PopCDRTrendsTables.sql'
f = open(sqlfilename, "r")
fs = f.read()

In [37]:
connection = engine.raw_connection()
try:
    cursor = connection.cursor()
    cursor.execute(fs)
finally:
    connection.commit()
    connection.close()