<a href="https://colab.research.google.com/github/andrewcgaitskell/voila/blob/master/notebooks/ReshapeTimeSeries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import datetime
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)
import wget
from sqlalchemy import create_engine
import psycopg2

In [110]:
cwd = os.getcwd()
cwd

'/home/sweep0ne/voila_current/notebooks/Get'

In [111]:
import seaborn as sns
sns.set()

In [112]:
urlconfirmed = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
urldeaths = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
urlrecovered = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

In [113]:
filename = wget.download(urlconfirmed)
filename

'time_series_covid19_confirmed_global.csv'

In [114]:
confirmed = pd.read_csv(filename)

In [115]:
os.remove(filename) 

In [116]:
filename = wget.download(urldeaths)
filename

'time_series_covid19_deaths_global.csv'

In [117]:
deaths = pd.read_csv(filename)

In [118]:
os.remove(filename) 

In [119]:
filename = wget.download(urlrecovered)
filename

'time_series_covid19_recovered_global.csv'

In [120]:
recovered = pd.read_csv(filename)

In [121]:
os.remove(filename) 

In [122]:
confirmed['source'] = 'confirmed'

In [123]:
deaths['source'] = 'deaths'

In [124]:
recovered['source'] = 'recovered'

In [125]:
frames = [confirmed, deaths, recovered]
combined = pd.concat(frames)

In [128]:
#combined country and state columns
combined['state_length'] = combined['Province/State'].str.len().fillna(0)
combined['CountryName'] = np.where(combined['state_length']>0, combined['Province/State'], combined['Country/Region'])
combined['RegionName'] = 'N/A'
#combined['CountryName'] = combined['Country/Region']
combined.fillna({'RegionName':'N/A'},inplace=True)
combined.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/14/20,9/15/20,9/16/20,9/17/20,9/18/20,9/19/20,source,state_length,CountryName,RegionName
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,38772,38815,38855,38872,38883,38919,confirmed,0.0,Afghanistan,
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,11520,11672,11816,11948,12073,12226,confirmed,0.0,Albania,
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,48496,48734,48966,49194,49413,49623,confirmed,0.0,Algeria,
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,1438,1438,1483,1483,1564,1564,confirmed,0.0,Andorra,
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,3439,3569,3675,3789,3848,3901,confirmed,0.0,Angola,


In [129]:
# time series data needs to be melted to then allow dates to be read in correctly
df_cols_index = combined.columns
df_cols_list = df_cols_index.tolist()
#remove non date columns - this allows dynamic update and melting of data
not_dates = ['Province/State','Country/Region','Lat','Long','state_length','CountryName','RegionName', 'source']
date_list = []
for dates in df_cols_list:
    if dates not in not_dates :
        date_list.append(dates)

melted = pd.melt(combined, id_vars=['CountryName','RegionName','source'], value_vars=date_list)
melted['Date'] = pd.to_datetime(melted['variable'], infer_datetime_format=True)

In [130]:
anchordate = datetime.datetime(2020, 1, 1)

#filter out any 0's before any discovery
all_data = melted[melted['value']>0]
earliest_date = all_data.groupby(["CountryName","RegionName"])[['Date']].min().reset_index()
earliest_date.rename(columns={'Date':'EarliestConfirmedDate'}, inplace=True)
#join with main discovered dataset
data_withearliest = all_data.merge(earliest_date,how = 'inner',  left_on=['CountryName','RegionName'], right_on=['CountryName','RegionName'], suffixes = ['_l','_r'])
data_withearliest['DaysFromStart'] = (data_withearliest['Date'] - data_withearliest['EarliestConfirmedDate']).dt.days
data_withearliest['DaysFromAnchor'] = (data_withearliest['Date'] - anchordate).dt.days
data_withearliest.head()

Unnamed: 0,CountryName,RegionName,source,variable,value,Date,EarliestConfirmedDate,DaysFromStart,DaysFromAnchor
0,Anhui,,confirmed,1/22/20,1,2020-01-22,2020-01-22,0,21
1,Anhui,,confirmed,1/23/20,9,2020-01-23,2020-01-22,1,22
2,Anhui,,confirmed,1/24/20,15,2020-01-24,2020-01-22,2,23
3,Anhui,,confirmed,1/25/20,39,2020-01-25,2020-01-22,3,24
4,Anhui,,confirmed,1/26/20,60,2020-01-26,2020-01-22,4,25


In [131]:
all_data

Unnamed: 0,CountryName,RegionName,source,variable,value,Date
56,Anhui,,confirmed,1/22/20,1,2020-01-22
57,Beijing,,confirmed,1/22/20,14,2020-01-22
58,Chongqing,,confirmed,1/22/20,6,2020-01-22
59,Fujian,,confirmed,1/22/20,1,2020-01-22
61,Guangdong,,confirmed,1/22/20,26,2020-01-22
...,...,...,...,...,...,...
189965,West Bank and Gaza,,recovered,9/19/20,23446,2020-09-19
189966,Western Sahara,,recovered,9/19/20,8,2020-09-19
189967,Yemen,,recovered,9/19/20,1221,2020-09-19
189968,Zambia,,recovered,9/19/20,13365,2020-09-19


In [132]:
# plotting all countries is impossible, so wanted to rank coutries based on Total discovered
max_source = all_data.groupby(['CountryName','RegionName','source'])[['value']].max().reset_index()
#sum_discovered = all_discovered.groupby(['Country_State'])[['value']].sum().reset_index()
max_source.rename(columns = {'value':'Max Source'}, inplace = True) 
#sum_discovered

In [133]:
data_withmax = data_withearliest.merge(max_source,how = 'inner',  left_on=['CountryName','RegionName','source'],
                                       right_on=['CountryName','RegionName','source'], suffixes = ['_l','_r'])
data_withmax['TrendValue'] = data_withmax['value']/data_withmax['Max Source']

#sd = sum_discovered.sort_values('value',ascending=False)
#sd.head(25)

In [134]:
max_confirmed = max_source[max_source['source']=='confirmed'].copy()
max_confirmed['Region_Rank'] = max_confirmed['Max Source'].rank()

In [135]:
md = max_confirmed[['Region_Rank','CountryName','RegionName']]
# add rank data to main dataset
data_withrank = data_withmax.merge(md, how = 'inner',  left_on=['CountryName','RegionName'], right_on=['CountryName','RegionName'], suffixes = ['_l','_r'])
data_withrank

Unnamed: 0,CountryName,RegionName,source,variable,value,Date,EarliestConfirmedDate,DaysFromStart,DaysFromAnchor,Max Source,TrendValue,Region_Rank
0,Anhui,,confirmed,1/22/20,1,2020-01-22,2020-01-22,0,21,991,0.001009,87.0
1,Anhui,,confirmed,1/23/20,9,2020-01-23,2020-01-22,1,22,991,0.009082,87.0
2,Anhui,,confirmed,1/24/20,15,2020-01-24,2020-01-22,2,23,991,0.015136,87.0
3,Anhui,,confirmed,1/25/20,39,2020-01-25,2020-01-22,3,24,991,0.039354,87.0
4,Anhui,,confirmed,1/26/20,60,2020-01-26,2020-01-22,4,25,991,0.060545,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...
142426,Lesotho,,deaths,9/15/20,33,2020-09-15,2020-05-13,125,258,33,1.000000,99.0
142427,Lesotho,,deaths,9/16/20,33,2020-09-16,2020-05-13,126,259,33,1.000000,99.0
142428,Lesotho,,deaths,9/17/20,33,2020-09-17,2020-05-13,127,260,33,1.000000,99.0
142429,Lesotho,,deaths,9/18/20,33,2020-09-18,2020-05-13,128,261,33,1.000000,99.0


In [136]:
# more control over sub plots
import matplotlib.pyplot as plt

In [137]:
# looking at how to split up the countries
#fig, ax = plt.subplots()
#fig = plt.figure()
regionranksbuckets = data_withrank.groupby(['CountryName','RegionName'])[['Region_Rank']].max().reset_index()
BucketSize = 20
regionranksbuckets['Bucket'] = regionranksbuckets['Region_Rank']/BucketSize

#countryranksbuckets['Rounded_Bucket'] = countryranksbuckets.round({countryranksbuckets['Bucket']:0})
newdf = regionranksbuckets.round({"Bucket":0})
newdf['RankMin'] = (newdf['Bucket']*BucketSize) - BucketSize
newdf['RankMax'] = (newdf['Bucket']*BucketSize)
#newdf.groupby(['Country/Region'])[['Country_Rank']].max().reset_index()
newdf1 = newdf.groupby(['Bucket','RankMin','RankMax']).count().reset_index()
newdf1['CumSum'] = newdf1['Region_Rank'].cumsum(axis = 0) 
newdf1

Unnamed: 0,Bucket,RankMin,RankMax,CountryName,RegionName,Region_Rank,CumSum
0,0.0,-20.0,0.0,10,10,10,10
1,1.0,0.0,20.0,19,19,19,29
2,2.0,20.0,40.0,21,21,21,50
3,3.0,40.0,60.0,19,19,19,69
4,4.0,60.0,80.0,21,21,21,90
5,5.0,80.0,100.0,19,19,19,109
6,6.0,100.0,120.0,21,21,21,130
7,7.0,120.0,140.0,19,19,19,149
8,8.0,140.0,160.0,21,21,21,170
9,9.0,160.0,180.0,19,19,19,189


In [138]:
#datatoplot = data_withrank[(data_withrank['Country_Rank']>240) & (data_withrank['Country/Region'] != 'China') ]
#figuredata = datatoplot[datatoplot['Country_State'] == 'China_Hubei']
#figuredata

In [139]:
engine = create_engine('postgresql://pythonuser:pythonuser@localhost:5432/data')
#data_withrank.to_sql('tbl_CovidCDRTrends', engine, if_exists='replace')

In [140]:
data_withrank.columns

Index(['CountryName', 'RegionName', 'source', 'variable', 'value', 'Date',
       'EarliestConfirmedDate', 'DaysFromStart', 'DaysFromAnchor',
       'Max Source', 'TrendValue', 'Region_Rank'],
      dtype='object')

In [141]:
cwd = os.getcwd()
path2file =  cwd + '/' + 'data.csv'
data_withrank.to_csv(path2file)

In [142]:
import os
cwd = os.getcwd()
sqlfilename = cwd + '/CreateCDRTrendsTable.sql'
f = open(sqlfilename, "r")
fs = f.read()

In [143]:
connection = engine.raw_connection()
try:
    cursor = connection.cursor()
    cursor.execute(fs)
finally:
    connection.commit()
    connection.close()

In [144]:
sqlcmnd = 'COPY "raw_CDRTrends" FROM \''+ path2file + '\' DELIMITER \',\' CSV;'
sqlcmnd

'COPY "raw_CDRTrends" FROM \'/home/sweep0ne/voila_current/notebooks/Get/data.csv\' DELIMITER \',\' CSV;'

In [145]:
with engine.connect().execution_options(autocommit=True) as con:

    con.execute(sqlcmnd)

In [146]:
sqlfilename = cwd + '/PopCDRTrendsTables.sql'
f = open(sqlfilename, "r")
fs = f.read()

In [147]:
connection = engine.raw_connection()
try:
    cursor = connection.cursor()
    cursor.execute(fs)
finally:
    connection.commit()
    connection.close()