# COVID19 Data Visualizations & Models by State

- comments: false
- author: Ryan Gomez
- toc: true
- categories: [growth, compare, hospitalizations]
- image: images/covid-logo.png
- permalink: /futureproof-COVID19-AllStates/

In [1]:
#hide_input
# Imports
import os
import pandas as pd
import csv
import kaggle

# other imports
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
# scikit-learn moved its testing helpers to the private module
# `sklearn.utils._testing` and later removed the public `sklearn.utils.testing`
# alias; try the current location first and fall back for old installations.
try:
    from sklearn.utils._testing import ignore_warnings
except ImportError:
    from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from copy import copy
import seaborn as sns
from scipy.stats import norm
import matplotlib.dates as mdates
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
from datetime import date, datetime
from dateutil.parser import parse
import us
# import operator 
# plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline 



# Main Datasets (w/ hospitalised data)

Source: https://covidtracking.com/
Source: https://github.com/CSSEGISandData/COVID-19
Post processed data source used in models below: https://github.com/bielsnor/futureproof/tree/master/_notebooks/results
Various state data, third party data, and various federal data

In [2]:
#hide_input

# Daily per-state figures published by The COVID Tracking Project
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')

# Delete unnecessary columns (raises KeyError if the feed no longer has one of them)
unused_columns = ['negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults', 'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative', 'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease', 'hospitalizedIncrease', 'positiveIncrease']
all_cases = all_cases.drop(columns=unused_columns)

# TODO missing values
#      Do we get avg or missing values, or predict them?
#      See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72

# Set Dates: the feed encodes them as YYYYMMDD integers; convert the whole
# column at once into `datetime.date` objects instead of writing row by row.
all_cases['date'] = pd.to_datetime(all_cases['date'].astype(str), format='%Y%m%d').dt.date

# Missing death figures means no death reports yet
# These are set to 0
all_cases['death'] = all_cases['death'].fillna(0)

## Combine, validate, and verify data sets.

In [3]:
#hide_input

# TODO Replace active cases with JHU and/or regression model (Selma)
# Active = confirmed positives minus recoveries minus deaths (NaN if any input is NaN)
all_cases['active'] = all_cases['positive'].sub(all_cases['recovered']).sub(all_cases['death'])

# Place 'active' as the fourth column
cols = [name for name in all_cases.columns if name != 'active']
cols.insert(3, 'active')
all_cases = all_cases.loc[:, cols]

In [4]:
#hide_input

# Load datasets for US population and Hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')

# State name -> abbreviation lookup. If a name is listed more than once the
# first abbreviation wins.
abbrev_by_state = state_abbrev.drop_duplicates(subset='State').set_index('State')['Abbreviation']

# Add the abbreviation to both frames with a single vectorised lookup each
# (instead of one boolean mask per state); names without a match become NaN.
us_population['Abbreviation'] = us_population['State'].map(abbrev_by_state)
hosp_beds['Abbreviation'] = hosp_beds['Location'].map(abbrev_by_state)

# change order of columns of us_population ('Abbreviation' becomes third)
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]

# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])

# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])

# change order of columns of hosp_beds ('Abbreviation' becomes first)
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]

In [5]:
#hide_input

# Keep only rows whose abbreviation is listed in state_abbrev (drops e.g. 'AS')
listed_abbreviations = state_abbrev['Abbreviation'].tolist()
is_listed = all_cases['state'].isin(listed_abbreviations)
all_cases = all_cases[is_listed]

In [6]:
# Preview the first 50 rows of the filtered main dataframe (50 rows, not 50 states):
all_cases.head(50)

Unnamed: 0,date,state,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,dataQualityGrade,...,totalTestsViral,positiveTestsViral,negativeTestsViral,positiveCasesViral,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
0,2020-07-03,AK,1063.0,509.0,25.0,,,3.0,539.0,A,...,120208.0,,,,0,0,0,0,0,
1,2020-07-03,AL,41865.0,18777.0,812.0,2883.0,,,22082.0,B,...,,,,41362.0,0,0,0,0,0,
2,2020-07-03,AR,22622.0,6177.0,285.0,1517.0,,70.0,16164.0,A,...,,,,22622.0,0,0,0,0,0,
4,2020-07-03,AZ,91858.0,79592.0,3013.0,5018.0,741.0,489.0,10478.0,A+,...,577919.0,,,91396.0,0,0,0,0,0,
5,2020-07-03,CA,248235.0,,7024.0,,1871.0,,,B,...,4448176.0,,,248235.0,0,0,0,0,0,
6,2020-07-03,CO,33352.0,27290.0,270.0,5527.0,,,4541.0,A,...,,,,30453.0,0,0,0,0,0,
7,2020-07-03,CT,46717.0,34172.0,95.0,10411.0,,,8210.0,B,...,497693.0,,,44741.0,0,0,0,0,0,
8,2020-07-03,DC,10435.0,8383.0,110.0,,38.0,24.0,1497.0,A+,...,,,,,0,0,0,0,0,
9,2020-07-03,DE,11923.0,4719.0,57.0,,13.0,,6692.0,A+,...,,,,10879.0,0,0,0,0,0,
10,2020-07-03,FL,178594.0,,,15795.0,,,,A,...,2513115.0,228506.0,2279841.0,178594.0,0,0,0,0,0,


In [7]:
#hide_input

# One sub-frame per reporting date, keyed by that date
df_split_by_date = {day: frame for day, frame in all_cases.groupby('date')}

# One sub-frame per state, keyed by the state abbreviation
df_split_by_state = {abbr: frame for abbr, frame in all_cases.groupby('state')}

In [8]:
#hide_input

# Attach the population table to every case row (left join on the abbreviation)
df_merge_uspop = (
    all_cases
    .merge(us_population, how='left', left_on='state', right_on='Abbreviation')
    .drop(columns=['Abbreviation'])
    .rename(columns={'Pop': 'population'})
)

# Put 'population' in third position
cols = [name for name in df_merge_uspop.columns if name != 'population']
cols.insert(2, 'population')
df_merge_uspop = df_merge_uspop.loc[:, cols]

# Attach the hospital-bed table the same way
df_merge_hosp = (
    df_merge_uspop
    .merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
    .drop(columns=['Abbreviation'])
)
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})

In [9]:
#hide_input

# Total beds = (population in thousands) x (beds per thousand inhabitants)
all_cases['total_beds'] = all_cases['population'].div(1000).mul(all_cases['bedsPerThousand'])

In [10]:
#hide_input

# 'state' (abbreviation) becomes 'abbrev'; 'State' (full name) becomes 'state'
all_cases = all_cases.rename(columns={'state': 'abbrev', 'State': 'state'})

In [11]:
#hide_input

# Put the full state name in second position
cols = [name for name in all_cases.columns if name != 'state']
cols.insert(1, 'state')
all_cases = all_cases.loc[:, cols]

In [12]:
# Preview the frame after the state-level columns (name, abbreviation, population,
# beds per 1k, total beds) were added in the cells above:
all_cases.head(50)

Unnamed: 0,date,state,abbrev,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,...,negativeTestsViral,positiveCasesViral,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade,bedsPerThousand,total_beds
0,2020-07-03,Alaska,AK,734002,1063.0,509.0,25.0,,,3.0,...,,,0,0,0,0,0,,2.2,1614.8044
1,2020-07-03,Alabama,AL,4908621,41865.0,18777.0,812.0,2883.0,,,...,,41362.0,0,0,0,0,0,,3.1,15216.7251
2,2020-07-03,Arkansas,AR,3038999,22622.0,6177.0,285.0,1517.0,,70.0,...,,22622.0,0,0,0,0,0,,3.2,9724.7968
3,2020-07-03,Arizona,AZ,7378494,91858.0,79592.0,3013.0,5018.0,741.0,489.0,...,,91396.0,0,0,0,0,0,,1.9,14019.1386
4,2020-07-03,California,CA,39937489,248235.0,,7024.0,,1871.0,,...,,248235.0,0,0,0,0,0,,1.8,71887.4802
5,2020-07-03,Colorado,CO,5845526,33352.0,27290.0,270.0,5527.0,,,...,,30453.0,0,0,0,0,0,,1.9,11106.4994
6,2020-07-03,Connecticut,CT,3563077,46717.0,34172.0,95.0,10411.0,,,...,,44741.0,0,0,0,0,0,,2.0,7126.154
7,2020-07-03,District of Columbia,DC,720687,10435.0,8383.0,110.0,,38.0,24.0,...,,,0,0,0,0,0,,4.4,3171.0228
8,2020-07-03,Delaware,DE,982895,11923.0,4719.0,57.0,,13.0,,...,,10879.0,0,0,0,0,0,,2.2,2162.369
9,2020-07-03,Florida,FL,21992985,178594.0,,,15795.0,,,...,2279841.0,178594.0,0,0,0,0,0,,2.6,57181.761


- Load and clean JHU data
- Merge JHU dataset with main dataset

In [13]:
#hide_input

# Slow cell: contacts the Kaggle servers and unpacks the JHU dataset locally
kaggle_api = kaggle.api
kaggle_api.authenticate()
kaggle_api.dataset_download_files(
    'benhamner/jhucovid19',
    path='./kaggle/input/jhucovid19/',
    unzip=True,
)

In [14]:
#hide_input

# Read every JHU daily report into one DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'

# Every file under dir_jhu except .gitignore / README entries
report_paths = [
    os.path.join(dirname, file)
    for dirname, _, files in os.walk(dir_jhu)
    for file in files
    if 'gitignore' not in file and 'README' not in file
]
df_list = [pd.read_csv(report_path) for report_path in report_paths]

# Stack the reports; columns missing from a file are filled with NaN
jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)

# The daily reports use two naming schemes ('Last Update' / 'Last_Update',
# 'Country/Region' / 'Country_Region', 'Province/State' / 'Province_State').
# Each pair is merged into a single column below.

# convert Last Update columns to plain dates (the time of day is dropped)
jhu_df.loc[:, 'Last Update'] = pd.to_datetime(jhu_df['Last Update']).apply(lambda x: x.date())
jhu_df.loc[:, 'Last_Update'] = pd.to_datetime(jhu_df['Last_Update']).apply(lambda x: x.date())

# Combine Last Update with Last_Update
jhu_df['LastUpdate'] = jhu_df['Last_Update'].combine_first(jhu_df['Last Update'])

# Combine Country/Region with Country_Region
jhu_df['CountryRegion'] = jhu_df['Country/Region'].combine_first(jhu_df['Country_Region'])

# Retrieve only US data. Take an explicit copy so the column assignment below
# writes to an independent frame rather than to a slice of the full table
# (which triggers SettingWithCopyWarning).
jhu_df = jhu_df[jhu_df['CountryRegion']=='US'].copy()

# Combine Province/State with Province_State
jhu_df['ProvinceState'] = jhu_df['Province/State'].combine_first(jhu_df['Province_State'])

# Drop unnecessary columns
jhu_df = jhu_df.drop(['Admin2', 'Lat', 'Latitude', 'Long_', 'Longitude', 'Combined_Key', 'Country/Region',
                      'Country_Region', 'Province/State', 'Province_State',
                      'Last Update', 'Last_Update', 'FIPS'], axis=1)

# Lead with CountryRegion, ProvinceState, LastUpdate; keep the rest in their order
leading = ['CountryRegion', 'ProvinceState', 'LastUpdate']
cols = list(jhu_df)
for position, name in enumerate(leading):
    cols.remove(name)
    cols.insert(position, name)
jhu_df = jhu_df.loc[:, cols]

# Lookup from two-letter abbreviation to full state name, built from `us.states.STATES`
state_abbrs_dict = {entry.abbr: entry.name for entry in us.states.STATES}

def toState(input_state, mapping):
    """Translate a location label ending in a two-letter code via ``mapping``.

    The last two characters of ``input_state`` (after stripping trailing
    whitespace) are looked up in ``mapping``; e.g. ``'Chicago, IL'`` becomes
    ``mapping['IL']``.

    Parameters
    ----------
    input_state : str or missing value
        Location label. Non-string values (NaN, None) are returned unchanged
        instead of raising ``AttributeError``.
    mapping : mapping
        Lookup from two-character code to replacement value.

    Returns
    -------
    The mapped value when the trailing code is a key of ``mapping``,
    otherwise ``input_state`` itself.
    """
    # Missing values have no text to inspect; hand them back untouched
    if not isinstance(input_state, str):
        return input_state

    abbreviation = input_state.rstrip()[-2:]
    try:
        return mapping[abbreviation]
    except KeyError:
        # No known code at the end of the label: keep the label as it is
        return input_state

# Normalise labels to full state names; 'Washington, D.C.' is handled explicitly
jhu_df['ProvinceState'] = jhu_df['ProvinceState'].apply(lambda x: toState(x, state_abbrs_dict) if x != 'Washington, D.C.' else 'District of Columbia')

# Filter out unknown states (anything not present in the main dataset)
jhu_df = jhu_df[jhu_df['ProvinceState'].isin(all_cases.state.unique().tolist())]

# Merge-sum rows with same date and State.
# The aggregations are named as strings: passing the Python builtin `sum` to
# `agg` is deprecated in recent pandas.
# NOTE(review): rows are keyed by the date of the update timestamp, not by the
# report file they came from -- confirm a state cannot appear with the same
# LastUpdate in two daily files, otherwise these sums double-count.
jhu_df = jhu_df.groupby(['LastUpdate', 'ProvinceState']).agg(
    {
        'Active': 'sum',
        'Confirmed': 'sum',
        'Deaths': 'sum',
        'Recovered': 'sum'
    }
).reset_index()

#jhu_df.tail(50)

#jhu_df.tail(50)

In [15]:
# Preview the last 50 rows of the aggregated Johns Hopkins data:
jhu_df.tail(50)

Unnamed: 0,LastUpdate,ProvinceState,Active,Confirmed,Deaths,Recovered
5757,2020-07-01,Alaska,923.0,937.0,14.0,0.0
5758,2020-07-01,Arizona,77583.0,79228.0,1645.0,0.0
5759,2020-07-01,Arkansas,20507.0,20777.0,270.0,0.0
5760,2020-07-01,California,225150.0,231232.0,6082.0,0.0
5761,2020-07-01,Colorado,31008.0,32698.0,1690.0,0.0
5762,2020-07-01,Connecticut,42192.0,46514.0,4322.0,0.0
5763,2020-07-01,Delaware,10965.0,11474.0,509.0,0.0
5764,2020-07-01,District of Columbia,9776.0,10327.0,551.0,0.0
5765,2020-07-01,Florida,148929.0,152434.0,3505.0,0.0
5766,2020-07-01,Georgia,78486.0,81291.0,2805.0,0.0


In [16]:
#hide_input

# Now that we have the JHU dataset relatively cleaned
# we can go ahead and merge its data with our main dataset

# Look up the JHU row for every (date, state) pair in one left merge instead of
# filtering jhu_df once per row. jhu_df comes out of a groupby on
# (LastUpdate, ProvinceState), so the merge must keep the row count and order.
jhu_lookup = all_cases[['date', 'state']].merge(
    jhu_df[['LastUpdate', 'ProvinceState', 'Active']],
    how='left',
    left_on=['date', 'state'],
    right_on=['LastUpdate', 'ProvinceState'],
    indicator=True,
)
assert len(jhu_lookup) == len(all_cases), 'jhu_df has duplicate (LastUpdate, ProvinceState) rows'

# Where JHU has a row for that day and state, its 'Active' figure replaces ours.
# ('positive', 'recovered' and 'death' are deliberately left untouched -- JHU
# was inconsistent for those.)
has_jhu_row = (jhu_lookup['_merge'] == 'both').to_numpy()
all_cases.loc[has_jhu_row, 'active'] = jhu_lookup.loc[has_jhu_row, 'Active'].to_numpy()

# Replace unknown recovery numbers with 0
all_cases['recovered'] = all_cases['recovered'].fillna(0)

# Fall back to positive - recovered - death where 'active' is still unknown or 0.
# The test looks at the value *after* the JHU update, so a JHU figure is kept
# even when our own value was missing.
needs_fallback = all_cases['active'].isna() | (all_cases['active'] == 0)
fallback_active = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
all_cases['active'] = all_cases['active'].mask(needs_fallback, fallback_active)

#all_cases.tail()

#all_cases.tail()

In [17]:
# Show the last rows of the merged frame (the earliest dates, per the output
# below) to check that the history reaches back to the first US case.
all_cases.tail()

Unnamed: 0,date,state,abbrev,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,...,negativeTestsViral,positiveCasesViral,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade,bedsPerThousand,total_beds
6182,2020-01-26,Washington,WA,7797095,2.0,2.0,,,,,...,,,0,0,0,0,0,,1.7,13255.0615
6183,2020-01-25,Washington,WA,7797095,2.0,2.0,,,,,...,,,0,0,0,0,0,,1.7,13255.0615
6184,2020-01-24,Washington,WA,7797095,2.0,2.0,,,,,...,,,0,0,0,0,0,,1.7,13255.0615
6185,2020-01-23,Washington,WA,7797095,2.0,2.0,,,,,...,,,0,0,0,0,0,,1.7,13255.0615
6186,2020-01-22,Washington,WA,7797095,2.0,2.0,,,,,...,,,0,0,0,0,0,,1.7,13255.0615


In [18]:
#hide_input
# Save formatted dataset offline in case of disaster
dataset_file = 'results/all_cases.csv'
# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(dataset_file), exist_ok=True)
all_cases.to_csv(dataset_file)

In [19]:
#hide_input

# Parse the 'date' column into pandas datetimes
parsed_dates = pd.to_datetime(all_cases['date'])
all_cases['date'] = parsed_dates

# An Exploratory data analysis of the US dataset.
# Validate data types and data integrity of each row.


The NaN values may indicate that there were too few Covid-19 patients at these date points.
We further analyse the statistical values of the dataset columns to ensure data integrity and accuracy. 

In [None]:
# Reload the saved dataset; the first CSV column is the saved index
dataset_file = 'results/all_cases.csv'
covid_df = pd.read_csv(dataset_file, index_col=0)

# Dates come back from CSV as text: parse them again
parsed_dates = pd.to_datetime(covid_df['date'])
covid_df['date'] = parsed_dates

# Column dtypes and non-null counts
covid_df.info()


def _three_decimals(x):
    """Render a float with three decimals for DataFrame display."""
    return '%.3f' % x


# set float format to 3 decimals
pd.set_option('display.float_format', _three_decimals)

In [None]:
# Validate the data with summary statistics (count, mean, standard deviation, min/max, quartiles):
covid_df.describe()
# TODO rounding up the numbers

In [None]:
#hide_input

# drop unnecessary columns
covid_cleaned = covid_df.drop(['hospitalized', 'bedsPerThousand'], axis=1)

# list of columns to transform to per 100k
columns_list = ['positive', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']

# Divide every listed column by the state's population in one step,
# then scale to "per 100,000 inhabitants".
per_100k = covid_cleaned[columns_list].div(covid_cleaned['population'], axis=0) * 100000

# '<column>_100k' for everything except total_beds, which becomes 'BedsPer100k'
per_100k.columns = [
    'BedsPer100k' if column == 'total_beds' else '{}_100k'.format(column)
    for column in columns_list
]

# Absolute columns are replaced by their per-100k counterparts (appended at the end)
covid_100k = pd.concat([covid_cleaned.drop(columns_list, axis=1), per_100k], axis=1)

In [None]:
#hide_input

# Restrict the per-100k frame to the window (start_date, end_date]
# NOTE(review): this window spans 2020-04-18 to 2020-06-19, while the window used
# for covid_cleaned_last_month ends on 2020-05-19 -- confirm the difference is intended.
covid_100k['date'] = pd.to_datetime(covid_100k['date'])
start_date = '2020-04-18'
end_date = '2020-06-19'
after_start = covid_100k['date'] > start_date
up_to_end = covid_100k['date'] <= end_date
mask = after_start & up_to_end
covid_100k_last_month = covid_100k.loc[mask]

In [None]:
#hide_input

# Columns reported as day-over-day change vs. columns reported as daily level
daily_change_columns = ['positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCumulative_100k']
level_columns = ['inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']

# Sum over states per date. The columns are selected *before* summing so that
# text columns (state names, grades) never enter the aggregation, and the
# groupby runs once instead of twice.
# Note: this adds up per-state rates; it is not a population-weighted national rate.
daily_totals = covid_100k_last_month.groupby('date')[daily_change_columns + level_columns].sum()

# Day-over-day difference of the summed figures (first date is NaN)
covid_100k_last_month_part1 = daily_totals[daily_change_columns].diff(periods=1, axis=0)

# Summed figures taken as they are
covid_100k_last_month_part2 = daily_totals[level_columns]

# Join both parts on the date index
final_100k_last_month = covid_100k_last_month_part1.merge(covid_100k_last_month_part2, left_index=True, right_index=True)

In [None]:
#final_100k_last_month.head()

In [None]:
# Review the output for the per-capita (per 100k) measures:
final_100k_last_month.describe()

In [None]:
#hide_input

# save the summary statistics of the per-100k dataset to csv
describe_file = 'results/final_100k_last_month.csv'
# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(describe_file), exist_ok=True)
final_100k_last_month.describe().to_csv(describe_file)

## Graphical Exploratory Analysis.

Plotting histograms, scatterplots and boxplots to assess the distribution of the entire US dataset. 

In [None]:
#hide_input

# Build the numeric-only frames used for plotting
# (categorical state/abbreviation columns and the date column are left out)
covid_cleaned['date'] = pd.to_datetime(covid_cleaned['date'])

# Window (start_date, end_date] for the absolute figures
# NOTE(review): covid_100k_last_month was cut with a window ending 2020-06-19,
# so the two plotting frames below cover different periods -- confirm.
start_date = '2020-04-18'
end_date = '2020-05-19'
after_start = covid_cleaned['date'] > start_date
up_to_end = covid_cleaned['date'] <= end_date
mask = after_start & up_to_end
covid_cleaned_last_month = covid_cleaned.loc[mask]

absolute_columns = ['population', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']
per_100k_columns = ['population', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']
plot_df = covid_cleaned_last_month[absolute_columns]
plot_df_last_month = covid_100k_last_month[per_100k_columns]

In [None]:
#hide_input

# Per-date totals of the per-100k figures over all states
usa_columns = ['date', 'positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']
usa_daily = covid_100k.loc[:, usa_columns].groupby('date').sum()
timeseries_usa_df = usa_daily.reset_index()
# timeseries_usa_df['log_positive'] = np.log(timeseries_usa_df['positive_100k'])
# timeseries_usa_df['log_active'] = np.log(timeseries_usa_df['active_100k'])
# timeseries_usa_df['log_recovered'] = np.log(timeseries_usa_df['recovered_100k'])
# timeseries_usa_df['log_death'] = np.log(timeseries_usa_df['death_100k'])

In [None]:
# Validate the US-wide time series by looking at its last rows:
timeseries_usa_df.tail()

In [None]:
#hide_input

# get data from last day
# plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18'] 

# Histograms with a fitted normal curve: shows distribution shape, skewness and scale.
# One panel per column, filled row by row in a 4 x 2 grid.
fig, axs = plt.subplots(4, 2, figsize=(16, 16))
sns.set()
for i, column in enumerate(plot_df_last_month.columns):
    grid_row, grid_col = divmod(i, 2)
    ax = axs[grid_row, grid_col]
    sns.distplot(
        plot_df_last_month[column],
        fit=norm,
        fit_kws=dict(label='normality'),
        hist_kws=dict(color='plum', edgecolor='k', linewidth=1, label='frequency'),
        ax=ax,
        color='#9d53ad',
    )
    ax.legend(loc='upper right')
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input

# Looking at linearity and variance with scatterplots
# Removing the target variable and saving it in another df
target = plot_df.hospitalizedCurrently
indep_var = plot_df.drop(columns=['hospitalizedCurrently'])

# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var.columns):
    ax = fig.add_subplot(4, 3, i + 1)
    sns.regplot(x=indep_var[col], y=target, data=indep_var, label=col, scatter_kws={'s':10}, line_kws={"color": "plum", 'label': 'hospitCurr'}, ax=ax)
    ax.legend()
# The figure title only needs to be set once, not once per panel
fig.suptitle('Scatterplots with Target Hospitalized Patients Showing Growth Trajectories', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input

# Assessing the normality of the distribution with a boxplot
# Boxplot with removed outliers (showfliers=False)
# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 12))
for i, col in enumerate(plot_df.columns):
    ax = fig.add_subplot(4, 3, i + 1)
    sns.boxplot(x=plot_df[col], data=plot_df, color='lightblue', showfliers=False, ax=ax)
# The figure title only needs to be set once, not once per panel
fig.suptitle('Boxplots of Independent Variables', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

## Analysis of Hospitalizations by State.

#### Red plots are states with a Republican governor. Blue plots are states with a Democratic governor.

### Alabama

In [None]:
#hide_input
# Rows for Alabama (abbreviation and full name must both match)
is_alabama = (covid_df['abbrev'] == 'AL') & (covid_df['state'] == 'Alabama')
bama = covid_df.loc[is_alabama]


In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently hospitalised in Alabama (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(bama.date, bama.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Alabama Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: active cases in Alabama (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(bama.date, bama.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Alabama', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: currently hospitalised patients as a percentage of active cases
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(bama.date, bama.hospitalizedCurrently/bama.active*100, linewidth=4.7, color='r')
plt.title('Hospitalization Rate in Alabama', fontsize=23)
plt.xlabel('Date')
# The plotted value is a percentage, not a patient count
plt.ylabel('% Active Cases in Hospital')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: cumulative deaths in Alabama (red line)
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(bama.date, bama.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cumulative")
plt.title('Cumulative Number Killed in Alabama', fontsize=23)
plt.xlabel('Date')
plt.ylabel('No. Killed')

In [None]:
#hide_input
# Keep only the columns used by the Alabama scatter plots; 'hospitalized' is
# left out (the original note marks it as a NaN column).
# Selecting from `bama` itself keeps this cell re-runnable: the same six
# columns are still present after the first run.
bama = bama[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots AL
# Split dependent var from independent variables
target_al = bama.hospitalizedCurrently
indep_var_al = bama.drop(columns=['hospitalizedCurrently'])

# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_al.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_al[col], y=target_al, data=indep_var_al, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"}, ax=ax)
# The figure title only needs to be set once, not once per panel
fig.suptitle('Distributions of Independent Variables Alabama', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endalabama

### Arizona

In [None]:
#hide_input
# Rows for Arizona (abbreviation and full name must both match)
is_arizona = (covid_df['abbrev'] == 'AZ') & (covid_df['state'] == 'Arizona')
arizona = covid_df.loc[is_arizona]


In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently hospitalised in Arizona (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(arizona.date, arizona.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Arizona Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: active cases in Arizona (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(arizona.date, arizona.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Cases in Arizona Currently', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: active cases in Arizona (red line)
# NOTE(review): same data, title and labels as the preceding cell -- likely a
# leftover duplicate; consider removing one of the two.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(arizona.date, arizona.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Cases in Arizona Currently', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: currently hospitalised patients as a percentage of active cases
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(arizona.date, arizona.hospitalizedCurrently/arizona.active*100, linewidth=4.7, color='r')
# Title spelling corrected ("Active")
plt.title('Hospitalization Rate of Active Patients in Arizona', fontsize=23)
plt.xlabel('Date')
plt.ylabel('% Positive Cases in Hospital')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently on a ventilator in Arizona (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(arizona.date, arizona.onVentilatorCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Arizona Currently on Ventilator', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Keep only the columns used by the Arizona scatter plots; 'hospitalized' is
# left out (the original note marks it as a NaN column).
# Selecting from `arizona` itself keeps this cell re-runnable: the same six
# columns are still present after the first run.
arizona = arizona[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots AZ
# Split dependent var from independent variables
target_az = arizona.hospitalizedCurrently
indep_var_az = arizona.drop(columns=['hospitalizedCurrently'])

# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_az.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_az[col], y=target_az, data=indep_var_az, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"}, ax=ax)
# The figure title only needs to be set once, not once per panel
fig.suptitle('Distributions of Independent Variables Arizona', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endarizona

### Arkansas

In [None]:
#hide_input
# Rows for Arkansas (abbreviation and full name must both match)
is_arkansas = (covid_df['abbrev'] == 'AR') & (covid_df['state'] == 'Arkansas')
arkansas = covid_df.loc[is_arkansas]


In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently hospitalised in Arkansas (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(arkansas.date, arkansas.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Arkansas Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently on a ventilator in Arkansas (red line)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(arkansas.date, arkansas.onVentilatorCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Arkansas Currently on a Ventilator', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: currently hospitalised patients as a percentage of active cases
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(arkansas.date, arkansas.hospitalizedCurrently/arkansas.active*100, linewidth=4.7, color='r')
plt.title('Hospitalization Rate in Arkansas', fontsize=23)
plt.xlabel('Date')
# The plotted value is a percentage, not a patient count
plt.ylabel('% Active Cases in Hospital')

In [None]:
#hide_input
# Keep only the columns used by the Arkansas scatter plots; 'hospitalized' is
# left out (the original note marks it as a NaN column).
# Selecting from `arkansas` itself keeps this cell re-runnable: the same six
# columns are still present after the first run.
arkansas = arkansas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots AR (Arkansas). The `_ak` suffix of the variable names is kept
# as-is in case later cells refer to them; note that AK is Alaska's code.
# Split dependent var from independent variables
target_ak = arkansas.hospitalizedCurrently
indep_var_ak = arkansas.drop(columns=['hospitalizedCurrently'])

# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_ak.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ak[col], y=target_ak, data=indep_var_ak, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"}, ax=ax)
# The figure title only needs to be set once, not once per panel
fig.suptitle('Distributions of Independent Variables Arkansas', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endarkansas

### California

In [None]:
#hide_input
# Rows for California (abbreviation and full name must both match)
is_california = (covid_df['abbrev'] == 'CA') & (covid_df['state'] == 'California')
cali = covid_df.loc[is_california]


In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently hospitalised in California (default line colour)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(cali.date, cali.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in California Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently in ICU in California (default line colour)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(cali.date, cali.inIcuCurrently, linewidth=4.7)
ax.set_title('Number of Patients in California Currently in ICU', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: currently hospitalised patients as a percentage of active cases
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(cali.date, cali.hospitalizedCurrently/cali.active*100, linewidth=4.7)
plt.title('Hospitalization Rate in California', fontsize=23)
plt.xlabel('Date')
# The plotted value is a percentage, not a patient count
plt.ylabel('% Active Cases in Hospital')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently hospitalised in California (default line colour)
# NOTE(review): plots the same series as the first California chart, only the
# title wording differs -- likely a leftover duplicate.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(cali.date, cali.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in CA Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Keep only the columns used by the California scatter plots; 'hospitalized' is
# left out (the original note marks it as a NaN column).
# Selecting from `cali` itself keeps this cell re-runnable: the same six
# columns are still present after the first run.
cali = cali[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots CA
# Split dependent var from independent variables
target_ca = cali.hospitalizedCurrently
indep_var_ca = cali.drop(columns=['hospitalizedCurrently'])

# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_ca.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ca[col], y=target_ca, data=indep_var_ca, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"}, ax=ax)
# The figure title only needs to be set once, not once per panel
fig.suptitle('Distributions of Independent Variables CA', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endcali

### Colorado

In [None]:
#hide_input
# Rows for Colorado (abbreviation and full name must both match)
is_colorado = (covid_df['abbrev'] == 'CO') & (covid_df['state'] == 'Colorado')
colorado = covid_df.loc[is_colorado]


In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: patients currently hospitalised in Colorado (default line colour)
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(colorado.date, colorado.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Colorado Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: cumulative deaths in Colorado
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(colorado.date, colorado.death, linewidth=4.7)
# Title spelling corrected ("Cumulative")
plt.title('Number of Cumulative Deaths in Colorado', fontsize=23)
plt.xlabel('Date')
plt.ylabel('No. Killed')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Time series: cumulative recoveries in Colorado
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(colorado.date, colorado.recovered, linewidth=4.7)
# Title spelling corrected ("Cumulative")
plt.title('Number of Cumulative Recoveries in Colorado', fontsize=23)
plt.xlabel('Date')
# This chart shows recoveries, so the axis must not read "No. Killed"
plt.ylabel('No. Recovered')

In [None]:
#hide_input
# Keep only the columns used by the Colorado scatter plots; 'hospitalized' is
# left out (the original note marks it as a NaN column).
# Selecting from `colorado` itself keeps this cell re-runnable: the same six
# columns are still present after the first run.
colorado = colorado[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots CO
# Split dependent var from independent variables
target_co = colorado.hospitalizedCurrently
indep_var_co = colorado.drop(columns=['hospitalizedCurrently'])

# Start from an empty figure: plt.subplots() would add a full-size Axes that
# stays visible behind the grid created with add_subplot.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_co.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_co[col], y=target_co, data=indep_var_co, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"}, ax=ax)
# The figure title only needs to be set once, not once per panel
fig.suptitle('Distributions of Independent Variables Colorado', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endcolorado

### Connecticut

In [None]:
#hide_input
# Rows for Connecticut (abbreviation and full name must both match)
is_connecticut = (covid_df['abbrev'] == 'CT') & (covid_df['state'] == 'Connecticut')
conn = covid_df.loc[is_connecticut]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of conn.hospitalizedCurrently against conn.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(conn.date, conn.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Connecticut Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of conn.death against conn.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(conn.date, conn.death, linewidth=4.7)
ax.set_title('Number of Cumulative Deaths in Connecticut', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of conn.recovered against conn.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(conn.date, conn.recovered, linewidth=4.7)
ax.set_title('Number of Cumulative Recoveries in Connecticut', fontsize=23)
ax.set_xlabel('Date')
# The series plotted is `recovered`, so the axis counts recoveries, not deaths.
ax.set_ylabel('No. Recovered')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
conn[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
conn.head()

# Omit the NaN cols
# Keeps only these six columns. `conn.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `conn`.
conn = conn[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots CT
# Split dependent var from independent variables
target_ct = conn.hospitalizedCurrently
indep_var_ct = conn.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Connecticut', fontsize=23)
# Iterate over this state's own columns so the cell does not depend on the
# Alabama frame (indep_var_al) having been created earlier in the session.
for i, col in enumerate(indep_var_ct.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ct[col], y=target_ct, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endconnecticut

### Delaware

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'DE' and whose state is 'Delaware'.
delaware = covid_df.loc[
    covid_df['abbrev'].eq('DE') & covid_df['state'].eq('Delaware')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of delaware.hospitalizedCurrently against delaware.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(delaware.date, delaware.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Delaware Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of delaware.death against delaware.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(delaware.date, delaware.death, linewidth=4.7)
ax.set_title('Number of Cumulative Deaths in Delaware', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
delaware[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
delaware.head()

# Omit the NaN cols
# Keeps only these six columns. `delaware.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `delaware`.
delaware = delaware[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots DE
# Split dependent var from independent variables
target_de = delaware.hospitalizedCurrently
indep_var_de = delaware.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Delaware', fontsize=18)
# Iterate over this state's own columns so the cell does not depend on the
# Alabama frame (indep_var_al) having been created earlier in the session.
for i, col in enumerate(indep_var_de.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_de[col], y=target_de, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###enddelaware

### Florida

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'FL' and whose state is 'Florida'.
fl = covid_df.loc[
    covid_df['abbrev'].eq('FL') & covid_df['state'].eq('Florida')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of fl.hospitalizedCumulative against fl.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(fl.date, fl.hospitalizedCumulative, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Patients in Florida Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of fl.totalTestsViral against fl.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(fl.date, fl.totalTestsViral, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Viral Tests in Florida', fontsize=23)
ax.set_xlabel('Date')
# The series is a test count, not a patient count.
ax.set_ylabel('No. Tests')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of fl.positiveTestsViral against fl.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(fl.date, fl.positiveTestsViral, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Positive Viral Tests in Florida', fontsize=23)
ax.set_xlabel('Date')
# The series is a count of positive tests, not of patients.
ax.set_ylabel('No. Positive Tests')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of positiveTestsViral as a percentage of totalTestsViral.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(fl.date, fl.positiveTestsViral/fl.totalTestsViral*100, linewidth=4.7, color='r')
ax.set_title('Viral Infection Rate in Florida', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('% Infected')

In [None]:
#hide_input
###endflorida

### Georgia

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'GA' and whose state is 'Georgia'.
georgia = covid_df.loc[
    covid_df['abbrev'].eq('GA') & covid_df['state'].eq('Georgia')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of georgia.hospitalizedCurrently against georgia.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(georgia.date, georgia.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Georgia Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of georgia.totalTestsViral against georgia.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(georgia.date, georgia.totalTestsViral, linewidth=4.7, color='r')
ax.set_title('Number of Cumulative Viral Tests in Georgia', fontsize=23)
ax.set_xlabel('Date')
# The series is a test count, not a patient count.
ax.set_ylabel('No. Tests')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of georgia.positiveTestsViral against georgia.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(georgia.date, georgia.positiveTestsViral, linewidth=4.7, color='r')
ax.set_title('Number of Cumulative Positive Viral Tests in Georgia', fontsize=23)
ax.set_xlabel('Date')
# The series is a count of positive tests, not of patients.
ax.set_ylabel('No. Positive Tests')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of positiveTestsViral as a percentage of totalTestsViral.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(georgia.date, georgia.positiveTestsViral/georgia.totalTestsViral*100, linewidth=4.7, color='r')
ax.set_title('Infection Rate in Georgia', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('% Infection Rate')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
georgia[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
georgia.head()

# Omit the NaN cols
# Keeps only these six columns. `georgia.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `georgia`.
georgia = georgia[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots GA
# Split dependent var from independent variables
target_ga = georgia.hospitalizedCurrently
indep_var_ga = georgia.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Georgia', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_ga.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ga[col], y=target_ga, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endgeorgia

### Hawaii

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'HI' and whose state is 'Hawaii'.
hawaii = covid_df.loc[
    covid_df['abbrev'].eq('HI') & covid_df['state'].eq('Hawaii')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hawaii.positive against hawaii.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(hawaii.date, hawaii.positive, linewidth=4.7)
ax.set_title('Number of Positive Patients in Hawaii', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hawaii.death against hawaii.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(hawaii.date, hawaii.death, linewidth=4.7)
ax.set_title('Cumulative Number Killed in Hawaii', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hawaii.recovered against hawaii.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(hawaii.date, hawaii.recovered, linewidth=4.7)
ax.set_title('Cumulative Number Recovered in Hawaii', fontsize=23)
ax.set_xlabel('Date')
# The series plotted is `recovered`, so the axis counts recoveries, not deaths.
ax.set_ylabel('No. Recovered')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hawaii.totalTestsViral against hawaii.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(hawaii.date, hawaii.totalTestsViral, linewidth=4.7)
ax.set_title('Cumulative Number of Viral Tests in Hawaii', fontsize=23)
ax.set_xlabel('Date')
# The series is a test count, not a death count.
ax.set_ylabel('No. Tests')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hawaii.positiveTestsViral against hawaii.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(hawaii.date, hawaii.positiveTestsViral, linewidth=4.7)
ax.set_title('Cumulative Number Positive Tests in Hawaii', fontsize=23)
ax.set_xlabel('Date')
# The series is a count of positive tests, not of deaths.
ax.set_ylabel('No. Positive Tests')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of positiveTestsViral as a percentage of totalTestsViral.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(hawaii.date, hawaii.positiveTestsViral/hawaii.totalTestsViral*100, linewidth=4.7)
ax.set_title('Viral Infection Rate in Hawaii', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('% Infected')

In [None]:
#hide_input
###endhawaii

### Idaho

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'ID' and whose state is 'Idaho'.
idaho = covid_df.loc[
    covid_df['abbrev'].eq('ID') & covid_df['state'].eq('Idaho')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of idaho.hospitalized against idaho.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(idaho.date, idaho.hospitalized, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Hospitalized Patients in Idaho', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of idaho.death against idaho.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(idaho.date, idaho.death, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Killed in Idaho', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of idaho.recovered against idaho.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(idaho.date, idaho.recovered, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Recovered Patients in Idaho', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
###endidaho

### Iowa

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'IA' and whose state is 'Iowa'.
iowa = covid_df.loc[
    covid_df['abbrev'].eq('IA') & covid_df['state'].eq('Iowa')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of iowa.hospitalizedCurrently against iowa.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(iowa.date, iowa.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Iowa Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of iowa.active against iowa.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(iowa.date, iowa.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Iowa', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(iowa.date, iowa.hospitalizedCurrently/iowa.active*100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Iowa', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of iowa.death against iowa.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(iowa.date, iowa.death, linewidth=4.7, color='r')
ax.set_title('Cumulative Number Killed in Iowa', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
iowa[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
iowa.head()

# Omit the NaN cols
# Keeps only these six columns. `iowa.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `iowa`.
iowa = iowa[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots IA
# Split dependent var from independent variables
target_ia = iowa.hospitalizedCurrently
indep_var_ia = iowa.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Iowa', fontsize=23)
# Iterate over this state's own columns so the cell does not depend on the
# Alabama frame (indep_var_al) having been created earlier in the session.
for i, col in enumerate(indep_var_ia.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ia[col], y=target_ia, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endiowa

### Kansas

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'KS' and whose state is 'Kansas'.
kansas = covid_df.loc[
    covid_df['abbrev'].eq('KS') & covid_df['state'].eq('Kansas')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of kansas.hospitalizedCumulative against kansas.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(kansas.date, kansas.hospitalizedCumulative, linewidth=4.7, color='r')
ax.set_title('Cumulative Number of Patients in Kansas Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of kansas.positiveCasesViral against kansas.date.
# NOTE(review): the title says "Active" but the Kentucky section titles the
# same column as cumulative positive cases -- confirm which is intended.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(kansas.date, kansas.positiveCasesViral, linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Kansas', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
###endkansas

### Kentucky

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'KY' and whose state is 'Kentucky'.
kentucky = covid_df.loc[
    covid_df['abbrev'].eq('KY') & covid_df['state'].eq('Kentucky')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of kentucky.hospitalizedCurrently against kentucky.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(kentucky.date, kentucky.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Kentucky Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of kentucky.inIcuCurrently against kentucky.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(kentucky.date, kentucky.inIcuCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Kentucky Currently in ICU', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of kentucky.positiveCasesViral against kentucky.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(kentucky.date, kentucky.positiveCasesViral, linewidth=4.7, color='r')
ax.set_title('Number of Cumulative Positive Cases in Kentucky', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
kentucky[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
kentucky.head()

# Omit the NaN cols
# Keeps only these six columns. `kentucky.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `kentucky`.
kentucky = kentucky[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots KY
# Split dependent var from independent variables
target_ky = kentucky.hospitalizedCurrently
indep_var_ky = kentucky.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Kentucky', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_ky.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ky[col], y=target_ky, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endkentucky

### Louisiana

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'LA' and whose state is 'Louisiana'.
louisiana = covid_df.loc[
    covid_df['abbrev'].eq('LA') & covid_df['state'].eq('Louisiana')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of louisiana.hospitalizedCurrently against louisiana.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(louisiana.date, louisiana.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Louisiana Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of louisiana.active against louisiana.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(louisiana.date, louisiana.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Louisiana', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(louisiana.date, louisiana.hospitalizedCurrently/louisiana.active*100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Louisiana', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of louisiana.death against louisiana.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(louisiana.date, louisiana.death, linewidth=4.7)
ax.set_title('Cumulative Number Killed in Louisiana', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
louisiana[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
louisiana.head()

# Omit the NaN cols
# Keeps only these six columns. `louisiana.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `louisiana`.
louisiana = louisiana[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots LA
# Split dependent var from independent variables
target_la = louisiana.hospitalizedCurrently
indep_var_la = louisiana.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Louisiana', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_la.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_la[col], y=target_la, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endlouisiana

### Maine

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'ME' and whose state is 'Maine'.
maine = covid_df.loc[
    covid_df['abbrev'].eq('ME') & covid_df['state'].eq('Maine')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maine.hospitalizedCurrently against maine.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maine.date, maine.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Maine Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maine.active against maine.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maine.date, maine.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Maine', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maine.date, maine.hospitalizedCurrently/maine.active*100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Maine', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maine.death against maine.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maine.date, maine.death, linewidth=4.7)
ax.set_title('Cumulative Number Killed in Maine', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
maine[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
maine.head()

# Omit the NaN cols
# Keeps only these six columns. `maine.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `maine`.
maine = maine[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots ME
# Split dependent var from independent variables
target_me = maine.hospitalizedCurrently
indep_var_me = maine.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Maine', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_me.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_me[col], y=target_me, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmaine

### Maryland

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'MD' and whose state is 'Maryland'.
maryland = covid_df.loc[
    covid_df['abbrev'].eq('MD') & covid_df['state'].eq('Maryland')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maryland.hospitalizedCurrently against maryland.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maryland.date, maryland.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Maryland Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maryland.active against maryland.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maryland.date, maryland.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Maryland', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maryland.inIcuCurrently against maryland.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maryland.date, maryland.inIcuCurrently, linewidth=4.7, color='r')
# The series is inIcuCurrently, so the title names ICU patients; the
# hospitalization-rate plot is a separate cell.
ax.set_title('Number of Patients in Maryland Currently in ICU', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maryland.date, maryland.hospitalizedCurrently/maryland.active*100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Maryland', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of maryland.death against maryland.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(maryland.date, maryland.death, linewidth=4.7, color='r')
ax.set_title('Cumulative Number Killed in Maryland', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
maryland[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
maryland.head()

# Omit the NaN cols
# Keeps only these six columns. `maryland.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `maryland`.
maryland = maryland[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MD
# Split dependent var from independent variables
target_md = maryland.hospitalizedCurrently
indep_var_md = maryland.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Maryland', fontsize=23)
# Iterate over this state's own columns so the cell does not depend on the
# Alabama frame (indep_var_al) having been created earlier in the session.
for i, col in enumerate(indep_var_md.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_md[col], y=target_md, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmaryland

### Massachusetts

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'MA' and whose state is 'Massachusetts'.
mass = covid_df.loc[
    covid_df['abbrev'].eq('MA') & covid_df['state'].eq('Massachusetts')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of mass.hospitalizedCurrently against mass.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mass.date, mass.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Massachusetts Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of mass.active against mass.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mass.date, mass.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Massachusetts', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mass.date, mass.hospitalizedCurrently/mass.active*100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Massachusetts', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of mass.death against mass.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mass.date, mass.death, linewidth=4.7, color='r')
ax.set_title('Cumulative Number Killed in Massachusetts', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
mass[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
mass.head()

# Omit the NaN cols
# Keeps only these six columns. `mass.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `mass`.
mass = mass[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MA
# Split dependent var from independent variables
target_ma = mass.hospitalizedCurrently
indep_var_ma = mass.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Massachusetts', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_ma.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ma[col], y=target_ma, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmass

### Michigan

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'MI' and whose state is 'Michigan'.
mich = covid_df.loc[
    covid_df['abbrev'].eq('MI') & covid_df['state'].eq('Michigan')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of mich.hospitalizedCurrently against mich.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mich.date, mich.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Michigan Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of mich.active against mich.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mich.date, mich.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Michigan', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mich.date, mich.hospitalizedCurrently/mich.active*100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Michigan', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of mich.death against mich.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mich.date, mich.death, linewidth=4.7)
ax.set_title('Cumulative Number Killed in Michigan', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
mich[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
mich.head()

# Omit the NaN cols
# Keeps only these six columns. `mich.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `mich`.
mich = mich[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MI
# Split dependent var from independent variables
target_mi = mich.hospitalizedCurrently
indep_var_mi = mich.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Michigan', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_mi.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_mi[col], y=target_mi, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmichigan

### Minnesota

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'MN' and whose state is 'Minnesota'.
minn = covid_df.loc[
    covid_df['abbrev'].eq('MN') & covid_df['state'].eq('Minnesota')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of minn.hospitalizedCurrently against minn.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(minn.date, minn.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Minnesota Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of minn.active against minn.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(minn.date, minn.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Minnesota', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(minn.date, minn.hospitalizedCurrently/minn.active*100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Minnesota', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of minn.death against minn.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(minn.date, minn.death, linewidth=4.7)
ax.set_title('Cumulative Number Killed in Minnesota', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
minn[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
minn.head()

# Omit the NaN cols
# Keeps only these six columns. `minn.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `minn`.
minn = minn[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MN
# Split dependent var from independent variables
target_mn = minn.hospitalizedCurrently
indep_var_mn = minn.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Minnesota', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_mn.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_mn[col], y=target_mn, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endminnesota

### Mississippi

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'MS' and whose state is 'Mississippi'.
miss = covid_df.loc[
    covid_df['abbrev'].eq('MS') & covid_df['state'].eq('Mississippi')
]


In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of miss.hospitalizedCurrently against miss.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(miss.date, miss.hospitalizedCurrently, linewidth=4.7, color='r')
ax.set_title('Number of Patients in Mississippi Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of miss.active against miss.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(miss.date, miss.active, linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Mississippi', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of hospitalizedCurrently as a percentage of active.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(miss.date, miss.hospitalizedCurrently/miss.active*100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Mississippi', fontsize=23)
ax.set_xlabel('Date')
# The plotted value is a percentage, not a patient count.
ax.set_ylabel('% Hospitalized')

In [None]:
#hide_input
# TODO: fix legend/axis/plot altogether
# Line plot of miss.death against miss.date.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(miss.date, miss.death, linewidth=4.7, color='r')
ax.set_title('Cumulative Number Killed in Mississippi', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Checking which cols have NaN values
# NOTE: neither expression below is the cell's last statement, so their
# results are computed and discarded (nothing is displayed).
miss[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
miss.head()

# Omit the NaN cols
# Keeps only these six columns. `miss.date` is dropped, so the time-series
# cells above cannot be re-run after this cell without re-creating `miss`.
miss = miss[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MS
# Split dependent var from independent variables
target_ms = miss.hospitalizedCurrently
indep_var_ms = miss.drop(columns=['hospitalizedCurrently'])

# A bare figure avoids the extra full-size axes that plt.subplots() would
# leave behind the 2x3 grid.
fig = plt.figure(figsize=(16, 16))
fig.suptitle('Distributions of Independent Variables Mississippi', fontsize=23)
# One regression scatter per remaining column against hospitalizedCurrently.
for i, col in enumerate(indep_var_ms.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.regplot(x=indep_var_ms[col], y=target_ms, label=col, ax=ax,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmississippi

### Missouri

In [None]:
#hide_input
# Rows of covid_df whose abbrev is 'MO' and whose state is 'Missouri'.
mo = covid_df.loc[
    covid_df['abbrev'].eq('MO') & covid_df['state'].eq('Missouri')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Missouri
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mo.date, mo.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in Missouri Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Missouri
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mo.date, mo.active, color='r', linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Missouri', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mo.date, mo.hospitalizedCurrently / mo.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Missouri', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Missouri
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mo.date, mo.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Missouri', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `mo` has been narrowed below.
display(mo.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
mo = mo[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MO
# Split dependent var from independent variables
target_mo = mo.hospitalizedCurrently
indep_var_mo = mo.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_mo.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_mo[col], y=target_mo, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Missouri', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmissouri

### Montana

In [None]:
#hide_input
# Rows of covid_df for Montana (abbreviation and full state name must both match)
mt = covid_df[
    covid_df['abbrev'].eq('MT') & covid_df['state'].eq('Montana')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Montana
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mt.date, mt.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Montana Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Montana
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mt.date, mt.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Montana', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mt.date, mt.hospitalizedCurrently / mt.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Montana', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Montana
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(mt.date, mt.death, linewidth=4.7)
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Montana', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `mt` has been narrowed below.
display(mt.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
mt = mt[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots MT
# Split dependent var from independent variables
target_mt = mt.hospitalizedCurrently
indep_var_mt = mt.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_mt.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_mt[col], y=target_mt, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Montana', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endmontana

### Nebraska

In [None]:
#hide_input
# Rows of covid_df for Nebraska (abbreviation and full state name must both match)
ne = covid_df[
    covid_df['abbrev'].eq('NE') & covid_df['state'].eq('Nebraska')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Nebraska
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ne.date, ne.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in Nebraska Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Nebraska
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ne.date, ne.active, color='r', linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Nebraska', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ne.date, ne.hospitalizedCurrently / ne.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Nebraska', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Nebraska
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ne.date, ne.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Nebraska', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `ne` has been narrowed below.
display(ne.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
ne = ne[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NE
# Split dependent var from independent variables
target_ne = ne.hospitalizedCurrently
indep_var_ne = ne.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_ne.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_ne[col], y=target_ne, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Nebraska', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endnebraska

### Nevada

In [None]:
#hide_input
# Rows of covid_df for Nevada (abbreviation and full state name must both match)
nevada = covid_df[
    covid_df['abbrev'].eq('NV') & covid_df['state'].eq('Nevada')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Nevada
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nevada.date, nevada.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Nevada Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently in ICU in Nevada
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nevada.date, nevada.inIcuCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Nevada Currently Hospitalized in ICU', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently on a ventilator in Nevada
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nevada.date, nevada.onVentilatorCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Nevada Currently on a Ventilator', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `nevada` has been narrowed below.
display(nevada.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
nevada = nevada[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NV
# Split dependent var from independent variables
target_nv = nevada.hospitalizedCurrently
indep_var_nv = nevada.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_nv.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_nv[col], y=target_nv, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Nevada', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endnevada

### New Hampshire

In [None]:
#hide_input
# Rows of covid_df for New Hampshire (abbreviation and full state name must both match)
nh = covid_df[
    covid_df['abbrev'].eq('NH') & covid_df['state'].eq('New Hampshire')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in New Hampshire
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nh.date, nh.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in New Hampshire Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in New Hampshire
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nh.date, nh.active, color='r', linewidth=4.7)
ax.set_title('Number of Active Positive Cases in New Hampshire', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nh.date, nh.hospitalizedCurrently / nh.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in New Hampshire', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for New Hampshire
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nh.date, nh.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in New Hampshire', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `nh` has been narrowed below.
display(nh.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
nh = nh[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NH
# Split dependent var from independent variables
target_nh = nh.hospitalizedCurrently
indep_var_nh = nh.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_nh.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_nh[col], y=target_nh, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables New Hampshire', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endnewhampshire

### New Jersey

In [None]:
#hide_input
# Rows of covid_df for New Jersey (abbreviation and full state name must both match)
nj = covid_df[
    covid_df['abbrev'].eq('NJ') & covid_df['state'].eq('New Jersey')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in New Jersey
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nj.date, nj.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in New Jersey Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in New Jersey
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nj.date, nj.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in New Jersey', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nj.date, nj.hospitalizedCurrently / nj.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in New Jersey', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for New Jersey
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nj.date, nj.death, linewidth=4.7)
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in New Jersey', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `nj` has been narrowed below.
display(nj.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
nj = nj[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NJ
# Split dependent var from independent variables
target_nj = nj.hospitalizedCurrently
indep_var_nj = nj.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_nj.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_nj[col], y=target_nj, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables New Jersey', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endnewjersey

### New Mexico

In [None]:
#hide_input
# Rows of covid_df for New Mexico (abbreviation and full state name must both match)
nm = covid_df[
    covid_df['abbrev'].eq('NM') & covid_df['state'].eq('New Mexico')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in New Mexico
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nm.date, nm.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in New Mexico Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in New Mexico
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nm.date, nm.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in New Mexico', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nm.date, nm.hospitalizedCurrently / nm.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in New Mexico', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for New Mexico
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nm.date, nm.death, linewidth=4.7)
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in New Mexico', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `nm` has been narrowed below.
display(nm.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
nm = nm[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NM
# Split dependent var from independent variables
target_nm = nm.hospitalizedCurrently
indep_var_nm = nm.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_nm.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_nm[col], y=target_nm, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables New Mexico', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endnewmexico

### New York

In [None]:
#hide_input
# Subset of covid_df whose abbreviation is NY
new_york = covid_df[covid_df['abbrev'].eq('NY')]
# Time series of patients currently hospitalized in New York
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(new_york.date, new_york.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in New York Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Subset of covid_df whose abbreviation is NY
new_york = covid_df[covid_df['abbrev'].eq('NY')]
# Time series of patients currently in ICU in New York
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(new_york.date, new_york.inIcuCurrently, linewidth=4.7)
ax.set_title('Number of Patients in New York Currently in ICU', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Subset of covid_df whose abbreviation is NY
new_york = covid_df[covid_df['abbrev'].eq('NY')]
# First figure: time series of patients currently on a ventilator
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(new_york.date, new_york.onVentilatorCurrently, linewidth=4.7)
ax.set_title('Number of Patients in New York Currently on a Ventilator', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

#hide_input
# The NY subset is rebuilt before the second figure, as in the original cell
new_york = covid_df[covid_df['abbrev'].eq('NY')]
# Second figure: time series of recoveries
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(new_york.date, new_york.recovered, linewidth=4.7)
ax.set_title('Number of Recoveries in New York', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# NOTE(review): this cell repeats the preceding cell exactly (ventilator and
# recoveries figures); confirm whether it should be removed.
# Subset of covid_df whose abbreviation is NY
new_york = covid_df[covid_df['abbrev'].eq('NY')]
# First figure: time series of patients currently on a ventilator
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(new_york.date, new_york.onVentilatorCurrently, linewidth=4.7)
ax.set_title('Number of Patients in New York Currently on a Ventilator', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

#hide_input
# The NY subset is rebuilt before the second figure, as in the original cell
new_york = covid_df[covid_df['abbrev'].eq('NY')]
# Second figure: time series of recoveries
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(new_york.date, new_york.recovered, linewidth=4.7)
ax.set_title('Number of Recoveries in New York', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Narrow new_york to the listed columns, leaving out the categorical and date cols
new_york = new_york[[
    'positive', 'active', 'hospitalizedCurrently', 'hospitalizedCumulative',
    'inIcuCurrently', 'recovered', 'death', 'hospitalized',
]]

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `new_york` has been narrowed below.
display(new_york.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
new_york = new_york[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NY
# Split dependent var from independent variables
target_ny = new_york.hospitalizedCurrently
indep_var_ny = new_york.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_ny.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_ny[col], y=target_ny, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables New York', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

### North Carolina

In [None]:
#hide_input
# Rows of covid_df for North Carolina (abbreviation and full state name must both match)
nc = covid_df[
    covid_df['abbrev'].eq('NC') & covid_df['state'].eq('North Carolina')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in North Carolina
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nc.date, nc.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in North Carolina Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in North Carolina
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nc.date, nc.active, color='r', linewidth=4.7)
ax.set_title('Number of Active Positive Cases in North Carolina', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nc.date, nc.hospitalizedCurrently / nc.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in North Carolina', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for North Carolina
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(nc.date, nc.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in North Carolina', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `nc` has been narrowed below.
display(nc.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
nc = nc[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots NC
# Split dependent var from independent variables
target_nc = nc.hospitalizedCurrently
indep_var_nc = nc.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_nc.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_nc[col], y=target_nc, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables North Carolina', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endnorthcarolina

### Ohio

In [None]:
#hide_input
# Rows of covid_df for Ohio (abbreviation and full state name must both match)
oh = covid_df[
    covid_df['abbrev'].eq('OH') & covid_df['state'].eq('Ohio')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Ohio
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oh.date, oh.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in Ohio Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Ohio
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oh.date, oh.active, color='r', linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Ohio', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oh.date, oh.hospitalizedCurrently / oh.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Ohio', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Ohio
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oh.date, oh.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Ohio', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `oh` has been narrowed below.
display(oh.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
oh = oh[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots OH
# Split dependent var from independent variables
target_oh = oh.hospitalizedCurrently
indep_var_oh = oh.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_oh.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_oh[col], y=target_oh, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Ohio', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endohio

### Oklahoma

In [None]:
#hide_input
# Rows of covid_df for Oklahoma (abbreviation and full state name must both match)
oklahoma = covid_df[
    covid_df['abbrev'].eq('OK') & covid_df['state'].eq('Oklahoma')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Oklahoma
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oklahoma.date, oklahoma.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in Oklahoma Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `oklahoma` has been narrowed below.
display(oklahoma.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
oklahoma = oklahoma[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots OK
# Split dependent var from independent variables
target_ok = oklahoma.hospitalizedCurrently
indep_var_ok = oklahoma.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_ok.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_ok[col], y=target_ok, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Oklahoma', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endoklahoma

### Oregon

In [None]:
#hide_input
# Rows of covid_df for Oregon (abbreviation and full state name must both match)
oregon = covid_df[
    covid_df['abbrev'].eq('OR') & covid_df['state'].eq('Oregon')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Oregon
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oregon.date, oregon.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Oregon Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Oregon
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oregon.date, oregon.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Oregon', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oregon.date, oregon.hospitalizedCurrently / oregon.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Oregon', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Oregon
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(oregon.date, oregon.death, linewidth=4.7)
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Oregon', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `oregon` has been narrowed below.
display(oregon.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
oregon = oregon[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots OR
# Split dependent var from independent variables
target_oregon = oregon.hospitalizedCurrently
indep_var_oregon = oregon.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_oregon.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_oregon[col], y=target_oregon, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Oregon', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endoregon

### Pennsylvania

In [None]:
#hide_input
# Rows of covid_df for Pennsylvania (abbreviation and full state name must both match)
pa = covid_df[
    covid_df['abbrev'].eq('PA') & covid_df['state'].eq('Pennsylvania')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Pennsylvania
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(pa.date, pa.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Pennsylvania Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Pennsylvania
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(pa.date, pa.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Pennsylvania', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(pa.date, pa.hospitalizedCurrently / pa.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Pennsylvania', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Pennsylvania
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(pa.date, pa.death, linewidth=4.7)
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Pennsylvania', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `pa` has been narrowed below.
display(pa.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
pa = pa[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots PA
# Split dependent var from independent variables
target_pa = pa.hospitalizedCurrently
indep_var_pa = pa.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_pa.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_pa[col], y=target_pa, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Pennsylvania', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endpennsylvania

### Rhode Island

In [None]:
#hide_input
# Rows of covid_df for Rhode Island (abbreviation and full state name must both match)
ri = covid_df[
    covid_df['abbrev'].eq('RI') & covid_df['state'].eq('Rhode Island')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in Rhode Island
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ri.date, ri.hospitalizedCurrently, linewidth=4.7)
ax.set_title('Number of Patients in Rhode Island Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of active positive cases in Rhode Island
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ri.date, ri.active, linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Rhode Island', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of currently hospitalized patients as a percentage of active cases
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ri.date, ri.hospitalizedCurrently / ri.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Rhode Island', fontsize=23)
ax.set_xlabel('Date')
# The plotted series is a percentage, not a patient count
ax.set_ylabel('% of Active Cases Hospitalized')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for Rhode Island
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(ri.date, ri.death, linewidth=4.7)
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Cumulative Number Killed in Rhode Island', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Show the NaN count of each candidate column. The previous bare expressions were
# not the cell's last statement, so their values were silently discarded.
# filter(items=...) skips columns that are absent, so the cell can be re-run
# after `ri` has been narrowed below.
display(ri.filter(items=['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']).isna().sum())

# Omit the NaN cols: keep only the columns used by the cells that follow
ri = ri[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots RI
# Split dependent var from independent variables
target_ri = ri.hospitalizedCurrently
indep_var_ri = ri.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size Axes that
# stays visible behind the 2x3 grid created with add_subplot below.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(indep_var_ri.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per independent variable, drawn on its own Axes explicitly
    sns.regplot(x=indep_var_ri[col], y=target_ri, ax=ax, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum"})
# The figure-level title only needs to be set once, after the panels exist
fig.suptitle('Distributions of Independent Variables Rhode Island', fontsize=23)
fig.tight_layout()
# Leave headroom for the suptitle
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endrhodeisland

### South Carolina

In [None]:
#hide_input
# Rows of covid_df for South Carolina (abbreviation and full state name must both match)
sc = covid_df[
    covid_df['abbrev'].eq('SC') & covid_df['state'].eq('South Carolina')
]


In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of patients currently hospitalized in South Carolina
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sc.date, sc.hospitalizedCurrently, color='r', linewidth=4.7)
ax.set_title('Number of Patients in South Carolina Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO: rework legend/axis/plot styling altogether
# Time series of the death column for South Carolina
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sc.date, sc.death, linewidth=4.7, color='r')
# Title spelling corrected ("Cummulative" -> "Cumulative")
ax.set_title('Number of Cumulative Deaths in South Carolina', fontsize=23)
ax.set_xlabel('Date')
# The series counts deaths, so use the same label as the other death plots
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sc.date, sc.totalTestsViral, linewidth=4.7, color='r')
ax.set_title('Number of Cummulative Viral Tests in South Carolina', fontsize=23)
ax.set_xlabel('Date')
# The y values are test counts (totalTestsViral), not patients.
ax.set_ylabel('No. Tests')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sc.date, sc.positiveTestsViral, linewidth=4.7, color='r')
ax.set_title('Number of Cummulative Positive Viral Tests in South Carolina', fontsize=23)
ax.set_xlabel('Date')
# The y values are positive test counts (positiveTestsViral), not patients.
ax.set_ylabel('No. Positive Tests')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of positive viral tests as a percentage of all viral tests
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sc['date'], sc['positiveTestsViral'] / sc['totalTestsViral'] * 100, linewidth=4.7, color='r')
ax.set_title('Viral Infection Rate in South Carolina', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('% Infection Rate')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `sc` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `sc`.
sc = sc[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots SC
# Split dependent var from independent variables
target_sc = sc.hospitalizedCurrently
indep_var_sc = sc.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables South Carolina', fontsize=23)
for i, col in enumerate(indep_var_sc.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_sc[col], y=target_sc, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endsouthcarolina

### South Dakota

In [None]:
#hide_input
# South Dakota rows only; both the abbreviation and the full name must match
sd = covid_df.loc[covid_df['abbrev'].eq('SD') & covid_df['state'].eq('South Dakota')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for South Dakota
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sd['date'], sd['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in South Dakota Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for South Dakota
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sd['date'], sd['active'], linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in South Dakota', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sd.date, sd.hospitalizedCurrently / sd.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in South Dakota', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for South Dakota
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(sd['date'], sd['death'], linewidth=4.7, color='r')
ax.set_title('Cummulative Number Killed in South Dakota', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `sd` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `sd`.
sd = sd[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots SD
# Split dependent var from independent variables
target_sd = sd.hospitalizedCurrently
indep_var_sd = sd.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables South Dakota', fontsize=23)
for i, col in enumerate(indep_var_sd.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_sd[col], y=target_sd, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endsouthdakota

### Tennessee

In [None]:
#hide_input
# Tennessee rows only; both the abbreviation and the full name must match
tn = covid_df.loc[covid_df['abbrev'].eq('TN') & covid_df['state'].eq('Tennessee')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Tennessee
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(tn['date'], tn['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in Tennessee Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for Tennessee
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(tn['date'], tn['active'], linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Tennessee', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(tn.date, tn.hospitalizedCurrently / tn.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Tennessee', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for Tennessee
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(tn['date'], tn['death'], linewidth=4.7, color='r')
ax.set_title('Cummulative Number Killed in Tennessee', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `tn` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `tn`.
tn = tn[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots TN
# Split dependent var from independent variables
target_tn = tn.hospitalizedCurrently
indep_var_tn = tn.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Tennessee', fontsize=23)
for i, col in enumerate(indep_var_tn.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_tn[col], y=target_tn, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endtennessee

### Texas

In [None]:
#hide_input
# Texas rows only; both the abbreviation and the full name must match
texas = covid_df.loc[covid_df['abbrev'].eq('TX') & covid_df['state'].eq('Texas')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Texas
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(texas['date'], texas['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in Texas Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(texas.date, texas.death, linewidth=4.7, color='r')
ax.set_title('Number of Cummulative Deaths in Texas', fontsize=23)
ax.set_xlabel('Date')
# The series is a death count, so label it like the other states' death plots
# instead of 'No. Patients'.
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(texas.date, texas.totalTestsViral, linewidth=4.7, color='r')
ax.set_title('Number of Cummulative Viral Tests in Texas', fontsize=23)
ax.set_xlabel('Date')
# The y values are test counts (totalTestsViral), not patients.
ax.set_ylabel('No. Tests')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `texas` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `texas`.
texas = texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots TX
# Split dependent var from independent variables
target_tx = texas.hospitalizedCurrently
indep_var_tx = texas.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables TX', fontsize=23)
for i, col in enumerate(indep_var_tx.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_tx[col], y=target_tx, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endtx

### Utah

In [None]:
#hide_input
# Utah rows only; both the abbreviation and the full name must match
utah = covid_df.loc[covid_df['abbrev'].eq('UT') & covid_df['state'].eq('Utah')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Utah
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(utah['date'], utah['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in UT Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of inIcuCurrently for Utah
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(utah['date'], utah['inIcuCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in UT Currently in ICU', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(utah.date, utah.death, linewidth=4.7, color='r')
ax.set_title('Number of Cummulative Deaths in Utah', fontsize=23)
ax.set_xlabel('Date')
# The series is a death count, so label it like the other states' death plots
# instead of 'No. Patients'.
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `utah` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `utah`.
utah = utah[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots UT
# Split dependent var from independent variables
target_ut = utah.hospitalizedCurrently
indep_var_ut = utah.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Utah', fontsize=23)
# Iterate Utah's own columns (this previously looped over indep_var_tx, which
# made the cell depend on the Texas section having been run first).
for i, col in enumerate(indep_var_ut.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_ut[col], y=target_ut, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endutah

### Vermont

In [None]:
#hide_input
# Vermont rows only; both the abbreviation and the full name must match
vt = covid_df.loc[covid_df['abbrev'].eq('VT') & covid_df['state'].eq('Vermont')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Vermont
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(vt['date'], vt['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in Vermont Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for Vermont
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(vt['date'], vt['active'], linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Vermont', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(vt.date, vt.hospitalizedCurrently / vt.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Vermont', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for Vermont
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(vt['date'], vt['death'], linewidth=4.7, color='r')
ax.set_title('Cummulative Number Killed in Vermont', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `vt` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `vt`.
vt = vt[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots VT
# Split dependent var from independent variables
target_vt = vt.hospitalizedCurrently
indep_var_vt = vt.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Vermont', fontsize=23)
for i, col in enumerate(indep_var_vt.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_vt[col], y=target_vt, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endvermont

### Virginia

In [None]:
#hide_input
# Virginia rows only; both the abbreviation and the full name must match
va = covid_df.loc[covid_df['abbrev'].eq('VA') & covid_df['state'].eq('Virginia')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Virginia
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(va['date'], va['hospitalizedCurrently'], linewidth=4.7)
ax.set_title('Number of Patients in Virginia Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for Virginia
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(va['date'], va['active'], linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Virginia', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(va.date, va.hospitalizedCurrently / va.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Virginia', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for Virginia
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(va['date'], va['death'], linewidth=4.7)
ax.set_title('Cummulative Number Killed in Virginia', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `va` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `va`.
va = va[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots VA
# Split dependent var from independent variables
target_va = va.hospitalizedCurrently
indep_var_va = va.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Virginia', fontsize=23)
for i, col in enumerate(indep_var_va.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_va[col], y=target_va, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endvirginia

### Washington

In [None]:
#hide_input
# Washington rows only; both the abbreviation and the full name must match
wa = covid_df.loc[covid_df['abbrev'].eq('WA') & covid_df['state'].eq('Washington')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Washington
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wa['date'], wa['hospitalizedCurrently'], linewidth=4.7)
ax.set_title('Number of Patients in Washington Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for Washington
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wa['date'], wa['active'], linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Washington', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wa.date, wa.hospitalizedCurrently / wa.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Washington', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for Washington
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wa['date'], wa['death'], linewidth=4.7)
ax.set_title('Cummulative Number Killed in Washington', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `wa` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `wa`.
wa = wa[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots WA
# Split dependent var from independent variables
target_wa = wa.hospitalizedCurrently
indep_var_wa = wa.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Washington', fontsize=23)
for i, col in enumerate(indep_var_wa.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_wa[col], y=target_wa, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endwashington

### West Virginia

In [None]:
#hide_input
# West Virginia rows only; both the abbreviation and the full name must match
wv = covid_df.loc[covid_df['abbrev'].eq('WV') & covid_df['state'].eq('West Virginia')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for West Virginia
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wv['date'], wv['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in West Virginia Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for West Virginia
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wv['date'], wv['active'], linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in West Virginia', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wv.date, wv.hospitalizedCurrently / wv.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in West Virginia', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for West Virginia
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wv['date'], wv['death'], linewidth=4.7, color='r')
ax.set_title('Cummulative Number Killed in West Virginia', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `wv` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `wv`.
wv = wv[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots WV
# Split dependent var from independent variables
target_wv = wv.hospitalizedCurrently
indep_var_wv = wv.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables West Virginia', fontsize=23)
for i, col in enumerate(indep_var_wv.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_wv[col], y=target_wv, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endwestvirginia

### Wisconsin

In [None]:
#hide_input
# Wisconsin rows only; both the abbreviation and the full name must match
wi = covid_df.loc[covid_df['abbrev'].eq('WI') & covid_df['state'].eq('Wisconsin')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Wisconsin
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wi['date'], wi['hospitalizedCurrently'], linewidth=4.7)
ax.set_title('Number of Patients in Wisconsin Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for Wisconsin
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wi['date'], wi['active'], linewidth=4.7)
ax.set_title('Number of Active Positive Cases in Wisconsin', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wi.date, wi.hospitalizedCurrently / wi.active * 100, linewidth=4.7)
ax.set_title('Hospitalization Rate in Wisconsin', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for Wisconsin
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wi['date'], wi['death'], linewidth=4.7)
ax.set_title('Cummulative Number Killed in Wisconsin', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `wi` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `wi`.
wi = wi[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots WI
# Split dependent var from independent variables
target_wi = wi.hospitalizedCurrently
indep_var_wi = wi.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Wisconsin', fontsize=23)
for i, col in enumerate(indep_var_wi.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_wi[col], y=target_wi, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endwisconsin

### Wyoming

In [None]:
#hide_input
# Wyoming rows only; both the abbreviation and the full name must match
wy = covid_df.loc[covid_df['abbrev'].eq('WY') & covid_df['state'].eq('Wyoming')]


In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of hospitalizedCurrently for Wyoming
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wy['date'], wy['hospitalizedCurrently'], linewidth=4.7, color='r')
ax.set_title('Number of Patients in Wyoming Currently Hospitalized', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the active column for Wyoming
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wy['date'], wy['active'], linewidth=4.7, color='r')
ax.set_title('Number of Active Positive Cases in Wyoming', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Patients')

In [None]:
#hide_input
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wy.date, wy.hospitalizedCurrently / wy.active * 100, linewidth=4.7, color='r')
ax.set_title('Hospitalization Rate in Wyoming', fontsize=23)
ax.set_xlabel('Date')
# The y values are hospitalizedCurrently/active*100, i.e. a percentage,
# not a patient count.
ax.set_ylabel('% Hospitalization Rate')

In [None]:
#hide_input
# TODO fix legend/axis/plot altogether
# Time series of the death column for Wyoming
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(wy['date'], wy['death'], linewidth=4.7, color='r')
ax.set_title('Cummulative Number Killed in Wyoming', fontsize=23)
ax.set_xlabel('Date')
ax.set_ylabel('No. Killed')

In [None]:
#hide_input
# Omit the NaN cols: keep only the columns used by the scatter plots below.
# NOTE: this narrows `wy` in place (the 'date' column is dropped), so the
# time-series cells above cannot be re-run afterwards without re-creating `wy`.
wy = wy[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]

In [None]:
#hide_input
# Scatter plots WY
# Split dependent var from independent variables
target_wy = wy.hospitalizedCurrently
indep_var_wy = wy.drop(columns=['hospitalizedCurrently'])

# Start from a bare figure: plt.subplots() would add a full-size axes that
# stays visible underneath the 2x3 grid built below.
fig = plt.figure(figsize=(16, 16))
# The title belongs to the figure, so set it once rather than per panel.
fig.suptitle('Distributions of Independent Variables Wyoming', fontsize=23)
for i, col in enumerate(indep_var_wy.columns):
    ax = fig.add_subplot(2, 3, i + 1)
    # One regression panel per column, drawn explicitly on this panel's axes.
    sns.regplot(x=indep_var_wy[col], y=target_wy, ax=ax, label=col,
                scatter_kws={'s': 10}, line_kws={"color": "plum"})
plt.tight_layout()
# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.95)

In [None]:
#hide_input
###endwyoming

## Assessing Correlation of Independent Variables.

In [None]:
#hide_input
# TODO add some explanation / look more into collinear variables

In [None]:
#hide_input
# Heatmap of correlations
# Pairwise Pearson correlations between the columns of covid_cleaned
corr = covid_cleaned.corr(method='pearson')
# Mask the upper triangle (diagonal included) so each pair is shown only once.
# Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(16, 16))

# Generate heatmap, drawn explicitly on the axes created above
sns.heatmap(corr, annot=True, mask=mask, cmap='GnBu', center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

## Build Model for Dependent Variables
- To be used to predict future projections


In [None]:
#hide_input
# We compare three models:
# - Polynomial Regression
# - Linear Regression
# - ElasticNet

# Columns removed from the modelling frame
unused_cols = ['abbrev', 'bedsPerThousand', 'hospitalized', 'state',
               'hospitalizedCumulative', 'dataQualityGrade', 'lastUpdateEt']

# Build model_df from a copy so all_cases itself is left untouched, and keep
# only the rows that have a value for hospitalizedCurrently.
model_df = (
    all_cases.copy()
    .drop(columns=unused_cols)
    .dropna(subset=['hospitalizedCurrently'])
)

# Drop rows whose hospitalised-to-active ratio is at or above the 99th
# percentile. The ratio is kept in a standalone Series rather than a temporary
# column, so nothing is written into the filtered frame (assigning a column to
# a dropna() result can trigger pandas' SettingWithCopyWarning).
ratio_hospital = model_df['hospitalizedCurrently'] / model_df['active']
model_df = model_df[~(ratio_hospital >= ratio_hospital.quantile(0.99))]

# Get peek of model to use
model_df.describe()