# COVID-19 prediction

## Import data and packages

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [127]:
# Load the raw dataset; 'covid.csv' is resolved relative to the current working directory.
df = pd.read_csv('covid.csv')

In [128]:
# preview the first rows of the raw data
df.head()

Unnamed: 0,state,date,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,hospital_onset_covid_coverage,...,previous_day_admission_pediatric_covid_confirmed_5_11,previous_day_admission_pediatric_covid_confirmed_5_11_coverage,previous_day_admission_pediatric_covid_confirmed_unknown,previous_day_admission_pediatric_covid_confirmed_unknown_coverage,staffed_icu_pediatric_patients_confirmed_covid,staffed_icu_pediatric_patients_confirmed_covid_coverage,staffed_pediatric_icu_bed_occupancy,staffed_pediatric_icu_bed_occupancy_coverage,total_staffed_pediatric_icu_beds,total_staffed_pediatric_icu_beds_coverage
0,ME,2021/01/20,3,28,8,2,29,8,7.0,38,...,,0,,0,,0,47.0,38,54.0,38
1,SD,2021/01/20,2,60,3,2,60,3,34.0,62,...,,0,,0,,0,33.0,62,83.0,62
2,MS,2021/01/17,12,95,2,16,91,2,10.0,107,...,,0,,0,,0,78.0,107,180.0,107
3,CT,2021/01/13,4,35,1,4,35,1,31.0,39,...,,0,,0,,0,196.0,39,335.0,39
4,MT,2021/01/13,10,56,1,11,55,1,18.0,66,...,,0,,0,,0,12.0,21,40.0,21


## Initial EDA

In [129]:
# (number of rows, number of columns) of the raw data
df.shape

(52445, 135)

### Let's filter out some features

Let's get a better understanding of the data types in our dataset.

In [130]:
# number of columns for each data type
df.dtypes.value_counts()

float64    77
int64      56
object      2
dtype: int64

What exactly are our object columns?

In [131]:
# names of the columns stored with the generic 'object' dtype
df.select_dtypes('object').columns


Index(['state', 'date'], dtype='object')

#### We can convert 'state' to a string and 'date' to a timestamp

In [132]:
# Replace both object columns in one step: 'date' is parsed into pandas
# timestamps and every 'state' value is cast to a Python string.
df = df.assign(
    date=pd.to_datetime(df['date']),
    state=df['state'].astype(str),
)


#### Remove columns in which the majority of values are NaN

In [133]:
def nan_cols(df, pct):
    """Return the columns of ``df`` whose share of missing values exceeds ``pct``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    pct : float
        Threshold expressed as a fraction between 0 and 1 (e.g. ``0.5`` for
        50 %), not as a percentage.

    Returns
    -------
    pandas.Index
        Labels of the columns whose fraction of null values is strictly
        greater than ``pct``.

    Raises
    ------
    ValueError
        If ``pct`` lies outside ``[0, 1]``. Such a value is usually a
        percentage passed by mistake (e.g. ``50``), which would otherwise
        silently select no column at all.
    """
    if not 0 <= pct <= 1:
        raise ValueError(f"pct must be a fraction in [0, 1], got {pct!r}")
    # mean of the boolean null-mask == fraction of missing values per column
    missing_share = df.isnull().mean()
    return df.columns[missing_share > pct]

In [134]:
# find the columns in which more than half of the values are missing ...
nan50 = nan_cols(df, 0.5)

# ... and carry on with a frame that no longer contains them
df = df.drop(columns=nan50)

#### Remove the influenza-related columns, since this analysis focuses specifically on COVID-19

In [135]:
# names of all columns that mention "influenza"
is_influenza = df.columns.str.contains('influenza')
influenza = df.columns[is_influenza].tolist()
influenza

['icu_patients_confirmed_influenza',
 'icu_patients_confirmed_influenza_coverage',
 'previous_day_admission_influenza_confirmed',
 'previous_day_admission_influenza_confirmed_coverage',
 'previous_day_deaths_covid_and_influenza',
 'previous_day_deaths_covid_and_influenza_coverage',
 'previous_day_deaths_influenza',
 'previous_day_deaths_influenza_coverage',
 'total_patients_hospitalized_confirmed_influenza',
 'total_patients_hospitalized_confirmed_influenza_and_covid',
 'total_patients_hospitalized_confirmed_influenza_and_covid_coverage',
 'total_patients_hospitalized_confirmed_influenza_coverage']

In [136]:
# continue without the influenza columns collected above
df = df.drop(columns=influenza)

#### Remove the columns with "coverage"

In [137]:
# keep only the columns whose name does not contain "coverage"
is_coverage = df.columns.str.contains('coverage')
df = df[df.columns[~is_coverage]]


In [138]:
# List all pairs of variables whose correlation is above 0.95 but not equal to 1
# (the latter removes the diagonal, i.e. each variable paired with itself).
# Only numeric columns are correlated: 'state' and 'date' are not numeric and
# pandas >= 2.0 raises on them instead of silently leaving them out.
corr = df.select_dtypes(include='number').corr()
# One combined mask replaces the chained boolean indexing. dropna() keeps only
# the entries that passed the mask, independently of how stack() handles NaN.
strong = (corr > 0.95) & (corr != 1)
pairs = (
    corr.where(strong)
    .stack()
    .dropna()
    .reset_index()
    .sort_values(by=0, ascending=False)
)
pairs

Unnamed: 0,level_0,level_1,0
75,total_staffed_adult_icu_beds,adult_icu_bed_utilization_denominator,0.999916
155,adult_icu_bed_utilization_denominator,total_staffed_adult_icu_beds,0.999916
40,staffed_adult_icu_bed_occupancy,adult_icu_bed_utilization_numerator,0.999886
144,adult_icu_bed_utilization_numerator,staffed_adult_icu_bed_occupancy,0.999886
160,adult_icu_bed_utilization_denominator,adult_icu_bed_covid_utilization_denominator,0.999836
...,...,...,...
48,staffed_icu_adult_patients_confirmed_covid,percent_of_inpatients_with_covid_numerator,0.950948
98,percent_of_inpatients_with_covid_numerator,previous_day_admission_adult_covid_confirmed,0.950347
26,previous_day_admission_adult_covid_confirmed,percent_of_inpatients_with_covid_numerator,0.950347
53,total_adult_patients_hospitalized_confirmed_an...,staffed_icu_adult_patients_confirmed_and_suspe...,0.950032


In [139]:
# number of distinct variables in the first column ('level_0') of the highly correlated pairs
pairs['level_0'].nunique()

33

**TODO:** Should we use some form of holistic regression to reduce the number of variables, or should we select them manually? <br>
**TODO:** Do we really want to keep the pediatric variables?

#### Remove "states" that are actually territories

In [140]:
# keep only the rows of states that have more than 900 data points
rows_per_state = df.groupby('state')['state'].transform('size')
df = df[rows_per_state > 900]

# number of states left in df
df['state'].nunique()

53

In [141]:
# exclude the rows belonging to VI, PR and AS
territories = ['VI', 'PR', 'AS']
is_territory = df['state'].isin(territories)
df = df[~is_territory]

# number of states left in df
df['state'].nunique()

51

In [142]:
# (number of rows, number of columns) after the filtering steps above
df.shape

(50088, 68)

In [143]:
# preview the first rows of the filtered data
df.head()

Unnamed: 0,state,date,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,inpatient_beds,...,on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses,on_hand_supply_therapeutic_b_bamlanivimab_courses,on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses,previous_week_therapeutic_a_casirivimab_imdevimab_courses_used,previous_week_therapeutic_b_bamlanivimab_courses_used,previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used,all_pediatric_inpatient_bed_occupied,all_pediatric_inpatient_beds,staffed_pediatric_icu_bed_occupancy,total_staffed_pediatric_icu_beds
0,ME,2021-01-20,3,28,8,2,29,8,7.0,3246.0,...,174.0,347.0,,1.0,54.0,,167.0,220.0,47.0,54.0
1,SD,2021-01-20,2,60,3,2,60,3,34.0,2857.0,...,519.0,914.0,,22.0,244.0,,297.0,363.0,33.0,83.0
2,MS,2021-01-17,12,95,2,16,91,2,10.0,8692.0,...,356.0,2668.0,,100.0,1514.0,,345.0,2453.0,78.0,180.0
3,CT,2021-01-13,4,35,1,4,35,1,31.0,8451.0,...,1625.0,3443.0,,10.0,194.0,,454.0,685.0,196.0,335.0
4,MT,2021-01-13,10,56,1,11,55,1,18.0,3220.0,...,1645.0,470.0,,130.0,947.0,,71.0,163.0,12.0,40.0


# Easter egg: gini impurity code for ML :)

In [118]:
# Gini impurity of a split described by two probabilities
def gini(p1, p2):
    """Return ``1 - (p1**2 + p2**2)``, the Gini impurity for probabilities p1 and p2."""
    squared_sum = p1 ** 2 + p2 ** 2
    return 1 - squared_sum

In [123]:
# first probability pair: p2 is the complement of p1
p1 = 2/11
p2 = abs(1 - p1)

# Gini impurity of that pair
g1 = gini(p1, p2)
g1

0.2975206611570249

In [124]:
# second probability pair: p2 is again the complement of p1
p1 = 9/15
p2 = abs(1 - p1)

# Gini impurity of that pair
g2 = gini(p1, p2)
g2

0.48

In [125]:
# weights of the two impurities: prop2 is the complement of prop1
prop1 = 11/26
prop2 = abs(1 - prop1)

# weighted combination of the two Gini impurities
weighted_g1 = prop1*g1
weighted_g2 = prop2*g2
gt = weighted_g1 + weighted_g2
gt

0.40279720279720277