# COVID-19 prediction

## Import data and packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
# load the raw dataset from a CSV file located in the working directory
df = pd.read_csv('covid.csv')

In [3]:
# preview the first rows to check that the file parsed as expected
df.head()

Unnamed: 0,state,date,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,hospital_onset_covid_coverage,...,previous_day_admission_pediatric_covid_confirmed_5_11,previous_day_admission_pediatric_covid_confirmed_5_11_coverage,previous_day_admission_pediatric_covid_confirmed_unknown,previous_day_admission_pediatric_covid_confirmed_unknown_coverage,staffed_icu_pediatric_patients_confirmed_covid,staffed_icu_pediatric_patients_confirmed_covid_coverage,staffed_pediatric_icu_bed_occupancy,staffed_pediatric_icu_bed_occupancy_coverage,total_staffed_pediatric_icu_beds,total_staffed_pediatric_icu_beds_coverage
0,ME,2021/01/20,3,28,8,2,29,8,7.0,38,...,,0,,0,,0,47.0,38,54.0,38
1,SD,2021/01/20,2,60,3,2,60,3,34.0,62,...,,0,,0,,0,33.0,62,83.0,62
2,MS,2021/01/17,12,95,2,16,91,2,10.0,107,...,,0,,0,,0,78.0,107,180.0,107
3,CT,2021/01/13,4,35,1,4,35,1,31.0,39,...,,0,,0,,0,196.0,39,335.0,39
4,MT,2021/01/13,10,56,1,11,55,1,18.0,66,...,,0,,0,,0,12.0,21,40.0,21


## Initial EDA

In [4]:
# (number of rows, number of columns) of the raw frame, before any filtering
df.shape

(52445, 135)

### Let's filter out some features

Let's first get a better understanding of the data types in our dataset.

In [5]:
# count how many columns there are of each dtype
df.dtypes.value_counts()

float64    77
int64      56
object      2
dtype: int64

Which columns exactly have the object dtype?

In [6]:
# list the labels of the columns stored with the generic 'object' dtype
df.select_dtypes(include='object').columns


Index(['state', 'date'], dtype='object')

#### We can convert 'state' to a string and 'date' to a timestamp

In [7]:
# parse the 'date' strings into pandas datetimes
df['date'] = pd.to_datetime(df['date'])

# cast the 'state' column to Python strings
# NOTE(review): astype(str) would also turn a missing state into the literal
# string 'nan', hiding it from later isna() checks -- confirm there are none
df['state'] = df['state'].astype(str)


#### Remove columns in which the majority of values are NaN

In [8]:
def nan_cols(df, pct):
    """Return the labels of the columns whose share of missing values exceeds ``pct``.

    ``pct`` is compared with the per-column mean of ``df.isnull()``, so it is a
    fraction between 0 and 1 (0.5 means 50%), not a percentage. The comparison
    is strict: a column with exactly ``pct`` missing values is not returned.
    """
    missing_share = df.isnull().mean()
    too_sparse = missing_share > pct
    return df.columns[too_sparse]

In [9]:
# labels of the columns where more than half of the values are missing
# (the threshold is a fraction: 0.5 = 50%)
nan50 = nan_cols(df, 0.5)

# rebind df to a frame without those columns
df = df.drop(columns=nan50)

#### Remove the influenza-related columns, as we are working specifically on COVID-19 data

In [10]:
# collect every column whose name mentions "influenza"; as the listing shows,
# this also catches the combined covid-and-influenza columns
influenza = [name for name in df.columns if 'influenza' in name]
influenza

['icu_patients_confirmed_influenza',
 'icu_patients_confirmed_influenza_coverage',
 'previous_day_admission_influenza_confirmed',
 'previous_day_admission_influenza_confirmed_coverage',
 'previous_day_deaths_covid_and_influenza',
 'previous_day_deaths_covid_and_influenza_coverage',
 'previous_day_deaths_influenza',
 'previous_day_deaths_influenza_coverage',
 'total_patients_hospitalized_confirmed_influenza',
 'total_patients_hospitalized_confirmed_influenza_and_covid',
 'total_patients_hospitalized_confirmed_influenza_and_covid_coverage',
 'total_patients_hospitalized_confirmed_influenza_coverage']

In [11]:
# rebind df to a frame without the influenza-related columns
df = df.drop(columns=influenza)

#### Remove the columns with "coverage"

In [12]:
# keep only the columns whose name does not contain "coverage"
is_coverage = df.columns.str.contains('coverage')
df = df.loc[:, ~is_coverage]


In [13]:
# Pairwise correlations between the numeric columns. The numeric columns are
# selected explicitly so that 'state' and 'date' stay out of the computation on
# every pandas version (pandas >= 2.0 no longer drops them silently).
corr = df.select_dtypes('number').corr()

# Keep only the strongly correlated pairs (> 0.95). Entries exactly equal to 1
# are excluded to remove the diagonal; note that this would also hide any
# off-diagonal pair that is perfectly correlated.
strong = (corr > 0.95) & (corr != 1)

# Long format, one row per pair, strongest first. Each pair appears twice,
# once as (a, b) and once as (b, a).
pairs = (
    corr.where(strong)
    .stack()
    .reset_index()
    .sort_values(by=0, ascending=False)
)
pairs

Unnamed: 0,level_0,level_1,0
75,total_staffed_adult_icu_beds,adult_icu_bed_utilization_denominator,0.999916
155,adult_icu_bed_utilization_denominator,total_staffed_adult_icu_beds,0.999916
40,staffed_adult_icu_bed_occupancy,adult_icu_bed_utilization_numerator,0.999886
144,adult_icu_bed_utilization_numerator,staffed_adult_icu_bed_occupancy,0.999886
160,adult_icu_bed_utilization_denominator,adult_icu_bed_covid_utilization_denominator,0.999836
...,...,...,...
48,staffed_icu_adult_patients_confirmed_covid,percent_of_inpatients_with_covid_numerator,0.950948
98,percent_of_inpatients_with_covid_numerator,previous_day_admission_adult_covid_confirmed,0.950347
26,previous_day_admission_adult_covid_confirmed,percent_of_inpatients_with_covid_numerator,0.950347
53,total_adult_patients_hospitalized_confirmed_an...,staffed_icu_adult_patients_confirmed_and_suspe...,0.950032


In [14]:
# number of distinct variables that appear in at least one high-correlation pair
# (counting level_0 is enough because each pair is listed in both orders)
pairs['level_0'].nunique()

33

**TODO:** Should we use some form of automated feature selection (e.g. holistic regression) to reduce the number of highly correlated variables, or should we prune them manually? <br>
**TODO:** Do we really want to keep the pediatric variables?

#### Remove "states" that are actually territories

In [15]:
# number of rows recorded for the state of each row
rows_per_state = df['state'].map(df['state'].value_counts())

# keep only the states that have more than 900 rows
df = df[rows_per_state > 900]

# number of distinct states left in df
df['state'].nunique()

53

In [16]:
# state codes to exclude from the analysis
territories = ['VI', 'PR', 'AS']
is_territory = df['state'].isin(territories)
df = df.loc[~is_territory]

# number of distinct states left in df
df['state'].nunique()

51

In [17]:
# (rows, columns) after the column and state filtering
df.shape

(50088, 68)

In [18]:
# preview the first rows of the filtered frame
df.head()

Unnamed: 0,state,date,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,inpatient_beds,...,on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses,on_hand_supply_therapeutic_b_bamlanivimab_courses,on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses,previous_week_therapeutic_a_casirivimab_imdevimab_courses_used,previous_week_therapeutic_b_bamlanivimab_courses_used,previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used,all_pediatric_inpatient_bed_occupied,all_pediatric_inpatient_beds,staffed_pediatric_icu_bed_occupancy,total_staffed_pediatric_icu_beds
0,ME,2021-01-20,3,28,8,2,29,8,7.0,3246.0,...,174.0,347.0,,1.0,54.0,,167.0,220.0,47.0,54.0
1,SD,2021-01-20,2,60,3,2,60,3,34.0,2857.0,...,519.0,914.0,,22.0,244.0,,297.0,363.0,33.0,83.0
2,MS,2021-01-17,12,95,2,16,91,2,10.0,8692.0,...,356.0,2668.0,,100.0,1514.0,,345.0,2453.0,78.0,180.0
3,CT,2021-01-13,4,35,1,4,35,1,31.0,8451.0,...,1625.0,3443.0,,10.0,194.0,,454.0,685.0,196.0,335.0
4,MT,2021-01-13,10,56,1,11,55,1,18.0,3220.0,...,1645.0,470.0,,130.0,947.0,,71.0,163.0,12.0,40.0


## Remove columns that still have too many NaNs

In [22]:
# NaN count per column; show the ten columns with the most missing values
nan_counts = df.isna().sum()
nan_counts.sort_values(ascending=False).head(10).to_frame()

Unnamed: 0,0
on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses,19640
previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used,19623
on_hand_supply_therapeutic_b_bamlanivimab_courses,15321
previous_week_therapeutic_b_bamlanivimab_courses_used,15285
on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses,15088
previous_week_therapeutic_a_casirivimab_imdevimab_courses_used,15087
total_staffed_pediatric_icu_beds,7768
all_pediatric_inpatient_beds,7764
staffed_pediatric_icu_bed_occupancy,7756
all_pediatric_inpatient_bed_occupied,7752


We can remove all the "therapeutic" columns, as each of them has more than 15,000 NaN values.

In [24]:
# collect every column whose name contains "therapeutic"
therapeutic = [name for name in df.columns if 'therapeutic' in name]
therapeutic

['on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses',
 'on_hand_supply_therapeutic_b_bamlanivimab_courses',
 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses',
 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used',
 'previous_week_therapeutic_b_bamlanivimab_courses_used',
 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used']

In [25]:
# rebind df to a frame without the therapeutic columns
df = df.drop(columns=therapeutic)

In [None]:
# save the cleaned (not yet imputed) frame; index=False leaves the row index out of the file
df.to_csv('covid_clean.csv', index=False)

## Data imputation

In [26]:
# set up a KNN imputer that uses 5 neighbours to fill missing values
# NOTE(review): this import sits in the middle of the notebook; imports
# normally live in the first code cell
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

In [28]:
# impute every column from position 2 onwards, i.e. everything after 'state'
# and 'date', which are the first two columns of the frame
# NOTE(review): the positional slice silently breaks if the column order
# changes -- selecting the numeric columns by dtype would be sturdier
# NOTE(review): the features are not scaled before this distance-based
# imputer, so large-valued columns presumably dominate the neighbour search --
# confirm this is intended
df.iloc[:, 2:] = imputer.fit_transform(df.iloc[:, 2:])
# disabled alternative that would rebuild the whole frame from the imputer output:
# df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [32]:
# total number of NaN values left in the whole frame (0 means nothing is missing)
df.isna().sum().sum()

0

In [None]:
# mode imputation for the string column 'state' (disabled)
# df['state'] = df['state'].fillna(df['state'].mode()[0])

In [33]:
# save the imputed frame; index=False leaves the row index out of the file
df.to_csv('covid_clean_imputted.csv', index=False)

### Let's build a correlation plot of the variables in our model

In [35]:
# Correlation matrix of the numeric columns. They are selected explicitly so
# that 'state' and 'date' stay out of the computation on every pandas version
# (pandas >= 2.0 raises instead of silently dropping non-numeric columns).
corrM = df.select_dtypes('number').corr()

# Colour the cells on one shared scale (axis=None) and show two decimals.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
corrM.style.background_gradient(cmap='coolwarm', axis=None).format(precision=2)

Unnamed: 0,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,inpatient_beds,inpatient_beds_used,inpatient_beds_used_covid,previous_day_admission_adult_covid_confirmed,previous_day_admission_adult_covid_suspected,previous_day_admission_pediatric_covid_confirmed,previous_day_admission_pediatric_covid_suspected,staffed_adult_icu_bed_occupancy,staffed_icu_adult_patients_confirmed_and_suspected_covid,staffed_icu_adult_patients_confirmed_covid,total_adult_patients_hospitalized_confirmed_and_suspected_covid,total_adult_patients_hospitalized_confirmed_covid,total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,total_pediatric_patients_hospitalized_confirmed_covid,total_staffed_adult_icu_beds,inpatient_beds_utilization,inpatient_beds_utilization_numerator,inpatient_beds_utilization_denominator,percent_of_inpatients_with_covid,percent_of_inpatients_with_covid_numerator,percent_of_inpatients_with_covid_denominator,inpatient_bed_covid_utilization,inpatient_bed_covid_utilization_numerator,inpatient_bed_covid_utilization_denominator,adult_icu_bed_covid_utilization,adult_icu_bed_covid_utilization_numerator,adult_icu_bed_covid_utilization_denominator,adult_icu_bed_utilization,adult_icu_bed_utilization_numerator,adult_icu_bed_utilization_denominator,previous_day_admission_adult_covid_confirmed_18-19,previous_day_admission_adult_covid_confirmed_20-29,previous_day_admission_adult_covid_confirmed_30-39,previous_day_admission_adult_covid_confirmed_40-49,previous_day_admission_adult_covid_confirmed_50-59,previous_day_admission_adult_covid_confirmed_60-69,previous_day_admission_adult_covid_confirmed_70-79,previous_day_admission_adult_covid_confirmed_80+,previous_day_admission_adult_covid_confirmed_unknown,previous_da
y_admission_adult_covid_suspected_18-19,previous_day_admission_adult_covid_suspected_20-29,previous_day_admission_adult_covid_suspected_30-39,previous_day_admission_adult_covid_suspected_40-49,previous_day_admission_adult_covid_suspected_50-59,previous_day_admission_adult_covid_suspected_60-69,previous_day_admission_adult_covid_suspected_70-79,previous_day_admission_adult_covid_suspected_80+,previous_day_admission_adult_covid_suspected_unknown,deaths_covid,all_pediatric_inpatient_bed_occupied,all_pediatric_inpatient_beds,staffed_pediatric_icu_bed_occupancy,total_staffed_pediatric_icu_beds
critical_staffing_shortage_today_yes,1.0,0.69,-0.11,0.88,0.64,-0.07,0.17,0.48,0.49,0.55,0.54,0.53,0.29,0.29,0.56,0.63,0.64,0.58,0.57,0.32,0.37,0.51,0.17,0.49,0.48,0.25,0.55,0.49,0.31,0.55,0.48,0.35,0.63,0.51,0.29,0.55,0.51,0.03,0.48,0.5,0.56,0.57,0.56,0.52,0.45,0.23,0.31,0.5,0.5,0.56,0.56,0.55,0.53,0.5,0.15,0.29,0.32,0.37,0.44,0.46
critical_staffing_shortage_today_no,0.69,1.0,-0.11,0.68,0.94,-0.07,0.34,0.72,0.73,0.5,0.53,0.62,0.38,0.45,0.74,0.54,0.54,0.54,0.52,0.4,0.39,0.75,0.2,0.73,0.73,0.06,0.51,0.73,0.12,0.51,0.73,0.13,0.55,0.74,0.25,0.74,0.75,0.03,0.48,0.49,0.51,0.53,0.52,0.51,0.49,0.2,0.4,0.61,0.61,0.63,0.65,0.64,0.62,0.59,0.19,0.21,0.54,0.58,0.56,0.56
critical_staffing_shortage_today_not_reported,-0.11,-0.11,1.0,0.06,0.02,0.9,0.18,0.45,0.42,0.29,0.24,0.3,0.36,0.38,0.41,0.23,0.22,0.22,0.21,0.39,0.4,0.39,-0.02,0.42,0.44,-0.0,0.27,0.42,-0.0,0.27,0.44,-0.04,0.23,0.4,0.1,0.41,0.4,0.01,0.28,0.26,0.24,0.21,0.22,0.22,0.23,0.2,0.22,0.29,0.29,0.3,0.27,0.27,0.26,0.25,0.15,0.13,0.4,0.34,0.27,0.22
critical_staffing_shortage_anticipated_within_week_yes,0.88,0.68,0.06,1.0,0.73,-0.07,0.23,0.58,0.59,0.55,0.56,0.56,0.38,0.3,0.64,0.6,0.61,0.59,0.59,0.38,0.47,0.6,0.21,0.59,0.58,0.19,0.56,0.59,0.25,0.56,0.58,0.26,0.6,0.6,0.31,0.64,0.6,0.03,0.53,0.53,0.55,0.56,0.57,0.55,0.52,0.21,0.36,0.54,0.55,0.59,0.6,0.59,0.57,0.56,0.13,0.26,0.37,0.42,0.46,0.46
critical_staffing_shortage_anticipated_within_week_no,0.64,0.94,0.02,0.73,1.0,-0.09,0.33,0.74,0.75,0.49,0.52,0.63,0.39,0.44,0.76,0.52,0.51,0.53,0.51,0.41,0.42,0.76,0.23,0.75,0.75,0.02,0.5,0.75,0.08,0.5,0.75,0.08,0.52,0.76,0.26,0.76,0.76,0.03,0.48,0.49,0.5,0.51,0.51,0.5,0.49,0.19,0.41,0.61,0.61,0.63,0.65,0.64,0.62,0.6,0.17,0.19,0.56,0.59,0.55,0.55
critical_staffing_shortage_anticipated_within_week_not_reported,-0.07,-0.07,0.9,-0.07,-0.09,1.0,0.18,0.42,0.39,0.3,0.24,0.29,0.34,0.41,0.39,0.26,0.26,0.23,0.22,0.38,0.36,0.36,-0.08,0.38,0.41,0.05,0.28,0.38,0.04,0.28,0.41,0.03,0.26,0.37,0.07,0.39,0.37,0.01,0.27,0.25,0.25,0.22,0.22,0.21,0.2,0.23,0.21,0.29,0.28,0.3,0.26,0.26,0.24,0.22,0.18,0.16,0.39,0.33,0.28,0.23
hospital_onset_covid,0.17,0.34,0.18,0.23,0.33,0.18,1.0,0.52,0.53,0.54,0.52,0.38,0.49,0.33,0.5,0.43,0.43,0.53,0.52,0.5,0.5,0.49,0.18,0.53,0.52,0.22,0.54,0.53,0.28,0.54,0.52,0.17,0.43,0.49,0.15,0.5,0.49,0.02,0.48,0.47,0.43,0.46,0.5,0.53,0.57,0.26,0.23,0.36,0.37,0.36,0.37,0.36,0.36,0.35,0.23,0.22,0.39,0.37,0.3,0.27
inpatient_beds,0.48,0.72,0.45,0.58,0.74,0.42,0.52,1.0,0.99,0.71,0.68,0.78,0.6,0.59,0.97,0.67,0.66,0.7,0.67,0.65,0.62,0.98,0.23,0.99,1.0,0.09,0.71,0.99,0.15,0.72,1.0,0.08,0.67,0.98,0.24,0.97,0.98,0.03,0.65,0.66,0.65,0.66,0.66,0.65,0.66,0.33,0.5,0.74,0.76,0.78,0.79,0.77,0.76,0.74,0.29,0.33,0.79,0.79,0.74,0.71
inpatient_beds_used,0.49,0.73,0.42,0.59,0.75,0.39,0.53,0.99,1.0,0.72,0.7,0.78,0.63,0.6,0.98,0.68,0.67,0.72,0.69,0.65,0.64,0.98,0.29,1.0,0.99,0.09,0.73,1.0,0.16,0.73,0.99,0.09,0.68,0.98,0.27,0.98,0.98,0.03,0.67,0.68,0.67,0.68,0.68,0.67,0.68,0.31,0.5,0.75,0.77,0.78,0.79,0.77,0.76,0.74,0.27,0.32,0.81,0.8,0.76,0.72
inpatient_beds_used_covid,0.55,0.5,0.29,0.55,0.49,0.3,0.54,0.71,0.72,1.0,0.93,0.7,0.69,0.5,0.76,0.93,0.94,0.97,0.96,0.72,0.77,0.71,0.21,0.72,0.71,0.48,1.0,0.72,0.57,1.0,0.71,0.46,0.92,0.71,0.3,0.76,0.71,0.04,0.84,0.87,0.89,0.92,0.93,0.91,0.88,0.43,0.4,0.63,0.65,0.69,0.7,0.68,0.67,0.66,0.28,0.52,0.52,0.54,0.54,0.53


In [36]:
# Boolean mask that is True on the diagonal and everywhere above it
upper = np.triu(np.ones(corrM.shape, dtype=bool))

# Blank out the diagonal and the upper half so that each pair is shown once
corrM = corrM.mask(upper)

(corrM
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 # Colour NaNs grey. The colour is passed positionally because the keyword was
 # renamed from `null_color` to `color` (the old name was removed in pandas 2.0).
 .highlight_null('#f1f1f1')
 # Styler.format(precision=...) replaces Styler.set_precision, which was
 # deprecated in pandas 1.3 and removed in pandas 2.0.
 .format(precision=2))

Unnamed: 0,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,inpatient_beds,inpatient_beds_used,inpatient_beds_used_covid,previous_day_admission_adult_covid_confirmed,previous_day_admission_adult_covid_suspected,previous_day_admission_pediatric_covid_confirmed,previous_day_admission_pediatric_covid_suspected,staffed_adult_icu_bed_occupancy,staffed_icu_adult_patients_confirmed_and_suspected_covid,staffed_icu_adult_patients_confirmed_covid,total_adult_patients_hospitalized_confirmed_and_suspected_covid,total_adult_patients_hospitalized_confirmed_covid,total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,total_pediatric_patients_hospitalized_confirmed_covid,total_staffed_adult_icu_beds,inpatient_beds_utilization,inpatient_beds_utilization_numerator,inpatient_beds_utilization_denominator,percent_of_inpatients_with_covid,percent_of_inpatients_with_covid_numerator,percent_of_inpatients_with_covid_denominator,inpatient_bed_covid_utilization,inpatient_bed_covid_utilization_numerator,inpatient_bed_covid_utilization_denominator,adult_icu_bed_covid_utilization,adult_icu_bed_covid_utilization_numerator,adult_icu_bed_covid_utilization_denominator,adult_icu_bed_utilization,adult_icu_bed_utilization_numerator,adult_icu_bed_utilization_denominator,previous_day_admission_adult_covid_confirmed_18-19,previous_day_admission_adult_covid_confirmed_20-29,previous_day_admission_adult_covid_confirmed_30-39,previous_day_admission_adult_covid_confirmed_40-49,previous_day_admission_adult_covid_confirmed_50-59,previous_day_admission_adult_covid_confirmed_60-69,previous_day_admission_adult_covid_confirmed_70-79,previous_day_admission_adult_covid_confirmed_80+,previous_day_admission_adult_covid_confirmed_unknown,previous_da
y_admission_adult_covid_suspected_18-19,previous_day_admission_adult_covid_suspected_20-29,previous_day_admission_adult_covid_suspected_30-39,previous_day_admission_adult_covid_suspected_40-49,previous_day_admission_adult_covid_suspected_50-59,previous_day_admission_adult_covid_suspected_60-69,previous_day_admission_adult_covid_suspected_70-79,previous_day_admission_adult_covid_suspected_80+,previous_day_admission_adult_covid_suspected_unknown,deaths_covid,all_pediatric_inpatient_bed_occupied,all_pediatric_inpatient_beds,staffed_pediatric_icu_bed_occupancy,total_staffed_pediatric_icu_beds
critical_staffing_shortage_today_yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
critical_staffing_shortage_today_no,0.69,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
critical_staffing_shortage_today_not_reported,-0.11,-0.11,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
critical_staffing_shortage_anticipated_within_week_yes,0.88,0.68,0.06,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
critical_staffing_shortage_anticipated_within_week_no,0.64,0.94,0.02,0.73,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
critical_staffing_shortage_anticipated_within_week_not_reported,-0.07,-0.07,0.9,-0.07,-0.09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
hospital_onset_covid,0.17,0.34,0.18,0.23,0.33,0.18,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
inpatient_beds,0.48,0.72,0.45,0.58,0.74,0.42,0.52,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
inpatient_beds_used,0.49,0.73,0.42,0.59,0.75,0.39,0.53,0.99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
inpatient_beds_used_covid,0.55,0.5,0.29,0.55,0.49,0.3,0.54,0.71,0.72,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Easter egg: gini impurity code for ML :)

In [118]:
# Gini impurity of a two-class split
def gini(p1, p2):
    """Return ``1 - (p1**2 + p2**2)`` for the two class probabilities."""
    squared_total = sum(p ** 2 for p in (p1, p2))
    return 1 - squared_total

In [123]:
# class proportions of the first group: 2/11 for one class, the remainder for the other
p1 = 2/11
p2 = abs(1 - p1)

# Gini impurity of the first group
g1 = gini(p1, p2)
g1

0.2975206611570249

In [124]:
# class proportions of the second group: 9/15 for one class, the remainder for the other
p1 = 9/15
p2 = abs(1 - p1)
# Gini impurity of the second group
g2 = gini(p1, p2)
g2

0.48

In [125]:
# weight of each group: 11/26 for the first and the remainder for the second
# (presumably the group sizes 11 and 15 out of 26 -- confirm)
prop1 = 11/26
prop2 = abs(1 - prop1)

# weighted average of the two group impurities
gt = prop1*g1 + prop2*g2
gt

0.40279720279720277