In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

$$\text{IEOR 142 Final Project - Battling COVID-19 Falsehoods with Machine Learning Approaches}$$
$$\text{Ahmet Turunc, Catherine Lei, Pranav Viswanathan, Vishnu Karukonda, Wako Morimoto}$$

Data Sources 

Tweet and text data: https://github.com/MickeysClubhouse/COVID-19-rumor-dataset

COVID case and vaccination data by country: https://github.com/owid/covid-19-data/tree/master/public/data

In [None]:
# Mount the user's Google Drive inside the Colab runtime (this import only exists on Google Colab).
from google.colab import drive
drive.mount('/content/gdrive/')
import os
# Switch to the project folder so the relative CSV file names used when reading the data resolve.
os.chdir("/content/gdrive/My Drive/IEOR 142 Final Project")

## Read in data
text_data: tweets and news articles with some additional values

case_data: COVID case data from OWID resource

In [None]:
text_data = pd.read_csv('en_dup.csv')
case_data = pd.read_csv('owid-covid-data.csv')

## Filtering data that has T/F labels, and a date

In [None]:
# Keep only rows that have a label, whose label is not "U", and that have a timestamp.
has_label = text_data['label'].notna()
not_u_label = text_data['label'] != "U"
has_time = text_data['time'].notna()
filter_data = text_data[has_label & not_u_label & has_time]
# Relabel the surviving rows 1..N.
filter_data.index = np.arange(1, len(filter_data) + 1)
filter_data

### The `time` column

### As seen below, `time` is stored in several different formats. We want `time` to match our `case_data` dataset, so we reformat every value to a yyyy-mm-dd format.

In [None]:
filter_data["time"].tail(10)

In [None]:
# Three-letter English month abbreviations mapped to two-digit month numbers.
_MONTH_NUMBERS = {
  "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
  "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

# Placeholder emitted for any timestamp that cannot be parsed.
_UNPARSED_TIME = "none"


def _pad_two_digits(part):
  """Prefix a single-character month/day with '0'; leave anything else untouched."""
  return "0" + part if len(part) == 1 else part


def _normalize_time_string(raw):
  """Convert one raw timestamp to 'yyyy-mm-dd', or return "none" if no format matches.

  Formats are tried in this order (tokens are whitespace-separated, after
  replacing '/' with '-'):
    1. first token already contains '-'        e.g. '2020/3/5', '2020-01-22 10:00:00'
    2. '<time> <AM/PM> <sep> <Mon> <day>, <year>'   (month = token 3, day = token 4, year = token 5)
    3. '<Dow> <Mon> <day> <hh:mm:ss> +0000 <year>'  (month = token 1, day = token 2, year = token 5)
  """
  if not isinstance(raw, str):
    # Missing values (NaN/None) have nothing to parse.
    return _UNPARSED_TIME

  tokens = raw.replace("/", "-").split()
  candidate = None

  if tokens and "-" in tokens[0]:
    candidate = tokens[0]
  elif len(tokens) >= 6:
    year = tokens[5]
    # Token 3 is a month name in format 2 but a clock time (contains ':') in format 3.
    month = None if ":" in tokens[3] else _MONTH_NUMBERS.get(tokens[3])
    # Token 4 is the day (with a trailing comma) in format 2 but the '+0000' offset in format 3.
    day = None if tokens[4] == "+0000" else tokens[4].replace(",", "")
    if month is not None and day is not None:
      candidate = year + "-" + month + "-" + day
    else:
      month = _MONTH_NUMBERS.get(tokens[1])
      if month is not None:
        candidate = year + "-" + month + "-" + tokens[2]

  if candidate is None:
    return _UNPARSED_TIME

  parts = candidate.split("-")
  if len(parts) < 3:
    # Not a full year-month-day triple, so treat it as unparseable instead of raising.
    return _UNPARSED_TIME
  return parts[0] + "-" + _pad_two_digits(parts[1]) + "-" + _pad_two_digits(parts[2])


def time_fixer(df):
  """Normalise every value of df["time"] to a 'yyyy-mm-dd' string.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a "time" column. The frame is not modified.

  Returns
  -------
  numpy.ndarray of str
      One entry per row of ``df``, in row order. Rows whose timestamp cannot
      be parsed are returned as the literal string "none".

  Notes
  -----
  * Works for any number of rows and any index (the earlier version was
    hard-wired to exactly 1775 rows labelled 1..1775).
  * Unparseable rows now yield "none". Previously such a row silently
    repeated the date of the preceding row (or raised if it came first), so
    a downstream ``result == "none"`` check could never find anything.
  * All twelve month abbreviations are recognised, not only Jan/Feb/Mar/Oct.
  """
  return np.array([_normalize_time_string(raw) for raw in df["time"]], dtype=str)


In [None]:
time_reform = time_fixer(filter_data)

In [None]:
#Do we have any "none" values? Nope!
time_reform[time_reform == "none"]


In [None]:
'''
Looking through y, we have 2 wierd data entries:

'2020·Sprinklr-03-03'
'202-01-22'

'''
np.where(time_reform == '2020·Sprinklr-03-03'), np.where(time_reform == '202-01-22')

In [None]:
filter_data.iloc[547], filter_data.iloc[887]

In [None]:
# Hand-corrections for the two malformed dates located above.
#https://twitter.com/PMBreakingNews/status/1219687852486942721 found tweet to verify date (the 2020-01-22 entry)
date_corrections = (
    ('2020·Sprinklr-03-03', "2020-03-03"),
    ('202-01-22', "2020-01-22"),
)
for malformed, corrected in date_corrections:
    time_reform = np.where(time_reform == malformed, corrected, time_reform)
# Store the cleaned dates back on the filtered frame.
filter_data["time"] = time_reform
filter_data

## NLP Text Cleaning

In [None]:
#FUNCTIONS FROM LABS
from string import punctuation

def remove_punctuation(document):
    """Return ``document`` with every punctuation character except '#' removed."""
    # '#' is taken out of the removal list, so it stays in the text
    # (presumably to keep hashtags recognisable).
    removable = punctuation.replace('#','')
    kept = []
    for symbol in document:
        if symbol in removable:
            continue
        kept.append(symbol)
    return ''.join(kept)

def remove_digit(document):
    """Return ``document`` with every digit character removed."""
    non_digits = filter(lambda symbol: not symbol.isdigit(), document)
    return ''.join(non_digits)

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(document):
    """Return the tokens of ``document`` that are not in the module-level ``stop_words`` set."""
    kept = []
    for token in document:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

from nltk.stem import PorterStemmer

porter = PorterStemmer()

def stemmer(document):
    """Apply the module-level Porter stemmer to every token and return the stems as a list."""
    return list(map(porter.stem, document))

from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
#inspect raw data
# A plain loop prints each text without also building (and displaying) a throwaway {None} set.
for raw_text in filter_data.head()['content']:
    print(raw_text)

Proposed text cleaning steps: remove t.co links, remove twitter @ mentions, COVID mentions, followed by typical NLP steps from lecture 

Typical NLP cleaning steps

1) convert all chars to lowercase

2) remove all punctuation

3) remove any numeric values

4) tokenize text entries

5) remove stopwords from tokenized text

6) Stem all words

7) Detokenize text and use countvectorizer to create document-term matrix

In [None]:
#Convert to lowercase, remove t.co media links and remove all twitter @ mentions
pre_cleaned_text = (
    filter_data['content']
    .str.lower()
    # an @ mention runs until the next space or full stop
    .str.replace(r'@[^ .]+', '', regex=True)
    # the dot in "t.co" is escaped so it matches a literal '.' rather than any character
    .str.replace(r'https://t\.co/[A-Za-z0-9]+', '', regex=True)
)
# A plain loop prints the samples without leaving a stray {None} as the cell output.
for cleaned_text in pre_cleaned_text.head():
    print(cleaned_text)

In [None]:
#Typical steps as performed in lab 8b and the NLP HW
punctless = pre_cleaned_text.apply(remove_punctuation)
digitless = punctless.apply(remove_digit)
tokens = digitless.apply(word_tokenize)
stopless = tokens.apply(remove_stopwords)
only_stems = stopless.apply(stemmer)
# Re-join the stems into one string per document, which is what CountVectorizer expects.
concat_text = only_stems.apply(TreebankWordDetokenizer().detokenize)
ct_vec = CountVectorizer(min_df=0.01) #ARBITRARILY CHOSEN MIN_DF; COME BACK TO DECIDE ON A VALUE
sparse = ct_vec.fit_transform(concat_text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(ct_vec, 'get_feature_names_out'):
    vocabulary = ct_vec.get_feature_names_out()
else:
    vocabulary = ct_vec.get_feature_names()
# Keep the documents' original index so the labels and dates can be attached by index later.
text_features = pd.DataFrame(sparse.toarray(), columns=vocabulary, index=pre_cleaned_text.index)
text_features

In [None]:
#Create merged table with labels, date, and text_features
text_final = text_features.copy()
# Both assignments align on the index, which text_features inherited from the cleaned text series.
# NOTE(review): if the vocabulary ever contains a stem literally named 'label' or 'date',
# these assignments overwrite that word-count column -- confirm the column names do not collide.
text_final['label'] = filter_data['label']
text_final['date']= filter_data['time']
text_final.head()

In [None]:
text_final['label']

## Filtering covid data set to get per capita features from different income groups

The case data groups several countries into income categories and contains covid statistics across those groups of countries; for the sake of convenience, these entries can be used from every date. 

In [None]:
#PREVIEWING CASE DATA
case_data.head(5)

In [None]:
#Getting all US features
is_united_states = case_data['location'] == 'United States'
US_cases = case_data[is_united_states]

#features that maybe do not add much value (daily cumulative counts primarily)
features_to_drop = [
    'total_cases','total_cases_per_million','total_deaths_per_million','total_tests','total_tests_per_thousand',
    'total_vaccinations','people_vaccinated','total_boosters',
    'total_vaccinations_per_hundred','total_boosters_per_hundred','population_density','median_age',
    'aged_65_older','aged_70_older','gdp_per_capita','extreme_poverty',
    'cardiovasc_death_rate','diabetes_prevalence','female_smokers','male_smokers','handwashing_facilities',
    'hospital_beds_per_thousand','life_expectancy','human_development_index',
    'excess_mortality_cumulative_absolute','excess_mortality_cumulative','excess_mortality',
    'excess_mortality_cumulative_per_million','stringency_index','population', 'tests_units',
]
# Identifier columns are constant once the rows are restricted to one location, so drop them too.
identifier_columns = ['iso_code', 'continent', 'location']
filtered_US_features = US_cases.drop(columns=features_to_drop + identifier_columns)

## Merge covid case data with text feature data set based on date

In [None]:
text_final.head()

In [None]:
filtered_US_features.head()

In [None]:
# Attach the US case features of each text's date. A left join keeps every text row
# (texts whose date has no case row get NaN features).
# validate='many_to_one' makes pandas raise if the case table ever holds two rows for one
# date, which would otherwise silently duplicate text rows.
combined_data = (
    text_final
    .merge(filtered_US_features, how='left', on='date', validate='many_to_one')
    .drop(columns='date')  # the join key is not a model feature
)
combined_data.tail()

#combined_data = combined_data.replace(0.000, np.nan)


#### We notice that between each column and its "smoothed" version, values of NaN are represented as 0. This is an issue, as seen when looking at the `new_deaths` and `new_deaths_smoothed` columns: having 0 deaths over a 7-day period is very different from having an unreported number of deaths for 7 consecutive days. We will use the smoothed features... (**TODO:** explain after discussing)

In [None]:
#Dropping features that have a smooth column or a per_million/per thousand/per hundred col
# NOTE(review): the list below also removes several *_smoothed columns themselves
# (new_cases_smoothed, new_tests_smoothed, new_vaccinations_smoothed,
# new_people_vaccinated_smoothed) -- confirm that this matches the intent stated above.
# NOTE(review): no later cell visible in this notebook reads filtered_US_features_smooth;
# the model is built from filtered_US_features.
filtered_US_features_smooth = filtered_US_features.drop(['new_cases','new_cases_smoothed','new_deaths', 
                                                         'new_cases_per_million','new_deaths_per_million',
                                                         'icu_patients','hosp_patients','weekly_icu_admissions',
                                                         'weekly_hosp_admissions','new_tests','new_tests_per_thousand',
                                                         'new_tests_smoothed','new_vaccinations','new_vaccinations_smoothed', 'new_people_vaccinated_smoothed'],axis=1)
filtered_US_features_smooth



In [None]:
#combined_data_smooth = text_final.merge(filtered_US_features, how = 'left', left_on = 'date', right_on = 'date').drop(['date', 'date'], axis = 1)
#combined_data_smooth = combined_data_smooth.replace(0.000,np.nan)
#combined_data_smooth = combined_data_smooth.fillna(0) Slightly raise

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

#IMPUTE IN MEAN VALUE FOR MISSING VAL
# Note: the same imputation is repeated in a later cell once the "Readability" column has
# been added to combined_data; that later result is the one the models are trained on.
imputer = SimpleImputer()
imputed_data = imputer.fit_transform(combined_data.drop(['label'], axis=1))
# Encode the labels with an explicit mapping (T -> 1, F -> 0) instead of chained substring
# replacement, and fail loudly if any other label value slipped through the filtering.
labels = combined_data['label'].map({'T': 1, 'F': 0})
assert labels.notna().all(), "unexpected label value (expected only 'T' or 'F')"
labels = labels.astype(int)
imputed_data

**Readability Scoring Tweets**

We want to see if there is a difference in the readability level of text between True and False tweets. However, most common readability scores, such as Flesch and Gunning fog, use sentence-related features in their analysis. Since tweets, unlike news articles or headlines, carry no expectation of being well written, it may be hard to identify sentences. Hence, we will use the SMOG formula.

SMOG grading = 3 + √(polysyllable count)

In [None]:
pip install https://github.com/andreasvc/readability/tarball/master

In [None]:
import readability

In [None]:
# Score every text once, then select the True/False subsets from that result instead of
# running readability.getmeasures a second time on the same rows.
all_grades = filter_data["content"].apply(readability.getmeasures)
T_grades = all_grades[filter_data["label"] == "T"]
F_grades = all_grades[filter_data["label"] == "F"]
# Preview a few results (the cell previously ended on the un-called bound method `.items`).
all_grades.head()

In [None]:
#To change the read_ratings, ['DaleChallIndex'] with a value in here https://pypi.org/project/readability/
def _dale_chall_index(measures):
    """Pick the Dale-Chall index out of one readability.getmeasures result."""
    return measures['readability grades']['DaleChallIndex']

read_ratings = list(map(_dale_chall_index, all_grades))
T_read_ratings = list(map(_dale_chall_index, T_grades))
F_read_ratings = list(map(_dale_chall_index, F_grades))
read_ratings[0:5]

In [None]:
[np.mean(T_read_ratings),np.std(T_read_ratings)], [np.mean(F_read_ratings),np.std(F_read_ratings)]

In [None]:
import seaborn as sns

# sns.distplot is deprecated (and removed in recent seaborn releases); histplot with a KDE
# overlay and density scaling is the replacement recommended by seaborn for the same picture.
fig, ax = plt.subplots()
for ratings, legend_label in ((T_read_ratings, 'T r u e'), (F_read_ratings, 'F a l s e')):
    sns.histplot(ratings, kde=True, stat="density", kde_kws=dict(cut=3),
                 alpha=.4, edgecolor=(1, 1, 1, .4), label=legend_label, ax=ax)
ax.legend()
ax.set_xlabel("Dale-Chall Score", fontsize = 10)
x = ax  # keep the original name for the axes object

Both True and False tweets have similar scores. According to the SMOG index conversion table, both scores indicate that the average COVID-related tweet, regardless of its True or False label, requires only a 6th-grade reading level to fully understand.

https://readabilityformulas.com/smog-readability-formula.php

In [None]:
combined_data["Readability"] = read_ratings
combined_data

**Imputed Missing Data with Feature Means; Convert T/F labels to 1/0 respectively**

In [None]:
from sklearn.impute import SimpleImputer
#IMPUTE IN MEAN VALUE FOR MISSING VAL
# SimpleImputer() with default settings fills each missing value with its column mean.
imputer = SimpleImputer()
imputed_data = imputer.fit_transform(combined_data.drop(['label'], axis=1))
# Encode the labels with an explicit mapping (T -> 1, F -> 0) instead of chained substring
# replacement, and fail loudly if any other label value slipped through the filtering.
labels = combined_data['label'].map({'T': 1, 'F': 0})
assert labels.notna().all(), "unexpected label value (expected only 'T' or 'F')"
labels = labels.astype(int)
imputed_data

## Preliminary modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score


In [None]:
def calculate_metrics(y_true, y_pred):
  """Return TPR, FPR and accuracy for binary 0/1 predictions.

  Parameters
  ----------
  y_true, y_pred : array-like of 0/1 values, same length
      True labels and predicted labels (1 is the positive class).

  Returns
  -------
  dict with keys 'tpr', 'fpr' and 'accuracy' (in that order).
  """
  # Compare by position: converting to arrays avoids pandas index alignment
  # when one or both inputs are Series.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  # labels=[0, 1] forces a 2x2 matrix even if one class is absent from the inputs;
  # .ravel() on it yields the counts in the order tn, fp, fn, tp.
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
  metrics = {}
  metrics['tpr'] = tp / (tp + fn)
  metrics['fpr'] = fp / (fp + tn)
  metrics['accuracy'] = np.mean(y_true == y_pred)
  return metrics

#Code from lab 4 (adapted)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
def plot_roc(y_test, predict_probas, model_names):
  """Plot one ROC curve per model on a shared figure, with the AUC in each legend entry.

  y_test         -- true labels
  predict_probas -- iterable of score arrays, one per model
  model_names    -- legend names, indexed in the same order as predict_probas
  """
  plt.figure(figsize=(12, 8))
  plt.title('ROC Curve', fontsize=18)
  plt.xlabel('FPR', fontsize=16)
  plt.ylabel('TPR', fontsize=16)
  plt.xlim([-0.01, 1.00])
  plt.ylim([-0.01, 1.01])
  position = 0
  for scores in predict_probas:
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
    area = auc(false_pos_rate, true_pos_rate)
    caption = model_names[position] + ' (area = {:0.2f})'.format(area)
    plt.plot(false_pos_rate, true_pos_rate, lw=3, label=caption)
    position += 1
  # Diagonal reference line for a classifier with no discriminative power.
  plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--', label='Naive Baseline (area = 0.50)')
  plt.legend(loc='lower right', fontsize=14)

#Adapted from Lab 8b
def bootstrap_validation(test_data, test_label, model, metrics_list, sample=5000, random_state=66):
    """Bootstrap the test set and evaluate every metric on each resample.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Test features; its index labels are used to look up ``test_label``.
    test_label : pandas.Series
        True labels, indexed like ``test_data``.
    model : fitted estimator
        Must provide ``predict`` and a two-column ``predict_proba``.
    metrics_list : list of callables
        Each is called as ``metric(y_true, y_pred, y_proba)`` and must return a number.
    sample : int
        Number of bootstrap resamples.
    random_state : int or None
        Seed of the resampling generator. The same seed reproduces the same resamples,
        so different models evaluated with one seed see identical bootstrap sets.

    Returns
    -------
    pandas.DataFrame of shape (sample, len(metrics_list)); column j holds metrics_list[j].
    """
    n_sample = sample
    n_metrics = len(metrics_list)
    # random_state used to be accepted but ignored; seed a private generator with it.
    rng = np.random.RandomState(random_state)

    # The model's output for a row does not depend on the resample, so predict once
    # here and only re-index inside the loop.
    predicted_all = np.asarray(model.predict(test_data))
    proba_all = np.asarray(model.predict_proba(test_data))[:, 1]
    n_rows = len(test_data.index)

    output_array = np.full([n_sample, n_metrics], np.nan)
    print(output_array.shape)
    for bs_iter in range(n_sample):
        # Draw row positions with replacement.
        positions = rng.randint(0, n_rows, size=n_rows)
        bs_label = test_label.loc[test_data.index[positions]]
        bs_predicted = predicted_all[positions]
        bs_proba = proba_all[positions]
        for metrics_iter, metric in enumerate(metrics_list):
            output_array[bs_iter, metrics_iter] = metric(bs_label, bs_predicted, bs_proba)
    output_df = pd.DataFrame(output_array)
    return output_df

#custom written metric functions
def get_tpr(y_true, y_pred,y_proba):
  """True-positive rate tp / (tp + fn) for binary 0/1 labels.

  ``y_proba`` is unused; it is accepted so that every bootstrap metric shares
  the (y_true, y_pred, y_proba) signature.
  """
  # labels=[0, 1] keeps the matrix 2x2 even when a resample lacks one class;
  # .ravel() yields tn, fp, fn, tp.
  _, _, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
  return tp / (tp + fn)
def get_fpr(y_true, y_pred,y_proba):
  """False-positive rate fp / (fp + tn) for binary 0/1 labels.

  ``y_proba`` is unused; it is accepted so that every bootstrap metric shares
  the (y_true, y_pred, y_proba) signature.
  """
  # labels=[0, 1] keeps the matrix 2x2 even when a resample lacks one class;
  # .ravel() yields tn, fp, fn, tp.
  tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
  return fp / (fp + tn)
def get_accuracy(y_true, y_pred,y_proba):
  """Fraction of positions where the prediction equals the true label.

  ``y_proba`` is unused; it is accepted so that every bootstrap metric shares
  the (y_true, y_pred, y_proba) signature.
  """
  # Compare by position. Two pandas Series with different indexes would otherwise be
  # aligned on their labels (or raise) instead of being compared element by element.
  return np.mean(np.asarray(y_true) == np.asarray(y_pred))
def get_AUC(y_true, y_pred,y_proba):
  """Area under the ROC curve, computed from the scores in ``y_proba``.

  ``y_pred`` is unused; it is accepted so that every bootstrap metric shares
  the (y_true, y_pred, y_proba) signature.
  """
  return roc_auc_score(y_true, y_proba)

  #FOR CROSS VAL
def cv_fpr(y_true, y_pred):
  """False-positive rate fp / (fp + tn) for binary 0/1 labels, with the two-argument
  signature that sklearn's make_scorer expects."""
  # labels=[0, 1] keeps the matrix 2x2 even when a fold lacks one class;
  # .ravel() yields tn, fp, fn, tp.
  tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
  return fp / (fp + tn)

In [None]:
#Construct 80/20 train-test split
# NOTE(review): imputed_data was produced by fitting the imputer on all rows before this
# split, so the test rows contributed to the column means -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(imputed_data, labels, test_size=0.2, random_state=142)

### RandomForestClassifier

In [None]:
# Tune max_features of a random forest with 5-fold cross-validation.
rfc = RandomForestClassifier(random_state=142)
# NOTE(review): candidates run from 20 to 190 -- confirm the feature matrix has at least
# that many columns.
parameters = {'max_features': np.arange(20,200,10)} #cv with ccp_alpha? 
cv = KFold(n_splits=5, random_state=142, shuffle=True) 
# greater_is_better=False marks cv_fpr as a loss, so the search prefers the lowest
# false-positive rate. From here on `rfc` names the fitted search object, not the bare forest.
rfc = GridSearchCV(rfc, param_grid=parameters, scoring=make_scorer(cv_fpr, greater_is_better = False), cv=cv, verbose=0)
rfc.fit(X_train, y_train)

In [None]:
rfc_predict_proba = rfc.predict_proba(X_test)[:,1]
rfc_predictions = rfc.predict(X_test)
rfc.best_params_

In [None]:
#Code from lab 4
plot_roc(y_test, [rfc_predict_proba], ['rfc'])

In [None]:
rfc_performance = calculate_metrics(y_test, rfc_predictions)
# A list (not a set) prints the metrics in the dictionary's own order on every run.
print([f"{x}: {str(rfc_performance[x])}" for x in rfc_performance])

### Logistic Regression (regular, LASSO, Ridge, Elastic Net)

In [None]:
import re
import sklearn

# scikit-learn 1.2 introduced penalty=None and deprecated the string 'none' (removed in 1.4);
# older releases only understand 'none'. Pick whichever the installed version accepts.
_sklearn_version = tuple(int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
no_penalty = None if _sklearn_version >= (1, 2) else 'none'

# Unregularised logistic regression
logistic_regression = LogisticRegression(random_state = 142, penalty = no_penalty, max_iter = 1000)
logistic_regression.fit(X_train, y_train)

# L1-penalised (LASSO)
lasso = LogisticRegression(random_state = 142, penalty = 'l1', solver = 'liblinear', max_iter = 1000)
lasso.fit(X_train, y_train)

# L2-penalised (Ridge)
ridge = LogisticRegression(random_state = 142, penalty = 'l2', solver = 'liblinear', max_iter = 1000)
ridge.fit(X_train, y_train)

#Elastic net: 50/50 mix of Lasso and Ridge regularization
elastic_net = LogisticRegression(random_state = 142, solver = 'saga', penalty = 'elasticnet', l1_ratio = 0.5, max_iter = 1000)
elastic_net.fit(X_train, y_train)

In [None]:
logreg_predict_proba = logistic_regression.predict_proba(X_test)[:,1]
logreg_predictions = logistic_regression.predict(X_test)
elastic_net_predict_proba = elastic_net.predict_proba(X_test)[:,1]
elastic_net_predictions = elastic_net.predict(X_test)
lasso_predict_proba = lasso.predict_proba(X_test)[:,1]
lasso_predictions = lasso.predict(X_test)
ridge_predict_proba = ridge.predict_proba(X_test)[:,1]
ridge_predictions = ridge.predict(X_test)

In [None]:
logreg_performance = calculate_metrics(y_test, logreg_predictions)
# A list (not a set) prints the metrics in the dictionary's own order on every run.
print([f"{x}: {str(logreg_performance[x])}" for x in logreg_performance])

In [None]:
elastic_net_performance = calculate_metrics(y_test, elastic_net_predictions)
# A list (not a set) prints the metrics in the dictionary's own order on every run.
print([f"{x}: {str(elastic_net_performance[x])}" for x in elastic_net_performance])

In [None]:
lasso_performance = calculate_metrics(y_test, lasso_predictions)
# A list (not a set) prints the metrics in the dictionary's own order on every run.
print([f"{x}: {str(lasso_performance[x])}" for x in lasso_performance])

In [None]:
ridge_performance = calculate_metrics(y_test, ridge_predictions)
# A list (not a set) prints the metrics in the dictionary's own order on every run.
print([f"{x}: {str(ridge_performance[x])}" for x in ridge_performance])

### LDA

In [None]:
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)

In [None]:
LDA_predict_proba = LDA.predict_proba(X_test)[:,1]
LDA_predictions = LDA.predict(X_test)

In [None]:
plot_roc(y_test, [LDA_predict_proba], ['LDA'])

In [None]:
lda_performance = calculate_metrics(y_test, LDA_predictions)
# A list (not a set) prints the metrics in the dictionary's own order on every run.
print([f"{x}: {str(lda_performance[x])}" for x in lda_performance])

### GradientBoostingClassifier

In [None]:
# Tune the number of trees and the learning rate of gradient boosting with 5-fold cross-validation.
gbc = GradientBoostingClassifier(random_state=142)
parameters = {'n_estimators': np.arange(100,1000,180), 'learning_rate':np.arange(0.1, 0.5, 0.1)}
cv = KFold(n_splits=5, random_state=142, shuffle=True) 
# greater_is_better=False marks cv_fpr as a loss, so the search prefers the lowest
# false-positive rate. From here on `gbc` names the fitted search object, not the bare booster.
gbc = GridSearchCV(gbc, param_grid=parameters, scoring=make_scorer(cv_fpr, greater_is_better = False), cv=cv, verbose=0)
gbc.fit(X_train, y_train)

In [None]:
gbc_predict_proba = gbc.predict_proba(X_test)[:,1]
gbc_predictions = gbc.predict(X_test)
gbc.best_params_

## Model Testing

list of relevant variables:

model variable names:
rfc, gbc, logistic_regression, lasso, ridge, elastic_net, LDA, 

predict_proba list:
rfc_predict_proba, gbc_predict_proba, LDA_predict_proba, logreg_predict_proba, elastic_net_predict_proba, lasso_predict_proba, ridge_predict_proba

predictions list:
gbc_predictions, LDA_predictions, logreg_predictions, elastic_net_predictions,lasso_predictions, ridge_predictions

In [None]:
plot_roc(y_test, [rfc_predict_proba, gbc_predict_proba, LDA_predict_proba, logreg_predict_proba, elastic_net_predict_proba, lasso_predict_proba, ridge_predict_proba],
         ['Random Forests', 'Gradient Boosting', 'Linear Discriminant Analysis (LDA)', 'Logistic Regression', 'Elastic Net', 'LASSO', 'Ridge'])

In [None]:
#Custom class for a voting classifier on our already trained models
#sklearn classifier requires unfit models, but we want to use already fit models, so a custom function was needed
class ensembler():
  """Combine several already-fitted classifiers without refitting them.

  predict       -- row-wise maximum of the members' predictions; for 0/1 predictions this
                   is 1 as soon as any member predicts 1.
                   NOTE(review): that is an "any" rule, not a majority vote -- confirm intended.
  predict_proba -- row-wise mean of the members' positive-class probabilities, returned as a
                   1-D Series (not the two-column array sklearn estimators return).
  """
  def __init__(self, models):
    self.models = models

  def _collect(self, extract):
    """Build a table with one column per member, filled with extract(member)."""
    table = pd.DataFrame()
    for position, fitted in enumerate(self.models):
      table[position] = extract(fitted)
    return table

  def predict(self, X_test):
    member_predictions = self._collect(lambda fitted: fitted.predict(X_test))
    return member_predictions.max(axis=1)

  def predict_proba(self, X_test):
    # column 1 of each member's predict_proba is the positive-class probability
    member_probabilities = self._collect(lambda fitted: fitted.predict_proba(X_test)[:,1])
    return member_probabilities.mean(axis=1)

In [None]:
ensemble1 = ensembler([rfc, gbc, lasso, LDA])
ensemble1_proba = ensemble1.predict_proba(X_test)
ensemble1_predictions = ensemble1.predict(X_test)
plot_roc(y_test, [ensemble1_proba], ['ensemble of 4 best AUC models'])

In [None]:
# Rebuild the held-out set as one frame whose features and labels share a fresh positional
# index; bootstrap_validation looks rows up by index label.
validation = pd.concat([pd.DataFrame(X_test), pd.Series(y_test.reset_index()['label'])], axis = 1)
valx = validation.drop('label',axis=1)
valy = validation['label']
# One bootstrap table per model; columns follow the metric list: FPR, TPR, accuracy, AUC.
# Note: rfc_performance, lasso_performance, logreg_performance and ridge_performance held
# metric dictionaries from calculate_metrics in earlier cells and are replaced here.
rfc_performance = bootstrap_validation(valx, valy, rfc, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)
lasso_performance = bootstrap_validation(valx, valy, lasso, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)
gbc_performance = bootstrap_validation(valx, valy, gbc, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)
LDA_performance = bootstrap_validation(valx, valy, LDA, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)
elastic_performance = bootstrap_validation(valx, valy, elastic_net, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)
logreg_performance = bootstrap_validation(valx, valy, logistic_regression, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)
ridge_performance = bootstrap_validation(valx, valy, ridge, [get_fpr, get_tpr, get_accuracy, get_AUC], sample=5000, random_state=66)

In [None]:
print(f'random forest {calculate_metrics(y_test, rfc_predictions)}')
print(f'lasso {calculate_metrics(y_test, lasso_predictions)}')
print(f'gradient boosting {calculate_metrics(y_test, gbc_predictions)}')
print(f'LDA {calculate_metrics(y_test, LDA_predictions)}')
print(f'elastic net {calculate_metrics(y_test, elastic_net_predictions)}')
print(f'logistic regression {calculate_metrics(y_test, logreg_predictions)}')
print(f'ridge regression {calculate_metrics(y_test, ridge_predictions)}')