In [1]:
# Importing libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

# Theoretical Questions

1. (d) Bootstrapping
2. (b) low bias and high variance
3. (c) high bias and low variance
4. Regularization is a technique that is used to improve the way a statistical model is fitted to a set of data. It is useful because it can prevent overfitting of the model and make more accurate predictions. 

# Exercise 5

In [2]:
# (a) (4 points) Using the pandas library, read the csv data file and create a data-frame called heart.
heart=pd.read_csv('framingham.csv')
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# (b) (3 points) Remove observations with missing values.
heart=heart.dropna()

In [4]:
# (c) (25 points) Perform a 5-folds cross validation with the goal of measuring the performance,
#     in terms of F1-score, of two competing models:
    # • Using age, currentSmoker, totChol, sysBP, diaBP, BMI, heartRate, and glucose as
    #   the predictor variables, and TenYearCHD as the target variable build a logistic regression
    #   model under the 5-folds cross validation framework. Compute and store the F1-score
    #   for each iteration.
    # • Using age, currentSmoker, totChol, BMI, heartRate, and glucose as the predictor
    #   variables, and TenYearCHD as the target variable build a logistic regression model under the 5-folds cross validation framework. Compute and store the F1-score for each
    #   iteration.
        # Use 25% as threshold to change the likelihoods to labels. Make sure to scale the input
        # variables of both models to 0-1 range (see MinMaxScaler) before you run the 5-fold cross
        # validation framework. Also, you can use the f1 score function to compute the F1-score.

# Train test split
X=heart[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
Y=heart['TenYearCHD']

# Creating scaler
scaler=MinMaxScaler()

# Defining lists to store the MSE
model_1_f1_scores,model_2_f1_scores=[],[]

kfold=KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in kfold.split(X):
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)
    
    ###########
    # Model 1 #
    ###########
    
    # Scaling the training data
    X_train_scaled_1=scaler.fit_transform(X_train)
    # Building the model
    logit_md_1=LogisticRegression().fit(X_train_scaled_1, Y_train)
    # Scaling the testing data
    X_test_scaled_1=scaler.transform(X_test)
    # Predicting on the testing data
    logit_pred_1=logit_md_1.predict_proba(X_test_scaled_1)[:, 1]
    # Changing likelihood to labels
    logit_label_1 = np.where(logit_pred_1<0.25,0,1)
    # Calculating the f1 score
    f1_1=f1_score(Y_test, logit_label_1)
    # Appending f1 score to list
    model_1_f1_scores.append(f1_1)

    ###########
    # Model 2 #
    ###########

    # Dropping columns from training data for model 2
    X_train_2=X_train.drop(columns=['sysBP', 'diaBP'], axis=1)
    # Scaling the training data
    X_train_scaled_2=scaler.fit_transform(X_train_2)
    # Building the model
    logit_md_2=LogisticRegression().fit(X_train_scaled_2, Y_train)
    # Dropping columns from testing data for model 2
    X_test_2=X_test.drop(columns=['sysBP', 'diaBP'], axis=1)   
    # Scaling the testing data
    X_test_scaled_2=scaler.transform(X_test_2)
    # Predicting on the testing data
    logit_pred_2=logit_md_2.predict_proba(X_test_scaled_2)[:, 1]
    # Changing likelihood to labels
    logit_label_2 = np.where(logit_pred_2<0.25,0,1)
    # Calculating the f1 score
    f1_2=f1_score(Y_test, logit_label_2)
    # Appending f1 score to list
    model_2_f1_scores.append(f1_2)


# (d) (4 points) Report the average F1-score of each of the models. What model would you use
# to predict TenYearCHD? Explain

def Average(list):
    avg=np.average(list)
    return(avg)

model_1_avg_f1_score=Average(model_1_f1_scores)
model_2_avg_f1_score=Average(model_2_f1_scores)

print("The average f1 score for model 1 is ", model_1_avg_f1_score, "\nThe average f1 score for model 2 is ", model_2_avg_f1_score)

## Model 1 has a higher f1 score, which means its better at correctly identifying positive
## cases and minimizing false positives/false negatives. Therefore, I would use model 1.

The average f1 score for model 1 is  0.3491375244169693 
The average f1 score for model 2 is  0.3345260800021302
