## Final

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import precision_recall_curve, auc, average_precision_score, f1_score

# train
df = pd.read_csv('/Users/brianoktavec/MSC550/Final/heartdisease_train-1.csv')
# test
df1 = pd.read_csv('/Users/brianoktavec/MSC550/Final/heartdisease_test-1.csv')

df.head()

Unnamed: 0,gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,50,0,0,0,0,0,0,254,133.0,76.0,,75,76,0
1,0,43,0,0,0,0,0,0,247,131.0,88.0,27.64,72,61,0
2,1,46,1,15,0,0,1,0,294,142.0,94.0,26.31,98,64,0
3,0,41,0,0,1,0,1,0,332,124.0,88.0,31.31,65,84,0
4,0,38,1,20,0,0,1,0,221,140.0,90.0,21.35,95,70,1


### Address missing values 

In [2]:
from sklearn.impute import SimpleImputer
df.replace("?", np.nan, inplace=True)
df["BMI"] = df["BMI"].astype(float)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df["BMI"] = imputer.fit_transform(df[["BMI"]])

df.head()

Unnamed: 0,gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,50,0,0,0,0,0,0,254,133.0,76.0,25.781443,75,76,0
1,0,43,0,0,0,0,0,0,247,131.0,88.0,27.64,72,61,0
2,1,46,1,15,0,0,1,0,294,142.0,94.0,26.31,98,64,0
3,0,41,0,0,1,0,1,0,332,124.0,88.0,31.31,65,84,0
4,0,38,1,20,0,0,1,0,221,140.0,90.0,21.35,95,70,1


#### Get base model and test results

In [3]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 0)

model=sm.Logit(y_train, sm.add_constant(x_train))
lr = model.fit()
print(lr.summary())

x_test = sm.add_constant(x_test)
y_pred = lr.predict(x_test)

y_pred_labels = [1 if p >= 0.5 else 0 for p in y_pred]

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred_labels)
print('Accuracy:', accuracy)

Optimization terminated successfully.
         Current function value: 0.378457
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 2916
Model:                          Logit   Df Residuals:                     2901
Method:                           MLE   Df Model:                           14
Date:                Fri, 28 Apr 2023   Pseudo R-squ.:                  0.1177
Time:                        17:54:49   Log-Likelihood:                -1103.6
converged:                       True   LL-Null:                       -1250.9
Covariance Type:            nonrobust   LLR p-value:                 1.618e-54
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -9.1960      0.788    -11.670      0.000     -10.740      -7.652
gender        

#### Test accuracy of base model

In [4]:
x_new = df1.drop(columns=['TenYearCHD'])
x_new = sm.add_constant(x_new) 
y_pred = lr.predict(x_new)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = df1['TenYearCHD']
y_pred_class = np.round(y_pred) 

cm = confusion_matrix(y_true, y_pred_class)
cr = classification_report(y_true, y_pred_class)

accuracy = accuracy_score(y_true, y_pred_class)

print('Confusion Matrix:\n', cm)
print('\nClassification Report:\n', cr)
print('\nAccuracy:', accuracy)

print('\n')
from sklearn.metrics import precision_recall_curve, f1_score

precision, recall, _ = precision_recall_curve(y_true, y_pred)
f1 = f1_score(y_true, y_pred_class)

print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Confusion Matrix:
 [[8 0]
 [2 0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       0.00      0.00      0.00         2

    accuracy                           0.80        10
   macro avg       0.40      0.50      0.44        10
weighted avg       0.64      0.80      0.71        10


Accuracy: 0.8


Precision: [0.2        0.22222222 0.25       0.28571429 0.33333333 0.4
 0.5        0.33333333 0.5        1.         1.        ]
Recall: [1.  1.  1.  1.  1.  1.  1.  0.5 0.5 0.5 0. ]
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Address imbalance

#### Method #1 - SMOTE

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X= df.iloc[:, :-1]
y = df.iloc[:, -1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

smote = SMOTE(random_state=0)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Test accuracy of method #1

In [6]:
y_pred_labels = [1 if p >= 0.5 else 0 for p in y_pred]

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred_labels)
print('Accuracy:', accuracy)

x_new = df1.drop(columns=['TenYearCHD'])
y_pred = lr.predict(x_new)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = df1['TenYearCHD']
y_pred_class = np.round(y_pred) 

cm = confusion_matrix(y_true, y_pred_class)
cr = classification_report(y_true, y_pred_class)

accuracy = accuracy_score(y_true, y_pred_class)

print('Confusion Matrix:\n', cm)
print('\nClassification Report:\n', cr)
print('\nAccuracy:', accuracy)

from sklearn.metrics import precision_recall_curve, f1_score

precision, recall, _ = precision_recall_curve(y_true, y_pred)
f1 = f1_score(y_true, y_pred_class)

print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.6671232876712329
Confusion Matrix:
 [[5 3]
 [0 2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.62      0.77         8
           1       0.40      1.00      0.57         2

    accuracy                           0.70        10
   macro avg       0.70      0.81      0.67        10
weighted avg       0.88      0.70      0.73        10


Accuracy: 0.7
Precision: [0.2 0.4 1. ]
Recall: [1. 1. 0.]
F1 Score: 0.5714285714285715


#### Method #2 Random Over Sampling

In [7]:
from imblearn.over_sampling import RandomOverSampler
x2 = df.drop(columns=['TenYearCHD'])
y2 = df['TenYearCHD']
ros = RandomOverSampler(sampling_strategy='minority')
x_resampled, y_resampled = ros.fit_resample(x2, y2)
df_new = pd.concat([pd.DataFrame(x_resampled, columns=x.columns), pd.DataFrame(y_resampled, columns=['TenYearCHD'])], axis=1)

x_new = df_new.iloc[:, :-1]
y_new = df_new.iloc[:, -1:]
x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.20, random_state = 0)

model2=sm.Logit(y_train, sm.add_constant(x_train))
lr2 = model2.fit()
print(lr2.summary())

Optimization terminated successfully.
         Current function value: 0.601304
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 4945
Model:                          Logit   Df Residuals:                     4930
Method:                           MLE   Df Model:                           14
Date:                Fri, 28 Apr 2023   Pseudo R-squ.:                  0.1325
Time:                        17:54:49   Log-Likelihood:                -2973.4
converged:                       True   LL-Null:                       -3427.6
Covariance Type:            nonrobust   LLR p-value:                7.212e-185
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -6.4871      0.441    -14.703      0.000      -7.352      -5.622
gender        

#### Test accuracy with method #2

In [8]:
x_new = df1.drop(columns=['TenYearCHD'])
x_new = sm.add_constant(x_new) 
y_pred = lr2.predict(x_new)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = df1['TenYearCHD']
y_pred_class = np.round(y_pred) 

cm = confusion_matrix(y_true, y_pred_class)
cr = classification_report(y_true, y_pred_class)

accuracy = accuracy_score(y_true, y_pred_class)

print('Confusion Matrix:\n', cm)
print('\nClassification Report:\n', cr)
print('\nAccuracy:', accuracy)

from sklearn.metrics import precision_recall_curve, f1_score

precision, recall, _ = precision_recall_curve(y_true, y_pred)
f1 = f1_score(y_true, y_pred_class)

print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Confusion Matrix:
 [[6 2]
 [0 2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.50      1.00      0.67         2

    accuracy                           0.80        10
   macro avg       0.75      0.88      0.76        10
weighted avg       0.90      0.80      0.82        10


Accuracy: 0.8
Precision: [0.2        0.22222222 0.25       0.28571429 0.33333333 0.4
 0.5        0.33333333 0.5        1.         1.        ]
Recall: [1.  1.  1.  1.  1.  1.  1.  0.5 0.5 0.5 0. ]
F1 Score: 0.6666666666666666
