In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import precision_recall_curve, auc, average_precision_score, f1_score

In [2]:
df = pd.read_csv('/Users/brianoktavec/MSC550/Final/heartdisease_train-1.csv')
df.head()

Unnamed: 0,gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,50,0,0,0,0,0,0,254,133.0,76.0,,75,76,0
1,0,43,0,0,0,0,0,0,247,131.0,88.0,27.64,72,61,0
2,1,46,1,15,0,0,1,0,294,142.0,94.0,26.31,98,64,0
3,0,41,0,0,1,0,1,0,332,124.0,88.0,31.31,65,84,0
4,0,38,1,20,0,0,1,0,221,140.0,90.0,21.35,95,70,1


In [3]:
from sklearn.impute import SimpleImputer

df.replace("?", np.nan, inplace=True)

df["BMI"] = df["BMI"].astype(float)

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

df["BMI"] = imputer.fit_transform(df[["BMI"]])

In [4]:
df.head()

Unnamed: 0,gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,50,0,0,0,0,0,0,254,133.0,76.0,25.781443,75,76,0
1,0,43,0,0,0,0,0,0,247,131.0,88.0,27.64,72,61,0
2,1,46,1,15,0,0,1,0,294,142.0,94.0,26.31,98,64,0
3,0,41,0,0,1,0,1,0,332,124.0,88.0,31.31,65,84,0
4,0,38,1,20,0,0,1,0,221,140.0,90.0,21.35,95,70,1


In [5]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 0)

In [6]:
model=sm.Logit(y_train, sm.add_constant(x_train))
lr = model.fit()
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.378457
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 2916
Model:                          Logit   Df Residuals:                     2901
Method:                           MLE   Df Model:                           14
Date:                Thu, 27 Apr 2023   Pseudo R-squ.:                  0.1177
Time:                        23:59:02   Log-Likelihood:                -1103.6
converged:                       True   LL-Null:                       -1250.9
Covariance Type:            nonrobust   LLR p-value:                 1.618e-54
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -9.1960      0.788    -11.670      0.000     -10.740      -7.652
gender        

In [7]:
p_pred = lr.predict(sm.add_constant(x_test))
y_pred = round(p_pred)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, p_pred)
auc1 = auc(lr_recall, lr_precision)
print("AUC for precision-recall curve:", auc1)


AUC for precision-recall curve: 0.3373687248911785


In [8]:
from imblearn.over_sampling import RandomOverSampler

x = df.drop(columns=['TenYearCHD'])
y = df['TenYearCHD']

In [9]:
ros = RandomOverSampler(sampling_strategy='minority')

x_resampled, y_resampled = ros.fit_resample(x, y)

In [10]:
df_new = pd.concat([pd.DataFrame(x_resampled, columns=x.columns), pd.DataFrame(y_resampled, columns=['TenYearCHD'])], axis=1)

In [11]:
df_new.head()

Unnamed: 0,gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,50,0,0,0,0,0,0,254,133.0,76.0,25.781443,75,76,0
1,0,43,0,0,0,0,0,0,247,131.0,88.0,27.64,72,61,0
2,1,46,1,15,0,0,1,0,294,142.0,94.0,26.31,98,64,0
3,0,41,0,0,1,0,1,0,332,124.0,88.0,31.31,65,84,0
4,0,38,1,20,0,0,1,0,221,140.0,90.0,21.35,95,70,1


In [12]:
x = df_new.iloc[:, :-1]
y = df_new.iloc[:, -1:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 0)

In [13]:
model2=sm.Logit(y_train, sm.add_constant(x_train))
lr2 = model2.fit()
print(lr2.summary())

Optimization terminated successfully.
         Current function value: 0.603233
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 4945
Model:                          Logit   Df Residuals:                     4930
Method:                           MLE   Df Model:                           14
Date:                Thu, 27 Apr 2023   Pseudo R-squ.:                  0.1297
Time:                        23:59:02   Log-Likelihood:                -2983.0
converged:                       True   LL-Null:                       -3427.6
Covariance Type:            nonrobust   LLR p-value:                8.803e-181
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -6.4463      0.442    -14.579      0.000      -7.313      -5.580
gender        

In [14]:
p_pred = lr2.predict(sm.add_constant(x_test))
y_pred = round(p_pred)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, p_pred)
auc2 = auc(lr_recall, lr_precision)
print("AUC for precision-recall curve:", auc2)


AUC for precision-recall curve: 0.7129050390327065


In [15]:
from imblearn.under_sampling import RandomUnderSampler
x = df.drop(columns=['TenYearCHD'])
y = df['TenYearCHD']
undersample = RandomUnderSampler(sampling_strategy='majority', random_state = 0)
x_train_under, y_train_under = undersample.fit_resample(x_train, y_train)

In [16]:
model3 = sm.Logit(y_train_under, sm.add_constant(x_train_under))
lr3 = model3.fit()
print(lr3.summary())

Optimization terminated successfully.
         Current function value: 0.603204
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 4932
Model:                          Logit   Df Residuals:                     4917
Method:                           MLE   Df Model:                           14
Date:                Thu, 27 Apr 2023   Pseudo R-squ.:                  0.1298
Time:                        23:59:02   Log-Likelihood:                -2975.0
converged:                       True   LL-Null:                       -3418.6
Covariance Type:            nonrobust   LLR p-value:                2.385e-180
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -6.4729      0.443    -14.619      0.000      -7.341      -5.605
gender        

In [17]:
p_pred = lr3.predict(sm.add_constant(x_test))
y_pred = round(p_pred)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, p_pred)
auc3 = auc(lr_recall, lr_precision)
print("AUC for precision-recall curve:", auc3)

AUC for precision-recall curve: 0.7124476302931475


In [18]:
from sklearn.linear_model import LogisticRegression

model4 = LogisticRegression(class_weight='balanced')

model4.fit(sm.add_constant(x_train_under), y_train_under)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
from sklearn.metrics import precision_recall_curve, auc

y_pred_prob = model4.predict_proba(sm.add_constant(x_test))[:,1]

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

auc_score = auc(recall, precision)

print('AUC for precision-recall curve:', auc_score)


AttributeError: 'SMOTE' object has no attribute 'add_constant'

In [20]:
df1 = pd.read_csv('/Users/brianoktavec/MSC550/Final/heartdisease_test-1.csv')
df1.head()

Unnamed: 0,gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0,0,0,0,0,195,106.0,70,26.97,80,77,0
1,0,46,0,0,0,0,0,0,250,121.0,81,28.73,95,76,0
2,1,48,1,20,0,0,0,0,245,127.5,80,25.34,75,70,0
3,0,61,1,30,0,0,1,0,225,150.0,95,28.58,65,103,1
4,0,46,1,23,0,0,0,0,285,130.0,84,23.1,85,85,0


In [30]:
X_new = df1.drop(columns=["TenYearCHD"])
y_new = df1["TenYearCHD"]
y_pred = model3.predict(sm.add_constant(X_new))

# Evaluate the prediction performance
print("Accuracy: ", accuracy_score(y_new, y_pred))
print("Precision: ", precision_score(y_new, y_pred))
print("Recall: ", recall_score(y_new, y_pred))
print("F1 score: ", f1_score(y_new, y_pred))
print("ROC AUC score: ", roc_auc_score(y_new, y_pred))

ValueError: shapes (4932,15) and (10,15) not aligned: 15 (dim 1) != 10 (dim 0)

In [25]:
print(X_new.shape)

(10, 14)


In [28]:
result = model2.fit()

print(result.params)


Optimization terminated successfully.
         Current function value: 0.603233
         Iterations 6
const             -6.446300
gender             0.530143
age                0.064087
currentSmoker      0.165951
cigsPerDay         0.012812
BPMeds             0.366995
prevalentStroke    1.268481
prevalentHyp       0.255983
diabetes           0.133040
totChol            0.001567
sysBP              0.015135
diaBP             -0.006922
BMI                0.014345
heartRate         -0.002699
glucose            0.006379
dtype: float64


In [29]:
print(df1.shape)

(10, 15)


In [39]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X= df.iloc[:, :-1]
y = df.iloc[:, -1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sm = SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test)

# convert predicted probabilities to binary labels
y_pred_labels = [1 if p >= 0.5 else 0 for p in y_pred]

# calculate accuracy score
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred_labels)
print('Accuracy:', accuracy)

df1 = pd.read_csv('/Users/brianoktavec/MSC550/Final/heartdisease_test-1.csv')
x_new = df1.drop(columns=['TenYearCHD'])
# x_new = sm.add_constant(x_new) # Add constant term to features
y_pred = lr.predict(x_new)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = df1['TenYearCHD']
y_pred_class = np.round(y_pred) # Round probabilities to the nearest integer to get binary predictions

# Calculate confusion matrix and classification report
cm = confusion_matrix(y_true, y_pred_class)
cr = classification_report(y_true, y_pred_class)

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred_class)

print('Confusion Matrix:\n', cm)
print('\nClassification Report:\n', cr)
print('\nAccuracy:', accuracy)

from sklearn.metrics import precision_recall_curve, f1_score

precision, recall, _ = precision_recall_curve(y_true, y_pred)
f1 = f1_score(y_true, y_pred_class)

print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.6671232876712329
Confusion Matrix:
 [[5 3]
 [0 2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.62      0.77         8
           1       0.40      1.00      0.57         2

    accuracy                           0.70        10
   macro avg       0.70      0.81      0.67        10
weighted avg       0.88      0.70      0.73        10


Accuracy: 0.7
Precision: [0.2 0.4 1. ]
Recall: [1. 1. 0.]
F1 Score: 0.5714285714285715


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
