In [1]:
# Importing dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
## import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
# Importing the input dataset
heart_df = pd.read_csv('Resources/heart_dummies.csv')
print(heart_df.shape)
heart_df.head()

(319795, 54)


Unnamed: 0,HeartDisease,PhysicalHealth,MentalHealth,SleepTime,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,Stroke_Yes,...,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
0,0,3,30,5,0,1,1,0,1,0,...,1,0,0,1,0,0,0,0,0,1
1,0,0,0,7,1,0,1,0,0,1,...,1,1,0,0,0,0,0,0,0,1
2,0,20,30,8,0,1,1,0,1,0,...,1,0,0,1,0,0,1,0,0,0
3,0,0,0,6,1,0,1,0,1,0,...,1,1,0,0,0,0,0,1,0,0
4,0,28,0,8,1,0,1,0,1,0,...,1,1,0,0,0,0,0,0,0,1


In [3]:
heart_df.columns

Index(['HeartDisease', 'PhysicalHealth', 'MentalHealth', 'SleepTime',
       'Smoking_No', 'Smoking_Yes', 'AlcoholDrinking_No',
       'AlcoholDrinking_Yes', 'Stroke_No', 'Stroke_Yes', 'DiffWalking_No',
       'DiffWalking_Yes', 'Sex_Female', 'Sex_Male', 'PhysicalActivity_No',
       'PhysicalActivity_Yes', 'Asthma_No', 'Asthma_Yes', 'KidneyDisease_No',
       'KidneyDisease_Yes', 'SkinCancer_No', 'SkinCancer_Yes',
       'BMI_Healthy_Weight', 'BMI_Obesity', 'BMI_Overweight',
       'BMI_Underweight', 'AgeCategory_18-24', 'AgeCategory_25-29',
       'AgeCategory_30-34', 'AgeCategory_35-39', 'AgeCategory_40-44',
       'AgeCategory_45-49', 'AgeCategory_50-54', 'AgeCategory_55-59',
       'AgeCategory_60-64', 'AgeCategory_65-69', 'AgeCategory_70-74',
       'AgeCategory_75-79', 'AgeCategory_80 or older',
       'Race_American Indian/Alaskan Native', 'Race_Asian', 'Race_Black',
       'Race_Hispanic', 'Race_Other', 'Race_White', 'Diabetic_No',
       'Diabetic_No, borderline diabetes', 'D

### Split our data into training and testing

In [4]:
# Removing HeartDisease target from features data
y = heart_df.loc[:,"HeartDisease"]
X = heart_df.drop(columns="HeartDisease")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [5]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(239846, 53)
(79949, 53)
(239846,)
(79949,)


In [6]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

### Create a Logistic Regression Model

In [7]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model

LogisticRegression(random_state=1)

### Fit (train) or model using the training data

In [8]:
# Train the data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

### Make predictions

In [9]:
# Predict outcomes for test data set
predictions = model.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,1
7,0,0
8,0,0
9,0,0


### Validate the model using the test data

In [10]:
accuracy_score(y_test, predictions)

0.9160339716569313

In [11]:
# Calculated the balanced accuracy score
acc_score = accuracy_score(y_test, predictions)

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[72465,   641],
       [ 6072,   771]], dtype=int64)

In [13]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual True", "Actual False"],
    columns=["Predicted True", "Predicted False"])

In [14]:
# Displaying results
print("Confusion Matrix")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

# Print the imbalanced classification report
print("Classification Report Imbalanced")
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted True,Predicted False
Actual True,72465,641
Actual False,6072,771


Accuracy Score : 0.9160339716569313
Classification Report Imbalanced
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.99      0.11      0.96      0.33      0.12     73106
          1       0.55      0.11      0.99      0.19      0.33      0.10      6843

avg / total       0.89      0.92      0.19      0.89      0.33      0.12     79949



### Random Oversampling

In [15]:
from collections import Counter
Counter(y_train)

Counter({0: 219316, 1: 20530})

In [16]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [17]:
# Counting the number of instances by class verifies that they are now equal in size
Counter(y_resampled)

Counter({0: 219316, 1: 219316})

#### Logistic Regression model

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [19]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[54743, 18363],
       [ 1511,  5332]], dtype=int64)

In [20]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.764003600072584

In [21]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.75      0.78      0.85      0.76      0.58     73106
          1       0.23      0.78      0.75      0.35      0.76      0.59      6843

avg / total       0.91      0.75      0.78      0.80      0.76      0.58     79949



While precision and recall are high for the majority class, precision is low for the minority class.

### Synthetic Minority Oversampling Technique (SMOTE)

In [22]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [23]:
Counter(y_resampled)

Counter({0: 219316, 1: 219316})

#### Logistic Regression model

In [24]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

confusion_matrix(y_test, y_pred)

print(classification_report_imbalanced(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.89      0.45      0.92      0.63      0.42     73106
          1       0.28      0.45      0.89      0.34      0.63      0.38      6843

avg / total       0.89      0.85      0.48      0.87      0.63      0.41     79949



The metrics of the minority class (precision, recall, and F1 score) are slightly improved over those of random oversampling.

### Random Undersampling

In [25]:
import pandas as pd
from path import Path
from collections import Counter

heart_df.head()

Unnamed: 0,HeartDisease,PhysicalHealth,MentalHealth,SleepTime,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,Stroke_Yes,...,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
0,0,3,30,5,0,1,1,0,1,0,...,1,0,0,1,0,0,0,0,0,1
1,0,0,0,7,1,0,1,0,0,1,...,1,1,0,0,0,0,0,0,0,1
2,0,20,30,8,0,1,1,0,1,0,...,1,0,0,1,0,0,1,0,0,0
3,0,0,0,6,1,0,1,0,1,0,...,1,1,0,0,0,0,0,1,0,0
4,0,28,0,8,1,0,1,0,1,0,...,1,1,0,0,0,0,0,0,0,1


In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [27]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 20635, 1: 20635})

#### Logistic Regression model

In [28]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [29]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[54602, 18609],
       [ 1449,  5289]], dtype=int64)

In [30]:
balanced_accuracy_score(y_test, y_pred)

0.7653839547417276

In [31]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.75      0.78      0.84      0.77      0.58     73211
          1       0.22      0.78      0.75      0.35      0.77      0.59      6738

avg / total       0.91      0.75      0.78      0.80      0.77      0.58     79949



### Cluster Centroid Undersampling

In [32]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [33]:
Counter(y_resampled)

In [34]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [35]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

balanced_accuracy_score(y_test, y_pred)

print(classification_report_imbalanced(y_test, y_pred))

These results are worse than those from random undersampling

## Combination Sampling With SMOTEENN

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

#### Logistic Regression model

In [None]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
Counter(y_resampled)

In [None]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))