In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

heart = pd.read_csv(r"C:\Users\arnuv\Downloads\heart.csv")

# first ten rows of the data set
heart.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [5]:
# missing values if there are any and what data type they are
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [6]:
# describing the data of the dataset since there are no missing values that need to be fixed
heart.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [7]:
# dataframe shape (num of rows and columns respectively)
heart.shape

(1025, 14)

In [8]:
# categorical columns
cat_columns = ["sex", "cp", "restecg", "exang", "slope", "ca", "thal"]

heart_encoded = pd.get_dummies(heart, columns= cat_columns, drop_first=True)

In [9]:
# dataframe shape after encoding
heart_encoded.shape

(1025, 23)

In [10]:
# encoded dataframe columns
heart_encoded.columns

Index(['age', 'trestbps', 'chol', 'fbs', 'thalach', 'oldpeak', 'target',
       'sex_1', 'cp_1', 'cp_2', 'cp_3', 'restecg_1', 'restecg_2', 'exang_1',
       'slope_1', 'slope_2', 'ca_1', 'ca_2', 'ca_3', 'ca_4', 'thal_1',
       'thal_2', 'thal_3'],
      dtype='object')

In [11]:
X = heart_encoded.drop('target', axis = 1)
y = heart_encoded['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 15)

In [12]:
# transforming testing data
contin_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

scaler = StandardScaler()
X_train[contin_cols] = scaler.fit_transform(X_train[contin_cols])
X_train[contin_cols] = scaler.transform(X_train[contin_cols])

# training data after being transformed and scaled
X_train.head(10)

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak,sex_1,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
516,-5.952284,-7.539629,-4.844691,0,-6.633134,0.045541,True,False,False,False,...,True,True,False,False,True,False,False,False,False,True
937,-5.903373,-7.506745,-4.794615,0,-6.582089,-1.724148,True,False,True,False,...,False,False,True,False,False,False,False,False,True,False
945,-6.001196,-7.572513,-4.849737,0,-6.574236,-0.323145,True,False,False,True,...,False,True,False,False,False,False,False,False,False,True
204,-5.878918,-7.487015,-4.816742,0,-6.593868,-1.724148,False,False,True,False,...,False,True,False,True,False,False,False,False,True,False
886,-5.940057,-7.572513,-4.823729,0,-6.617428,0.930385,True,False,False,False,...,True,True,False,True,False,False,False,False,False,True
304,-6.050107,-7.519899,-4.848573,0,-6.560493,-1.650411,False,False,True,False,...,False,True,False,False,False,False,False,False,True,False
857,-6.160157,-7.588954,-4.807037,0,-6.536933,-0.839304,True,False,False,False,...,False,True,False,False,False,False,False,False,True,False
901,-6.172385,-7.631703,-4.821788,0,-6.652766,-1.281726,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
617,-6.184613,-7.552782,-4.805873,0,-6.572272,-1.724148,False,True,False,False,...,False,False,True,False,False,False,False,False,True,False
990,-6.001196,-7.572513,-4.833046,0,-6.542823,-1.134252,True,True,False,False,...,False,False,True,False,False,False,False,False,True,False


In [13]:
# Creating the Logistic Regression model and fitting it to the training data
logistic_model = LogisticRegression(solver='liblinear', random_state=1)

logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)
y_prob = logistic_model.predict_proba(X_test)[:,1]

# for visual purposes
for row in y_pred:
    if row == 0:
        print("not diseased")
    else:
        print('diseased')

diseased
diseased
diseased
diseased
diseased
not diseased
not diseased
diseased
diseased
not diseased
diseased
not diseased
diseased
diseased
not diseased
diseased
not diseased
not diseased
not diseased
diseased
diseased
not diseased
diseased
diseased
not diseased
diseased
not diseased
diseased
diseased
not diseased
not diseased
diseased
not diseased
not diseased
diseased
not diseased
not diseased
diseased
diseased
diseased
not diseased
diseased
not diseased
not diseased
not diseased
diseased
diseased
diseased
not diseased
not diseased
diseased
not diseased
diseased
not diseased
not diseased
diseased
diseased
diseased
diseased
diseased
not diseased
not diseased
diseased
not diseased
not diseased
diseased
diseased
diseased
diseased
diseased
not diseased
diseased
diseased
not diseased
diseased
not diseased
diseased
diseased
diseased
not diseased
not diseased
diseased
not diseased
diseased
diseased
not diseased
diseased
not diseased
not diseased
not diseased
diseased
not diseased
diseased

In [14]:
# Model Evaluations
accuracy = accuracy_score(y_test, y_pred)
classification = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"accuracy score: {accuracy:.4f}")
print("\n","-"*100)
print(f"\n{classification}")
print("\n","-"*100)
print(f"\n{cm}")

accuracy score: 0.7366

 ----------------------------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.77      0.70      0.73       106
           1       0.71      0.78      0.74        99

    accuracy                           0.74       205
   macro avg       0.74      0.74      0.74       205
weighted avg       0.74      0.74      0.74       205


 ----------------------------------------------------------------------------------------------------

[[74 32]
 [22 77]]
