In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from scipy.special import expit
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [2]:
url = 'https://raw.githubusercontent.com/apd2599/Training_2021_CIRCUIT/main/Heart_Attack/heart.csv'
#df = pd.read_csv('heart.csv')
df = pd.read_csv(url)

Description of the columns in the dataset

Age : Age of the person

Sex : Gender of the person

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : Chest Pain type chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholestoral in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalach : maximum heart rate achieved

target : 0= less chance of heart attack 1= more chance of heart attack

In [3]:
#look at the data
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
target = df['output']
df = df.drop('output', axis=1)

In [5]:
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [6]:
df['cp'] = df['cp'].map({0: 'no chest pain', 1: 'typical angina', 2: 'atypical angina', 3: 'non-anginal pain', 4: 'asymptomatic'})


In [7]:
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,non-anginal pain,145,233,1,0,150,0,2.3,0,0,1
1,37,1,atypical angina,130,250,0,1,187,0,3.5,0,0,2
2,41,0,typical angina,130,204,0,0,172,0,1.4,2,0,2
3,56,1,typical angina,120,236,0,1,178,0,0.8,2,0,2
4,57,0,no chest pain,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,no chest pain,140,241,0,1,123,1,0.2,1,0,3
299,45,1,non-anginal pain,110,264,0,1,132,0,1.2,1,0,3
300,68,1,no chest pain,144,193,1,1,141,0,3.4,1,2,3
301,57,1,no chest pain,130,131,0,1,115,1,1.2,1,1,3


In [8]:
pd.get_dummies(df)

Unnamed: 0,age,sex,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,cp_atypical angina,cp_no chest pain,cp_non-anginal pain,cp_typical angina
0,63,1,145,233,1,0,150,0,2.3,0,0,1,0,0,1,0
1,37,1,130,250,0,1,187,0,3.5,0,0,2,1,0,0,0
2,41,0,130,204,0,0,172,0,1.4,2,0,2,0,0,0,1
3,56,1,120,236,0,1,178,0,0.8,2,0,2,0,0,0,1
4,57,0,120,354,0,1,163,1,0.6,2,0,2,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,1,123,1,0.2,1,0,3,0,1,0,0
299,45,1,110,264,0,1,132,0,1.2,1,0,3,0,0,1,0
300,68,1,144,193,1,1,141,0,3.4,1,2,3,0,1,0,0
301,57,1,130,131,0,1,115,1,1.2,1,1,3,0,1,0,0


In [9]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df)
enc.categories_

[array([29, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 74, 76, 77]),
 array([0, 1]),
 array(['atypical angina', 'no chest pain', 'non-anginal pain',
        'typical angina'], dtype=object),
 array([ 94, 100, 101, 102, 104, 105, 106, 108, 110, 112, 114, 115, 117,
        118, 120, 122, 123, 124, 125, 126, 128, 129, 130, 132, 134, 135,
        136, 138, 140, 142, 144, 145, 146, 148, 150, 152, 154, 155, 156,
        160, 164, 165, 170, 172, 174, 178, 180, 192, 200]),
 array([126, 131, 141, 149, 157, 160, 164, 166, 167, 168, 169, 172, 174,
        175, 176, 177, 178, 180, 182, 183, 184, 185, 186, 187, 188, 192,
        193, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, 207,
        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
        221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
        234, 235, 236, 237, 239, 240, 241, 2

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.33, random_state=42)

In [11]:
def Log_Reg(X=X_train, y=y_train, X2=X_test, y2=y_test):
    clf = LogisticRegression().fit(X, y)
    print(clf.predict(X2))
    
    print(clf.score(X, y))
    print(classification_report(y2, clf.predict(X2)))

    #plt.figure(1, figsize=(4,3))
    #plt.clf()
    #plt.scatter(X, y, color='black', zorder=20)
    #loss = expit(X2 * clf.coef_ + clf.intercept_).ravel()
    #plt.plot(X2, loss, color='red', linewidth=3)

    logit_roc_auc = roc_auc_score(y_test, clf.predict(X_test))
    fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

In [12]:
Log_Reg(X_train, y_train, X_test, y_test)

ValueError: could not convert string to float: 'typical angina'