In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import random
from scipy import stats
from sklearn import neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the dataset; the path is resolved relative to the current working directory
DATA_FILE = "heart.csv"
data = pd.read_csv(DATA_FILE)

In [3]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is reassigned rather than mutated in place; the surviving rows keep
# their original index labels, so the index is not renumbered here.
data = data.drop_duplicates(keep='first')

In [5]:
# Columns left out of the feature matrix, including 'output', which is used as the target
excluded_columns = [
    'age', 'trtbps', 'chol', 'fbs', 'restecg',
    'slp', 'sex', 'caa', 'thalachh', 'output',
]
X = data.drop(columns=excluded_columns)

In [6]:
# Display the feature matrix to check which columns remain after the drop
X

Unnamed: 0,cp,exng,oldpeak,thall
0,3,0,2.3,1
1,2,0,3.5,2
2,1,0,1.4,2
3,1,0,0.8,2
4,0,1,0.6,2
...,...,...,...,...
298,0,1,0.2,3
299,3,0,1.2,3
300,0,0,3.4,3
301,0,1,1.2,3


In [7]:
# Target vector: the 'output' column of the loaded frame
target_column = 'output'
y = data[target_column]

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Learn the scaling statistics from the training split only
scaler = StandardScaler().fit(X_train)
# Apply those same statistics to both splits; the test split is never used for fitting
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# 1. Logistic regression
# Build the classifier and fit it on the scaled training split (fit returns the estimator)
logistic_regression = LogisticRegression().fit(X_train, y_train)
# Mean accuracy on the held-out test split
logistic_regression_score = logistic_regression.score(X_test, y_test)
print('Accuracy:', logistic_regression_score)

Accuracy: 0.819672131147541


In [11]:
# 2. k-nearest neighbours with k=3
# Fit on the scaled training split (fit returns the estimator)
KNN_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Mean accuracy on the scaled test split
KNN_classifier_score = KNN_classifier.score(X_test, y_test)
print('Accuracy:', KNN_classifier_score)

Accuracy: 0.8032786885245902


In [12]:
# 3. Support vector machine with a linear kernel
# Fit on the scaled training split (fit returns the estimator)
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)
# Mean accuracy on the held-out test split
svm_classifier_score = svm_classifier.score(X_test, y_test)
print('Accuracy:', svm_classifier_score)

Accuracy: 0.8524590163934426


In [13]:
# 4. Decision tree classifier
# random_state is pinned (same seed as the train/test split) so that refitting on a
# fresh kernel yields the same tree; an unseeded tree can differ between runs.
decision_classifier = DecisionTreeClassifier(random_state=42)
# Train the classifier on the training data
decision_classifier.fit(X_train, y_train)
# Mean accuracy on the held-out test split
decision_classifier_score = decision_classifier.score(X_test, y_test)
print('Accuracy:', decision_classifier_score)

Accuracy: 0.7540983606557377


In [14]:
# 5. Random forest classifier (100 trees)
# random_state is pinned (same seed as the train/test split) so the randomised
# forest construction is repeatable and the reported accuracy is stable across runs.
random_classifer = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the classifier on the training data
random_classifer.fit(X_train, y_train)
# Mean accuracy on the held-out test split
random_classifer_score = random_classifer.score(X_test, y_test)
print('Accuracy:', random_classifer_score)

Accuracy: 0.7704918032786885


In [15]:
# 6. XGBoost classifier with library-default hyperparameters
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(
    X_train,
    y_train,
)
# Mean accuracy on the held-out test split
xgb_classifier_score = xgb_classifier.score(
    X_test,
    y_test,
)
print('Accuracy:', xgb_classifier_score)



Accuracy: 0.7868852459016393


In [16]:
# 7. Gaussian naive Bayes
# Fit on the scaled training split (fit returns the estimator)
naive_classifier = GaussianNB().fit(X_train, y_train)
# Mean accuracy on the held-out test split
naive_classifier_score = naive_classifier.score(X_test, y_test)
print('Accuracy:', naive_classifier_score)

Accuracy: 0.8360655737704918


In [19]:
# Fitted classifiers and their display names, kept in matching order
models = [
    logistic_regression,
    KNN_classifier,
    svm_classifier,
    decision_classifier,
    random_classifer,
    xgb_classifier,
    naive_classifier,
]
model_names = [
    'Logistic regression',
    'KNN classifier',
    'SVM Classifier',
    'Decision Tree classifier',
    'Random Forest Classifier',
    'XGB Classifier',
    'Naive Bayes Classifier',
]

# One score list per metric, appended to in model order
accuracy, precision, recall, f1 = [], [], [], []
for model in models:
    # Predict once per model and reuse the predictions for every metric
    y_pred = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    # Precision, recall and F1 all use macro averaging
    precision.append(precision_score(y_test, y_pred, average='macro'))
    recall.append(recall_score(y_test, y_pred, average='macro'))
    f1.append(f1_score(y_test, y_pred, average='macro'))

# Assemble the comparison table: one row per model, one column per metric
evaluation = {
    'Model': model_names,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
df = pd.DataFrame(evaluation)
# Print DataFrame
print(df)

                      Model  Accuracy  Precision    Recall  F1 Score
0       Logistic regression  0.819672   0.819355  0.820043  0.819478
1            KNN classifier  0.803279   0.807359  0.806034  0.803226
2            SVM Classifier  0.852459   0.852814  0.851293  0.851822
3  Decision Tree classifier  0.754098   0.760349  0.757543  0.753834
4  Random Forest Classifier  0.770492   0.774351  0.773168  0.770430
5            XGB Classifier  0.786885   0.793573  0.790409  0.786656
6    Naive Bayes Classifier  0.836066   0.840368  0.838901  0.836022
