In [52]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing Libraries 

In [53]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import os

# 2. Reading Data Set

In [54]:
# Read the heart-disease CSV into `df` and preview its first rows.
csv_path = "../input/heart-disease-uci/heart.csv"
df = pd.read_csv(csv_path)
df.head()

# 3. Dataset Exploration 

This section will explore the data set that has been imported.

**3.1 Target Distribution**

In [55]:
# Number of rows for each distinct value of the `target` column.
df["target"].value_counts()

In [56]:
# Bar chart of the row count per `target` value.
sns.countplot(data=df, x="target", palette="bwr")
plt.show()

In [57]:
# Share of rows with target == 0 and target == 1, as a percentage of all rows.
total_patients = len(df.target)
countNotSick = int((df.target == 0).sum())
countSick = int((df.target == 1).sum())
print("Percentage of patients who are not sick: {:.2f}%".format(countNotSick / total_patients * 100))
print("Percentage of patients who are sick: {:.2f}%".format(countSick / total_patients * 100))

In [58]:
# Bar chart of the row count per value of the `sex` column.
ax = sns.countplot(data=df, x='sex', palette="mako_r")
ax.set_xlabel("Gender (0 = Female, 1= Male)")
plt.show()

In [59]:
# Share of rows with sex == 0 and sex == 1, as a percentage of all rows.
total_rows = len(df.sex)
countFemale = int((df.sex == 0).sum())
countMale = int((df.sex == 1).sum())
print("Female percentage: {:.2f}%".format(countFemale / total_rows * 100))
print("Male percentage: {:.2f}%".format(countMale / total_rows * 100))

In [60]:
# Per-column mean of every feature, computed separately for each `target` value.
df.groupby(by="target").mean()

In [61]:
# Grouped bar chart: for each age, the number of rows per `target` value.
# The figure is also written to 'heartDiseaseAndAges.png' in the working directory.
age_by_target = pd.crosstab(df.age, df.target)
ax = age_by_target.plot(kind="bar", figsize=(20, 6))
ax.set_title('Heart Disease Frequency based on Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.savefig('heartDiseaseAndAges.png')
plt.show()

In [62]:
# Grouped bar chart: for each sex, the number of rows per `target` value.
sex_by_target = pd.crosstab(df.sex, df.target)
ax = sex_by_target.plot(kind="bar", figsize=(15, 6), color=['#20639B', '#ED553B'])
ax.set_title('Heart Disease Frequency based on Gender')
ax.set_xlabel('Sex (0 = Female, 1 = Male)')
plt.xticks(rotation=0)
# Crosstab columns are the target values in sorted order (0, 1), hence this label order.
ax.legend(["Not Sick", "Sick"])
ax.set_ylabel('Frequency')
plt.show()

In [63]:
# Scatter of age against `thalach`, one colour per target value.
# Target 1 is drawn first so the legend order below ("Sick", "Not Sick") matches.
for target_value, colour in ((1, "red"), (0, "green")):
    in_group = df.target == target_value
    plt.scatter(x=df.age[in_group], y=df.thalach[in_group], c=colour)
plt.legend(["Sick", "Not Sick"])
plt.xlabel("Age")
plt.ylabel("Heart Rate Max")
plt.show()

In [64]:
# Grouped bar chart: for each `slope` value, the number of rows per `target` value.
pd.crosstab(df.slope,df.target).plot(kind="bar",figsize=(15,6),color=['#6C5B7B','#F8B195' ])
plt.title('Heart Disease Frequency based on Slope')
plt.xlabel('The Slope of The Peak Exercise ST Segment ')
plt.xticks(rotation = 0)
# Replace the default "target / 0 / 1" legend with the same readable labels the
# sex and FBS charts use; crosstab columns are ordered 0, 1.
plt.legend(["Not Sick", "Sick"])
plt.ylabel('Frequency')
plt.show()

In [65]:
# Grouped bar chart: for each `fbs` value, the number of rows per `target` value.
fbs_by_target = pd.crosstab(df.fbs, df.target)
ax = fbs_by_target.plot(kind="bar", figsize=(15, 6), color=['#009999', '#00FF00'])
ax.set_title('Heart Disease Frequency According To FBS')
ax.set_xlabel('FBS > 120 mg/dl (1 = true; 0 = false)')
plt.xticks(rotation=0)
# Crosstab columns are the target values in sorted order (0, 1), hence this label order.
ax.legend(["Not Sick", "Sick"])
ax.set_ylabel('Frequency Sick/Not Sick')
plt.show()

In [66]:
# Grouped bar chart: for each `cp` value, the number of rows per `target` value.
pd.crosstab(df.cp,df.target).plot(kind="bar",figsize=(15,6),color=['#0000CC','#FFFF99' ])
plt.title('Heart Disease Frequency According To Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.xticks(rotation = 0)
# Replace the default "target / 0 / 1" legend with the same readable labels the
# sex and FBS charts use; crosstab columns are ordered 0, 1.
plt.legend(["Not Sick", "Sick"])
plt.ylabel('Frequency Sick/Not Sick')
plt.show()

In [67]:
# One-hot encode the `cp`, `thal` and `slope` columns (indicator columns are
# prefixed with the source column name) and append them to the right of `df`.
# NOTE: this rebinds `df`, so running the cell twice appends the indicators twice.
dummy_frames = [pd.get_dummies(df[col], prefix=col) for col in ('cp', 'thal', 'slope')]
df = pd.concat([df, *dummy_frames], axis=1)
df.head()

In [68]:
# Remove the three raw categorical columns from `df` and preview the result.
df = df.drop(['cp', 'thal', 'slope'], axis=1)
df.head()

# 4. Models

In [69]:
# Separate the label vector from the feature matrix.
y = df.target.values
x_data = df.drop(['target'], axis = 1)

# Min-max normalization, column by column, so every feature ends up in [0, 1].
# The reductions are called with an explicit axis=0: `np.min(frame)` / `np.max(frame)`
# delegate to DataFrame.min/max(axis=None), which on pandas >= 2.0 collapses the
# whole frame to one scalar and would scale all features by a single global range.
# Casting to float first keeps the arithmetic valid when the one-hot columns are
# boolean (subtracting booleans is not supported).
features = x_data.astype(float)
col_min = features.min(axis=0)
col_range = features.max(axis=0) - col_min
# A constant column has range 0; divide by 1 instead so it becomes all zeros, not NaN.
col_range = col_range.replace(0, 1)
x = (features - col_min) / col_range

# NOTE(review): the min/max above are computed on the full data set before the
# split, so the test rows influence the scaling of the training rows.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=0)

In [70]:
# This cell is the first one to call log_loss, so it imports the function itself;
# without this the cell raises NameError on a fresh kernel (Restart & Run All),
# because the only other import of log_loss sits in a later cell.
from sklearn.metrics import log_loss

# Fit a logistic regression on the training split, then report its test-set
# accuracy and the log loss of its predicted class probabilities.
lr = LogisticRegression()
lr.fit(x_train,y_train)
print("Test Accuracy {:.2f}%".format(lr.score(x_test,y_test)*100))
probs = lr.predict_proba(x_test)
print(log_loss(y_test, probs))

In [71]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 2-nearest-neighbours classifier on the training split, keep its test-set
# predictions in `prediction`, and print its test accuracy.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train, y_train)
prediction = knn.predict(x_test)
knn_accuracy = knn.score(x_test, y_test)
print("{} NN Score: {:.2f}%".format(2, knn_accuracy*100))

In [72]:
from sklearn.metrics import log_loss
# Log loss of the KNN model's predicted class probabilities on the test split.
# `probs` is rebound here to the KNN probabilities.
probs = knn.predict_proba(x_test)
knn_log_loss = log_loss(y_test, probs)
print(knn_log_loss)

In [73]:
# Test-set accuracy of a KNN classifier for every k from 1 to 24, plotted against k.
# The best accuracy over the sweep is printed at the end.
k_values = range(1, 25)
scoreList = [
    KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).score(x_test, y_test)
    for k in k_values
]

plt.plot(k_values, scoreList)
plt.xticks(np.arange(1, 25, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()
print("KNN Score Max {:.2f}%".format(max(scoreList)*100))

In [74]:
from sklearn.svm import SVC

# Fit a support vector classifier (seeded with random_state=1) on the training split.
# The bare name on the last line keeps the fitted estimator as the cell's output.
svm = SVC(random_state=1).fit(x_train, y_train)
svm

In [75]:
# Test-set accuracy of the fitted SVC, printed as a percentage.
svm_accuracy = svm.score(x_test, y_test)
print("SVM ALgorithm Test Accuracy: {:.2f}%".format(svm_accuracy*100))

In [76]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the training split and print its test accuracy.
nbg = GaussianNB().fit(x_train, y_train)
nbg_accuracy = nbg.score(x_test, y_test)
print("Accuracy of Gaussian Naive Bayes: {:.2f}%".format(nbg_accuracy*100))

In [77]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the training split and print its test accuracy.
# random_state is fixed so the fitted tree, and therefore the printed accuracy,
# is the same on every run; scikit-learn permutes candidate features at each
# split, so an unseeded tree can differ between runs. The seed value 1 matches
# the one used for the SVC and random forest in this notebook.
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(x_train, y_train)
print("Decision Tree Test Accuracy {:.2f}%".format(dtc.score(x_test, y_test)*100))

In [78]:
# Test-set accuracy of a decision tree for every max_leaf_nodes from 2 to 24,
# plotted against the leaf limit. The best accuracy over the sweep is printed.
# Every tree is seeded (random_state=1, the seed used for the SVC and random
# forest in this notebook) so the curve and the reported maximum are reproducible.
scoreListDT = []
for i in range(2,25):
    dtc2 = DecisionTreeClassifier(max_leaf_nodes=i, random_state=1)
    dtc2.fit(x_train, y_train)
    scoreListDT.append(dtc2.score(x_test, y_test))

plt.plot(range(2,25), scoreListDT)
plt.xticks(np.arange(2,25,1))
plt.xlabel("Leaf")
plt.ylabel("Score")
plt.show()
print("DT Score Max {:.2f}%".format((max(scoreListDT))*100))

In [79]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 1000-tree random forest (seeded with random_state=1) on the training
# split and print its test accuracy.
rf = RandomForestClassifier(n_estimators=1000, random_state=1).fit(x_train, y_train)
rf_accuracy = rf.score(x_test, y_test)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf_accuracy*100))

In [80]:
# Test-set accuracy of a 1000-tree random forest for every max_leaf_nodes from
# 2 to 24, plotted against the leaf limit. The best accuracy is printed.
leaf_limits = range(2, 25)
scoreListRF = [
    RandomForestClassifier(n_estimators=1000, random_state=1, max_leaf_nodes=leaves)
    .fit(x_train, y_train)
    .score(x_test, y_test)
    for leaves in leaf_limits
]

plt.plot(leaf_limits, scoreListRF)
plt.xticks(np.arange(2, 25, 1))
plt.xlabel("Leaf")
plt.ylabel("Score")
plt.show()
print("RF Score Max {:.2f}%".format(max(scoreListRF)*100))

# 5. Accuracy and Metrics

In [81]:
# Summary table of model accuracies (in percent), shown best-first.
# NOTE(review): these accuracies are literal values typed into the cell, not
# read from the models fitted above, so they will not track a re-run.
model_accuracy_rows = [
    ("Logistic Regression", 86.89),
    ("KNN", 88.52),
    ("SVM", 88.52),
    ("Gaussian NB", 86.89),
    ("Decision Tree", 85.25),
    ("Random Forest", 90.16),
]
compare = pd.DataFrame(model_accuracy_rows, columns=['Model', 'Accuracy'])
compare.sort_values(by='Accuracy', ascending=False)