In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
file_prepend = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed."

In [3]:
cities = ["cleveland.data", "hungarian.data", "switzerland.data", "va.data"]

In [4]:
output = pd.DataFrame()
for city in cities:
    loaded = pd.read_csv(file_prepend + city, names= ["age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang","oldpeak","slope","ca","thal","num"]).reset_index()
    output = pd.concat([output, loaded])

In [5]:
output.head(5)

Unnamed: 0,index,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0,63.0,1.0,1.0,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,1,67.0,1.0,4.0,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,2,67.0,1.0,4.0,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,3,37.0,1.0,3.0,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,4,41.0,0.0,2.0,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [7]:
output = output.replace("?", np.nan)

In [10]:
output = output.dropna(axis=0)

In [11]:
numitems = len(output)
percenttrain = 0.85
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print ('Training set', numtrain, 'items')
print ('Test set', numtest, 'items')
outputTrain = output[0:numtrain]
outputTest = output[numtrain:]

Training set 254 items
Test set 45 items


In [12]:
features = list(output.keys())[:-1]

In [13]:
outputTrain

Unnamed: 0,index,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0,63.0,1.0,1.0,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,1,67.0,1.0,4.0,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,2,67.0,1.0,4.0,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,3,37.0,1.0,3.0,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,4,41.0,0.0,2.0,130,204,0,2,172,0,1.4,1,0.0,3.0,0
5,5,56.0,1.0,2.0,120,236,0,0,178,0,0.8,1,0.0,3.0,0
6,6,62.0,0.0,4.0,140,268,0,2,160,0,3.6,3,2.0,3.0,3
7,7,57.0,0.0,4.0,120,354,0,0,163,1,0.6,1,0.0,3.0,0
8,8,63.0,1.0,4.0,130,254,0,2,147,0,1.4,2,1.0,7.0,2
9,9,53.0,1.0,4.0,140,203,1,2,155,1,3.1,3,0.0,7.0,1


In [16]:
neighbors = 8
classifier = KNeighborsClassifier(neighbors)
classifier.fit(outputTrain[features], outputTrain['num'])
predictions = classifier.predict(outputTest[features])


# Calculate accuracy
prediccionesCorrectas=[predictions==outputTest['num'].values]
accuracy= np.sum(prediccionesCorrectas) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.4


In [17]:
trees = 100
rf = RandomForestClassifier(n_estimators=trees, class_weight="balanced")
rf.fit(outputTrain[features],outputTrain['num'])
predictions = rf.predict(outputTest[features])

# Calculate accuracy
prediccionesCorrectas=[predictions==outputTest['num'].values]
accuracy= np.sum(prediccionesCorrectas) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.4666666666666667


In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
confusion_matrix(outputTest['num'],predictions)

array([[18,  0,  0,  0,  0],
       [ 9,  1,  0,  2,  0],
       [ 2,  1,  2,  4,  0],
       [ 2,  2,  0,  0,  0],
       [ 0,  0,  1,  1,  0]])

In [84]:
relevant_features = [features[i] for i, value in enumerate(rf.feature_importances_ ) if rf.feature_importances_[i]>0.05]

In [88]:
trees = 100
rf = RandomForestClassifier(n_estimators=trees)
rf.fit(outputTrain[relevant_features],outputTrain['num'])
predictions = rf.predict(outputTest[relevant_features])

# Calculate accuracy
prediccionesCorrectas=[predictions==outputTest['num'].values]
accuracy= np.sum(prediccionesCorrectas) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.32608695652173914
