### Importing all the packages needed for scikit-learn, and so on 

In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

### Reading the heart dataset and assigning it to a Pandas DataFrame

In [11]:
df = pd.DataFrame()
df = pd.read_csv('https://raw.githubusercontent.com/caceresdaniel/DataScience/master/Heart_s.csv')

### Splitting the data set into testing and training sets with test size = 0.25 and random state = 4

In [12]:
features = ['Age','RestBP','Chol','RestECG','MaxHR','Oldpeak']

x = df[features]

y = df['AHD']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state=4)

### Running K nearest neighbors on the data set

In [13]:
k = 3

knn = KNeighborsClassifier(n_neighbors=k)

knn.fit(x_train, y_train)
y_predict = knn.predict(x_test)

knnacc = accuracy_score(y_test, y_predict)

print(knnacc)

0.6447368421052632


### Running Decision tree with random state 5 on the data set

In [14]:
heartDT = DecisionTreeClassifier(random_state=5)

heartDT.fit(x_train, y_train)

dtPredict = heartDT.predict(x_test)

dtacc = accuracy_score(y_test, dtPredict)

print(dtacc)

0.618421052631579


### Running Logistic Regression on the data set

In [15]:


heartlogreg = LogisticRegression()

heartlogreg.fit(x_train, y_train)

logregPredict = heartlogreg.predict(x_test)

logregacc = accuracy_score(y_test, logregPredict)

print(logregacc)

0.7368421052631579


#### E) by looking at the generated accuracys we can see that Logistic Regression gives the best accuracy for this dataset. As for the worst accuracy KNN seems to give the least satisfying result.

### Doing OneHotEncoding sadly in a inneficient way because I was unable to figure out how to do it with the tools provided by scikitlearn, basically made arrays for every new column to be later added to the dataframe, for loop that goes through each row to see what value it is and appends to the new columns a binary represintation of the row data, in this case doing it for gender

In [16]:
ms = []
fs = []

for x in df['Gender']:
    if x == 'm':
        ms.append(1)
        fs.append(0)
    elif x == 'f':
        ms.append(0)
        fs.append(1)
        
ms = pd.Series(ms)
fs = pd.Series(fs)
df['m'] = ms.values
df['f'] = fs.values

del df['Gender']

### Same as above but doing it for ChestPain

In [17]:
typ = []
nontyp = []
asy = []
nonang = []

for x in df['ChestPain']:
    if x == 'typical':
        typ.append(1)
        nontyp.append(0)
        asy.append(0)
        nonang.append(0)
    elif x == 'nontypical':
        typ.append(0)
        nontyp.append(1)
        asy.append(0)
        nonang.append(0)
    elif x == 'nonanginal':
        typ.append(0)
        nontyp.append(0)
        asy.append(0)
        nonang.append(1)
    elif x =='asymptomatic':
        typ.append(0)
        nontyp.append(0)
        asy.append(1)
        nonang.append(0)

typ = pd.Series(typ)
nontyp = pd.Series(nontyp)
asy = pd.Series(asy)
nonang = pd.Series(nonang)
df['typical'] = typ.values
df['nontypical'] = nontyp.values
df['asymptomatic'] = asy.values
df['nonanginal'] = nonang.values

del df['ChestPain']

### Same as above but doing this for Thal, had to add an aditional else statement because Thal has some missing values represented as NA

In [18]:
fix = []
norm = []
rev = []

for x in df['Thal']:
    if x == 'fixed':
        fix.append(1)
        norm.append(0)
        rev.append(0)
    elif x == 'normal':
        fix.append(0)
        norm.append(1)
        rev.append(0)
    elif x == 'reversable':
        fix.append(0)
        norm.append(0)
        rev.append(1)
    else:
        fix.append(0)
        norm.append(0)
        rev.append(0)
        
fix = pd.Series(fix)
norm = pd.Series(norm)
rev = pd.Series(rev)
df['fixed'] = fix.values
df['normal'] = norm.values
df['reversable'] = rev.values

del df['Thal']


### Running KNN on the edited data set that now has OneHotEncoding

In [19]:
newFeats = ['Age','RestBP','Chol','RestECG','MaxHR','Oldpeak', 'm','f','typical','nontypical','asymptomatic','nonanginal','fixed','normal','reversable']

newx = df[newFeats]

x_train, x_test, y_train, y_test = train_test_split(newx, y, test_size=0.25,random_state=4)

knn.fit(x_train, y_train)
y_predict = knn.predict(x_test)

knnacc = accuracy_score(y_test, y_predict)

print(knnacc)


0.6710526315789473


### Running decistion tree on the edited data set that now has OneHotEncoding

In [20]:
xdttrain, xdttest, ydttrain, ydttest = train_test_split(newx, y, test_size=0.25,random_state=5)

heartDT.fit(xdttrain, ydttrain)

dtPredict = heartDT.predict(xdttest)

dtacc = accuracy_score(ydttest, dtPredict)

print(dtacc)

0.8026315789473685


### Running Logistic regression on the edited data set that now has OneHotEncoding


In [21]:
xlrtrain, xlrtest, ylrtrain, ylrtest = train_test_split(newx, y, test_size=0.25,random_state=4)

heartlogreg.fit(xlrtrain, ylrtrain)

logregPredict = heartlogreg.predict(xlrtest)

logregacc = accuracy_score(ylrtest, logregPredict)

print(logregacc)

0.8157894736842105


#### G) for each method of testing there was an increase of accuracy the one that had the most increase was Logistic Regression which went from 73% to 81% accuracy

### Running Cross-Validation with Decision Tree on the edited data set that now has OneHotEncoding

In [22]:
dtacclist = cross_val_score(heartDT, newx, y, cv=10, scoring='accuracy')
print(dtacclist)

[0.70967742 0.74193548 0.77419355 0.74193548 0.8        0.73333333
 0.63333333 0.56666667 0.66666667 0.68965517]


### Running Logistic Regressionwith Decision Tree on the edited data set that now has OneHotEncoding

In [23]:
logregacclist = cross_val_score(heartlogreg, newx, y, cv=10, scoring='accuracy')
print(logregacclist)

[0.77419355 0.80645161 0.87096774 0.87096774 0.9        0.66666667
 0.8        0.83333333 0.8        0.79310345]


### Running KNN with Decision Tree on the edited data set that now has OneHotEncoding

In [256]:
knnacclist = cross_val_score(knn, newx, y, cv=10, scoring='accuracy')
print(knnacclist)

[0.70967742 0.64516129 0.48387097 0.64516129 0.6        0.46666667
 0.66666667 0.66666667 0.53333333 0.72413793]
