In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import eigh
import warnings
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from itertools import combinations
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
warnings.filterwarnings('ignore')

In [2]:
# Read the CSV at the relative path below into a DataFrame.
dataset = pd.read_csv('../data/heart_disease_dataset_UCI.csv')
# Last expression of the cell: display the first rows.
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Standardize the listed columns in place with StandardScaler; all other
# columns of `dataset` are left untouched.
standardScalar = StandardScaler()
# Columns to rescale. Despite the name, these are input features, not the
# 'target' label column.
target_columns = ['age','trestbps','chol','thalach','oldpeak']
# NOTE(review): the scaler is fit on the whole frame, and the train/test split
# happens in a later cell, so rows that end up in the test set contribute to
# the scaling statistics. Consider fitting on the training split only.
dataset[target_columns] = standardScalar.fit_transform(dataset[target_columns])
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.952197,1,3,0.763956,-0.256334,1,0,0.015443,0,1.087338,0,0,1,1
1,-1.915313,1,2,-0.092738,0.072199,0,1,1.633471,0,2.122573,0,0,2,1
2,-1.474158,0,1,-0.092738,-0.816773,0,0,0.977514,0,0.310912,2,0,2,1
3,0.180175,1,1,-0.663867,-0.198357,0,1,1.239897,0,-0.206705,2,0,2,1
4,0.290464,0,0,-0.663867,2.08205,0,1,0.583939,1,-0.379244,2,0,2,1


In [4]:
# Separate the inputs X (every column except 'target') from the label Y ('target').
X= dataset.drop(['target'], axis=1)
Y= dataset['target']
# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size=0.25, random_state=40)

In [5]:
def dist(x, y, p):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    Components are paired by position over the first ``len(x)`` entries, so
    ``y`` must be indexable at least that far.

    Parameters
    ----------
    x, y : indexable sequences of numbers
    p : number
        Order of the distance (1 = Manhattan, 2 = Euclidean).

    Returns
    -------
    float
        ``(sum_i |x[i] - y[i]| ** p) ** (1 / p)``
    """
    powered_gaps = (abs(x[idx] - y[idx]) ** p for idx in range(len(x)))
    return sum(powered_gaps) ** (1 / p)

In [6]:
def knn_sfs(distfunc, X_train, Y_train, k):
    """Leave-one-out accuracy of a k-nearest-neighbour vote on the training set.

    Each row is held out in turn, its ``k`` nearest remaining rows (Minkowski
    distance of order ``distfunc``) vote on a label, and the vote is compared
    with the row's true label.

    Parameters
    ----------
    distfunc : number
        Order ``p`` of the Minkowski distance (1 = Manhattan, 2 = Euclidean).
    X_train : pandas.DataFrame
        Feature columns, one row per sample.
    Y_train : pandas.Series
        Labels, in the same row order as ``X_train``.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    float
        Fraction of held-out rows whose label was predicted correctly.

    Raises
    ------
    ValueError
        If fewer than two samples are supplied (nothing to vote with).
    """
    features = X_train.to_numpy()
    labels = Y_train.to_numpy()
    n_samples = features.shape[0]
    if n_samples < 2:
        raise ValueError("knn_sfs needs at least two samples for leave-one-out")

    positions = np.arange(n_samples)
    correct = 0
    for held_out in positions:
        keep = positions != held_out
        # Minkowski distance from the held-out row to every other row at once.
        gaps = np.abs(features[keep] - features[held_out])
        distances = np.sum(gaps ** distfunc, axis=1) ** (1 / distfunc)
        nearest = np.argsort(distances)[:k]
        votes = Counter(labels[keep][nearest])
        predicted = votes.most_common(1)[0][0]
        # Compare label with label; the earlier scalar-vs-list comparison only
        # worked through NumPy broadcasting and was always False for plain
        # Python labels such as strings.
        if predicted == labels[held_out]:
            correct += 1
    return correct / n_samples

In [7]:
def knn(distfunction, X_train_arr, X_test_arr, Y_train, Y_test, k):
    """Classify every test point by a k-nearest-neighbour vote and score it.

    For each row of ``X_test_arr`` the Minkowski distance (order
    ``distfunction``) to every row of ``X_train_arr`` is computed with
    ``dist``; the labels of the ``k`` closest training rows are tallied and
    the most common one becomes the prediction.

    Parameters
    ----------
    distfunction : number
        Order ``p`` passed to ``dist``.
    X_train_arr, X_test_arr : iterable of rows
        Training and test feature rows.
    Y_train : pandas.Series
        Training labels, in the same row order as ``X_train_arr``.
    Y_test : array-like
        True labels for ``X_test_arr``.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    float
        ``accuracy_score`` of the predictions against ``Y_test``.
    """
    predicted_labels = []
    for query in X_test_arr:
        gaps = [dist(query, neighbour, distfunction) for neighbour in X_train_arr]
        # Index the distances by the training labels' index so the closest
        # rows can be looked up in Y_train afterwards.
        ranked = pd.DataFrame(data=gaps, index=Y_train.index, columns=['dist'])
        ranked = ranked.sort_values(by=['dist'], axis=0)
        closest_ids = ranked[:k].index
        votes = Counter(Y_train[closest_ids])
        winner = votes.most_common(1)[0][0]
        predicted_labels.append(winner)

    return accuracy_score(Y_test, predicted_labels)

In [8]:
def sfs(distfunc, k, sfsmax=6):
    """Greedy sequential forward selection of features for the KNN classifier.

    Starting from an empty set, each round tries adding every remaining column
    of the module-level ``X_train``, scores the candidate set with
    ``knn_sfs`` (against the module-level ``Y_train``), and keeps the column
    that gives the highest accuracy. Ties go to the column tried first.

    Parameters
    ----------
    distfunc : number
        Minkowski order forwarded to ``knn_sfs``.
    k : int
        Number of neighbours forwarded to ``knn_sfs``.
    sfsmax : int, optional
        Maximum number of features to select (default 6). Capped at the
        number of columns in ``X_train``.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    remaining = list(X_train.columns)
    selectfeat = []
    # Never ask for more features than exist; otherwise a round would have no
    # candidates to pick from.
    target_size = min(sfsmax, len(remaining))

    while len(selectfeat) < target_size:
        # Reset the round's winner so a stale pick from the previous round can
        # never be re-added; -1 guarantees the first candidate is accepted
        # even if its accuracy is 0.
        bestfeat = None
        bestaccuracy = -1.0
        for candidate in remaining:
            tempselect = selectfeat + [candidate]
            print('Featureset: {}'.format(tempselect))
            accuracy = knn_sfs(distfunc, X_train[tempselect], Y_train, k)
            print('Accuracy: {}'.format(accuracy))
            if accuracy > bestaccuracy:
                bestaccuracy = accuracy
                bestfeat = candidate

        selectfeat.append(bestfeat)
        remaining.remove(bestfeat)
        print("============================================")
        print("\nBest feature to add: {}".format(bestfeat))
        print("Accuracy with feature addition: {}".format(bestaccuracy))
        print("Total features added: {}\n".format(len(selectfeat)))
        print("============================================")

    print("Final featureset: {}".format(selectfeat))
    return selectfeat


In [9]:
def pca_scratch(selectfeat, X, X_fit=None):
    """Project the selected columns of ``X`` onto principal axes.

    The mean and the covariance eigenvectors are estimated from ``X_fit``
    (or from ``X`` itself when ``X_fit`` is omitted, which is the original
    behaviour). Pass the training frame as ``X_fit`` to express test data in
    the training basis instead of fitting a separate basis to it.

    Parameters
    ----------
    selectfeat : list
        Column names to use.
    X : pandas.DataFrame
        Data to project.
    X_fit : pandas.DataFrame, optional
        Data used to estimate the mean and the principal axes. Defaults to
        ``X``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_features, n_samples)``. All components are kept;
        rows follow the eigenvalue order returned by ``eigh`` (ascending).
    """
    data = X[selectfeat].to_numpy()
    reference = data if X_fit is None else X_fit[selectfeat].to_numpy()
    fit_mean = np.mean(reference, axis=0)
    # np.cov collapses to a 0-d array for a single feature; eigh needs 2-D.
    covariance = np.atleast_2d(np.cov(reference - fit_mean, rowvar=False))
    _, vect = eigh(covariance)
    X_reduced = np.dot(vect.T, (data - fit_mean).T)
    return X_reduced

In [10]:
def lda_scratch(X_train, Y_train, features, X_apply=None):
    """Project data onto linear-discriminant axes fitted from labelled data.

    Builds the within-class scatter ``Sw`` and between-class scatter ``Sb``
    from ``X_train`` / ``Y_train`` and solves ``Sb w = lambda Sw w``. The
    problem is solved in symmetric form through the Cholesky factor of ``Sw``,
    because ``inv(Sw) @ Sb`` is not symmetric in general and ``eigh`` is only
    valid for symmetric matrices.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, features)
        Data used to fit the discriminant axes.
    Y_train : array-like, shape (n_samples,)
        Class labels for ``X_train``. Every distinct label is treated as a
        class.
    features : int
        Number of feature columns; must equal ``X_train.shape[1]``.
    X_apply : array-like, shape (m, features), optional
        Data to project with the fitted axes. Defaults to ``X_train``. Use it
        to map test data into the training basis without refitting.

    Returns
    -------
    numpy.ndarray
        Projected data with ``features`` columns, ordered by ascending
        eigenvalue (the most discriminative axis is the last column).

    Raises
    ------
    ValueError
        If the shapes of ``X_train``, ``Y_train`` and ``features`` disagree.
    numpy.linalg.LinAlgError
        If ``Sw`` is not positive definite.
    """
    X_fit = np.asarray(X_train, dtype=float)
    labels = np.asarray(Y_train)
    if X_fit.ndim != 2 or X_fit.shape[1] != features:
        raise ValueError("X_train must be 2-D with `features` columns")
    if labels.shape[0] != X_fit.shape[0]:
        raise ValueError("X_train and Y_train must have the same number of rows")

    Scatter_W = np.zeros((features, features))
    Scatter_B = np.zeros((features, features))
    total_mean = np.mean(X_fit, axis=0)

    for label in np.unique(labels):
        X_class = X_fit[labels == label]
        class_mean = np.mean(X_class, axis=0)
        centered = X_class - class_mean
        Scatter_W += np.dot(centered.T, centered)

        mean_diff = (class_mean - total_mean).reshape(features, 1)
        Scatter_B += X_class.shape[0] * np.dot(mean_diff, mean_diff.T)

    # Whiten with Sw = L L^T so that L^-1 Sb L^-T is symmetric and eigh applies.
    chol = np.linalg.cholesky(Scatter_W)
    half = np.linalg.solve(chol, Scatter_B)
    whitened = np.linalg.solve(chol, half.T).T
    whitened = (whitened + whitened.T) / 2  # remove round-off asymmetry
    _, whitened_vect = eigh(whitened)
    # Map the eigenvectors back to the original feature space: w = L^-T u.
    vect = np.linalg.solve(chol.T, whitened_vect)

    target = X_fit if X_apply is None else np.asarray(X_apply, dtype=float)
    X_reduced = np.dot(target, vect)
    return X_reduced

In [11]:
# Baseline: KNN on every feature, with no selection or projection.
k = 5         # number of neighbours that vote
distfunc = 1  # P value for minkowski distance [manhattan = 1, euclidean = 2]

X_train_arr = X_train.to_numpy()
X_test_arr = X_test.to_numpy()
knn(distfunc, X_train_arr, X_test_arr, Y_train, Y_test, k)

0.8947368421052632

In [None]:
# Pick a feature subset with sequential forward selection, then score KNN on
# the test split using only those columns.
featureset = sfs (distfunc,k)
knn(distfunc, X_train[featureset].to_numpy(), X_test[featureset].to_numpy(), Y_train, Y_test, k)

Featureset: ['age']
Accuracy: 0.5198237885462555
Featureset: ['sex']
Accuracy: 0.5418502202643172
Featureset: ['cp']
Accuracy: 0.6123348017621145
Featureset: ['trestbps']
Accuracy: 0.5066079295154186
Featureset: ['chol']
Accuracy: 0.5903083700440529
Featureset: ['fbs']
Accuracy: 0.5198237885462555
Featureset: ['restecg']
Accuracy: 0.5550660792951542
Featureset: ['thalach']
Accuracy: 0.6211453744493393
Featureset: ['exang']
Accuracy: 0.7004405286343612
Featureset: ['oldpeak']
Accuracy: 0.6784140969162996
Featureset: ['slope']
Accuracy: 0.5859030837004405
Featureset: ['ca']
Accuracy: 0.7312775330396476
Featureset: ['thal']
Accuracy: 0.7444933920704846

Best feature to add: thal
Accuracy with feature addition: 0.7444933920704846
Total features added: 1

Featureset: ['thal', 'age']
Accuracy: 0.7136563876651982
Featureset: ['thal', 'sex']
Accuracy: 0.73568281938326
Featureset: ['thal', 'cp']
Accuracy: 0.7444933920704846
Featureset: ['thal', 'trestbps']
Accuracy: 0.7004405286343612
Featurese

In [None]:
pca = PCA()

# sklearn PCA on the SFS-selected features: fit on train, apply to both splits.
traindataselect = X_train[featureset]
pca.fit(traindataselect,Y_train)
A = pca.transform(traindataselect)
B = pca.transform(X_test[featureset])
print(knn(distfunc,A,B,Y_train,Y_test,k))

# sklearn PCA on all features.
pca.fit(X_train,Y_train)
C = pca.transform(X_train)
D = pca.transform(X_test)
print(knn(distfunc, C, D, Y_train, Y_test, k))

# From-scratch PCA. The test rows must be centred with the TRAINING mean and
# projected onto the TRAINING eigenvectors. Calling pca_scratch on X_test
# would fit a separate basis to the test set, whose axes (order and sign) need
# not match the training projection that KNN compares against.
E = pca_scratch(featureset,X_train).T
pca_train = X_train[featureset].to_numpy()
pca_train_mean = np.mean(pca_train, axis=0)
_, pca_train_vect = eigh(np.cov(pca_train - pca_train_mean, rowvar=False))
F = np.dot(X_test[featureset].to_numpy() - pca_train_mean, pca_train_vect)
print(knn(distfunc,E,F,Y_train, Y_test,k))

In [None]:
lda = LinearDiscriminantAnalysis()

# sklearn LDA on the SFS-selected features: fit on train, apply to both splits.
traindataselect = X_train[featureset]
lda.fit(traindataselect,Y_train)
A = lda.transform(traindataselect)
B = lda.transform(X_test[featureset])
print(knn(distfunc,A,B,Y_train,Y_test,k))

# sklearn LDA on all features.
lda.fit(X_train,Y_train)
C = lda.transform(X_train)
D = lda.transform(X_test)
print(knn(distfunc, C, D, Y_train, Y_test, k))

# From-scratch LDA. Fit on the training split only: fitting a second model on
# X_test with Y_test would leak the test labels and put the test points in a
# different basis from the training points.
lda_train = traindataselect.to_numpy()
E = lda_scratch(lda_train,Y_train,len(featureset))
# lda_scratch returns only the projected data (lda_train @ W), so recover the
# linear map W by least squares and apply the same map to the test rows.
lda_projection, *_ = np.linalg.lstsq(lda_train, E, rcond=None)
F = np.dot(X_test[featureset].to_numpy(), lda_projection)
print(knn(distfunc,E,F,Y_train, Y_test,k))