In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter
import operator
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing as pre
import itertools
import time

In [2]:
df = pd.read_csv('../data/heart.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [6]:
class_grp = df.groupby(['target'])

In [7]:
class_analysis = class_grp[['age','trestbps','chol','thalach','oldpeak']].agg(['mean','median','min','max']).T

In [8]:
class_analysis['Difference (0 - 1)'] = class_analysis[0].sub(class_analysis[1])

In [9]:
class_analysis

Unnamed: 0,target,0,1,Difference (0 - 1)
age,mean,56.601449,52.49697,4.10448
age,median,58.0,52.0,6.0
age,min,35.0,29.0,6.0
age,max,77.0,76.0,1.0
trestbps,mean,134.398551,129.30303,5.09552
trestbps,median,130.0,130.0,0.0
trestbps,min,100.0,94.0,6.0
trestbps,max,200.0,180.0,20.0
chol,mean,251.086957,242.230303,8.856653
chol,median,249.0,234.0,15.0


In [10]:
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang']
for cols in cat_cols:
    print(class_grp[cols].value_counts(normalize = True))
    print()

target  sex
0       1      0.826087
        0      0.173913
1       1      0.563636
        0      0.436364
Name: sex, dtype: float64

target  cp
0       0     0.753623
        2     0.130435
        1     0.065217
        3     0.050725
1       2     0.418182
        1     0.248485
        0     0.236364
        3     0.096970
Name: cp, dtype: float64

target  fbs
0       0      0.840580
        1      0.159420
1       0      0.860606
        1      0.139394
Name: fbs, dtype: float64

target  restecg
0       0          0.572464
        1          0.405797
        2          0.021739
1       1          0.581818
        0          0.412121
        2          0.006061
Name: restecg, dtype: float64

target  exang
0       1        0.550725
        0        0.449275
1       0        0.860606
        1        0.139394
Name: exang, dtype: float64



In [11]:
train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 1692, shuffle = True)
train_df, val_df = train_test_split(train_df, test_size = 0.25, random_state = 2098, shuffle = True)

In [12]:
def df_to_numpy(data):
    data_labels_np = data.pop('target').to_numpy()
    data_features_np = data.to_numpy()
    return data_features_np,data_labels_np

X_train, y_train = df_to_numpy(train_df)
X_val, y_val = df_to_numpy(val_df)
X_test, y_test = df_to_numpy(test_df)

In [13]:
def euclidianDist(array1, array2):
    diff_array = array1 - array2
    sq_diff_array = diff_array ** 2
    sq_distance = sq_diff_array.sum(axis = 1)
    distance_array = sq_distance ** 0.5
    return distance_array

def hammingDist(array1, array2):
    diff_array = np.absolute(array1 - array2)
    sum_array = diff_array.sum(axis = 1)
    hammingDistance = sum_array/array1.shape[1]
    return hammingDistance

def manhattanDist(array1, array2):
    diff_array = np.absolute(array1 - array2)
    sum_array = diff_array.sum(axis = 1)
    return sum_array

def minkowskiDist(array1, array2, p):
    diff_array = np.absolute(array1 - array2)
    powerP = diff_array ** p
    sum_array = powerP.sum(axis = 1)
    minkowskiDist = sum_array ** (1/p)
    return minkowskiDist
    
    
    
    

    
def kNN(test_array, Xtrain, ytrain, k, choice, p = 0):
    predictions = []
    for instance in test_array:
        #create an array by duplicating instance
        instance_array = np.tile(instance, (ytrain.shape[0],1))
        #calculate euclidian distance
        if choice == 0:
            distArray = euclidianDist(Xtrain, instance_array)
        if choice == 1:
            distArray = hammingDist(Xtrain, instance_array)
        if choice == 2:
            distArray = manhattanDist(Xtrain, instance_array)
        if choice == 3:
            distArray = minkowskiDist(Xtrain, instance_array, p)
        #sort using argsort
        sortIndices_k = distArray.argsort()[0:k]
        labels_k = ytrain[sortIndices_k]
        countLabels = dict(Counter(labels_k))
        predictions.append(max(countLabels.items(), key=operator.itemgetter(1))[0])
    return np.array(predictions)
    
# kNN(X_val, X_train, y_train, 5, 0, 2)

    
    
    
    

In [14]:
def parameterTuning(Xval, yval, Xtrain, ytrain):
    parameter = {}
    max_f1 = 0
    for i in range(1,20):
        p = 0.1
        for choice in [0,1,2,3]:
            if choice == 3:
                for j in range(30):
                    y_predict = kNN(Xval, Xtrain, ytrain, i, choice, p)
                    f1 = f1_score(yval, y_predict)
                    if f1 > max_f1:
                        max_f1 = f1
                        parameter['k'] = i
                        parameter['p'] = p
                        parameter['score'] = max_f1
                        parameter['choice'] = choice
                    p += 0.1
            else:
                y_predict = kNN(Xval, Xtrain, ytrain, i, choice)
                f1 = f1_score(yval, y_predict)
                if f1 > max_f1:
                    max_f1 = f1
                    parameter['k'] = i
                    parameter['p'] = -1
                    parameter['score'] = max_f1
                    parameter['choice'] = choice

    return parameter

# parameterTuning(X_val,y_val, X_train, y_train)
    
    

In [15]:
# Using kNN Base Accuracy 0.8125000000000001
y_predict = kNN(X_val, X_train, y_train, 5, 3, 2)
f1_score(y_val, y_predict)

0.603174603174603

In [16]:
# Using Sklearn Base Accuracy
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)
y_predict = neigh.predict(X_val)
f1_score(y_val, y_predict)

0.603174603174603

In [17]:
# After Parameter Tuning
y_predict = kNN(X_val, X_train, y_train, 11, 3, 0.23)
f1_score(y_val, y_predict)

0.8307692307692307

In [18]:
# On test usig Sklearn
f1_score(y_test,neigh.predict(X_test))

0.7605633802816901

In [19]:
# On test kNN Parameter tuned
f1_score(y_test,kNN(X_test, X_train, y_train, 11, 3, 0.23))

0.8285714285714285

In [20]:
def label_to_onehot(tempdf, name):
    cat_temp = pd.get_dummies(tempdf[name], prefix = name)
    tempdf = pd.concat([tempdf,cat_temp], axis = 1).drop(labels = name, axis = 1)
    return tempdf


In [21]:
def standardize(X):  
    scaler = pre.StandardScaler().fit(X)
    X = scaler.transform(X)
    return scaler,X


#range 0 - 1 (sparse data)
def min_max(X):
    scaler = pre.MinMaxScaler().fit(X)
    X = scaler.transform(X)
    return scaler,X

# range -1  - 1 (sparse data)
def max_abs(X): 
    scaler = pre.MaxAbsScaler().fit(X)
    X = scaler.transform(X)
    return scaler,X

#for ouliers
def robust(X):
    scaler = pre.RobustScaler().fit(X)
    X = scaler.transform(X)
    return scaler,X

def normalize(X):
    scaler = pre.Normalizer().fit(X)
    X = scaler.transform(X)
    return scaler,X

def quantile(X):
    scaler = pre.QuantileTransformer().fit(X)
    X = scaler.transform(X)
    return scaler,X


In [22]:
n = 9
table = list(itertools.product([False,True], repeat=n))
table

[(False, False, False, False, False, False, False, False, False),
 (False, False, False, False, False, False, False, False, True),
 (False, False, False, False, False, False, False, True, False),
 (False, False, False, False, False, False, False, True, True),
 (False, False, False, False, False, False, True, False, False),
 (False, False, False, False, False, False, True, False, True),
 (False, False, False, False, False, False, True, True, False),
 (False, False, False, False, False, False, True, True, True),
 (False, False, False, False, False, True, False, False, False),
 (False, False, False, False, False, True, False, False, True),
 (False, False, False, False, False, True, False, True, False),
 (False, False, False, False, False, True, False, True, True),
 (False, False, False, False, False, True, True, False, False),
 (False, False, False, False, False, True, True, False, True),
 (False, False, False, False, False, True, True, True, False),
 (False, False, False, False, False, T

In [23]:
def predictScaler(val_df, train_df, scaler = None, usekNN = True):
    parameterDict = {}
    if scaler is not None:
        val_df.loc[:,['age','trestbps','chol','oldpeak','thalach']] = scaler.transform(val_df.loc[:,['age','trestbps','chol','oldpeak','thalach']])
    X_train, y_train = df_to_numpy(train_df)
    X_val, y_val = df_to_numpy(val_df)
    if usekNN:
        parameterDict = parameterTuning(X_val, y_val, X_train, y_train)
        return parameterDict
    neigh.fit(X_train, y_train)
    y_predict = neigh.predict(X_val)
    s = f1_score(y_val, y_predict)
    parameterDict['k'] = 5
    parameterDict['choice'] = 3
    parameterDict['score'] = s
    parameterDict['p'] = 2
    return parameterDict
    
    
def generatePrediction(bool_array, df):
    tempDF = df
    global log_df
    logDict = {"sex": bool_array[0],
               "cp": bool_array[1],
               "fbs": bool_array[2],
               "restecg": bool_array[3],
               "exang": bool_array[4],
               "slope": bool_array[5],
               "ca": bool_array[6],
               "thal": bool_array[7],
               "scaling": bool_array[8]}
    if bool_array[0]:
        tempDF = label_to_onehot(tempDF, 'sex')
    if bool_array[1]:
        tempDF = label_to_onehot(tempDF, 'cp')
    if bool_array[2]:
        tempDF = label_to_onehot(tempDF, 'fbs')
    if bool_array[3]:
        tempDF = label_to_onehot(tempDF, 'restecg')
    if bool_array[4]:
        tempDF = label_to_onehot(tempDF, 'exang')
    if bool_array[5]:
        tempDF = label_to_onehot(tempDF, 'slope')
    if bool_array[6]:
        tempDF = label_to_onehot(tempDF, 'ca')
    if bool_array[7]:
        tempDF = label_to_onehot(tempDF, 'thal')
        
    train_df, test_df = train_test_split(tempDF, test_size = 0.2, random_state = 1692, shuffle = True)
    train_df, val_df = train_test_split(train_df, test_size = 0.25, random_state = 2098, shuffle = True)
    
    if bool_array[8]:
        for i in range(5):
            train_df_scale = train_df.copy()
            val_df_scale = val_df.copy()
            if i==0:
                scaler, train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  standardize(train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']])
                paraDict = predictScaler(val_df_scale, train_df_scale, scaler)
                logDict['scaler'] = 'standard'
                logDict['score'] = paraDict['score']
                logDict['k'] = paraDict['k']
                logDict['p'] = paraDict['p']
                logDict['choice'] = paraDict['choice']
                
                log_df = log_df.append(logDict, ignore_index=True)
            elif i == 1:
                scaler, train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  min_max(train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']])
                paraDict = predictScaler(val_df_scale, train_df_scale, scaler)
                logDict['scaler'] = 'min_max'
                logDict['score'] = paraDict['score']
                logDict['k'] = paraDict['k']
                logDict['p'] = paraDict['p']
                logDict['choice'] = paraDict['choice']
                log_df = log_df.append(logDict, ignore_index=True)
                
            elif i==2:
                scaler, train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  max_abs(train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']])
                paraDict = predictScaler(val_df_scale, train_df_scale, scaler)
                logDict['scaler'] = 'max_abs'
                logDict['score'] = paraDict['score']
                logDict['k'] = paraDict['k']
                logDict['p'] = paraDict['p']
                logDict['choice'] = paraDict['choice']
                log_df = log_df.append(logDict, ignore_index=True)
            elif i == 3:
                scaler, train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  robust(train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']])
                paraDict = predictScaler(val_df_scale, train_df_scale, scaler)
                logDict['scaler'] = 'robust'
                logDict['score'] = paraDict['score']
                logDict['k'] = paraDict['k']
                logDict['p'] = paraDict['p']
                logDict['choice'] = paraDict['choice']
                log_df = log_df.append(logDict, ignore_index=True)
            elif i==4:
                scaler, train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  normalize(train_df_scale.loc[:,['age','trestbps','chol','oldpeak','thalach']])
                paraDict = predictScaler(val_df_scale, train_df_scale, scaler)
                logDict['scaler'] = 'normalize'
                logDict['score'] = paraDict['score']
                logDict['k'] = paraDict['k']
                logDict['p'] = paraDict['p']
                logDict['choice'] = paraDict['choice']
                log_df = log_df.append(logDict, ignore_index=True)
    else:
        paraDict = predictScaler(val_df, train_df)
        logDict['scaler'] = 'None'
        logDict['score'] = paraDict['score']
        logDict['k'] = paraDict['k']
        logDict['p'] = paraDict['p']
        logDict['choice'] = paraDict['choice']
        log_df = log_df.append(logDict, ignore_index=True)
    
    
        
        
        
                
# log_df = pd.DataFrame()
# generatePrediction((False,True,True,False,True,True,False,True,True), df)
# log_df

        
        

In [None]:
log_df = pd.DataFrame()
count = 1
for value in table:
    print(count)
    generatePrediction(value, df)
    if len(log_df) > 10*count:
        log_df.to_csv('LogMetric_k_p_1.csv')
        count+=1

In [None]:
log_df.sort_values(by='score', ascending = False)

In [68]:
tempDF = label_to_onehot(df, 'cp')
tempDF = label_to_onehot(tempDF, 'fbs')
# tempDF = label_to_onehot(tempDF, 'sex')
tempDF = label_to_onehot(tempDF, 'exang')
tempDF = label_to_onehot(tempDF, 'slope')
tempDF = label_to_onehot(tempDF, 'thal')

tempDF

Unnamed: 0,age,sex,trestbps,chol,restecg,thalach,oldpeak,ca,target,cp_0,...,fbs_1,exang_0,exang_1,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,63,1,145,233,0,150,2.3,0,1,0,...,1,1,0,1,0,0,0,1,0,0
1,37,1,130,250,1,187,3.5,0,1,0,...,0,1,0,1,0,0,0,0,1,0
2,41,0,130,204,0,172,1.4,0,1,0,...,0,1,0,0,0,1,0,0,1,0
3,56,1,120,236,1,178,0.8,0,1,0,...,0,1,0,0,0,1,0,0,1,0
4,57,0,120,354,1,163,0.6,0,1,1,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,1,123,0.2,0,0,1,...,0,0,1,0,1,0,0,0,0,1
299,45,1,110,264,1,132,1.2,0,0,0,...,0,1,0,0,1,0,0,0,0,1
300,68,1,144,193,1,141,3.4,2,0,1,...,1,1,0,0,1,0,0,0,0,1
301,57,1,130,131,1,115,1.2,1,0,1,...,0,0,1,0,1,0,0,0,0,1


In [69]:
train_df, test_df = train_test_split(tempDF, test_size = 0.2, random_state = 1692, shuffle = True)
train_df, val_df = train_test_split(train_df, test_size = 0.25, random_state = 2098, shuffle = True)

In [70]:
scaler, train_df.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  max_abs(train_df.loc[:,['age','trestbps','chol','oldpeak','thalach']])
val_df.loc[:,['age','trestbps','chol','oldpeak','thalach']] =  scaler.transform(val_df.loc[:,['age','trestbps','chol','oldpeak','thalach']])

In [71]:
X_train, y_train = df_to_numpy(train_df)
X_val, y_val = df_to_numpy(val_df)
X_test, y_test = df_to_numpy(test_df)

In [72]:
y_predict = kNN(X_val, X_train, y_train, 11, 3, 1.1)
f1_score(y_val, y_predict)

0.9180327868852458

In [73]:
y_predict = kNN(X_test, X_train, y_train, 11, 3, 1.1)
f1_score(y_test, y_predict)

0.8695652173913043