In [21]:
#   import necessary dependencies
import numpy as np
import pandas as pd
import numpy.matlib
import matplotlib.pyplot as plt
import arff
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

print("dependencies imported")

dependencies imported


In [22]:
#   load the data
filename = '../chronic_kidney_disease_full.arff'
# Open through a context manager so the file handle is closed as soon as
# parsing finishes (the bare open() call left it open for the kernel's lifetime).
with open(filename) as arff_file:
    data = arff.load(arff_file)
df = pd.DataFrame(data['data'])

# The last column of df is the class label ("ckd" / "notckd"); every other
# column is a feature. Both sides are split positionally relative to the last
# column so they always agree, whatever the column count is.
target = df.iloc[:, -1]
df = df.iloc[:, :-1]

# Encode the label as 1 (ckd) / 0 (notckd).
target = target.replace("ckd", 1)
target = target.replace("notckd", 0)
# Cast explicitly: relying on replace() to silently downcast an object column
# to integers is deprecated in pandas, and an object-dtype label array is not
# accepted by the classifiers. Any unexpected label string fails loudly here.
target = target.astype(int).to_numpy()

# Encode the string-valued features as numbers. The pairs are applied one at
# a time, in this order, exactly like a chain of df.replace(old, new) calls.
feature_replacements = [
    ("yes", 1),
    ("no", 0),
    ("present", 1),
    ("notpresent", 0),
    ("normal", 1),
    ("abnormal", 0),
    ("good", 1),
    ("poor", 0),
    ("1.005", 1.005),
    ("1.010", 1.01),
    ("1.015", 1.015),
    ("1.020", 1.020),
    ("1.025", 1.025),
    ("0", 0),
    ("1", 1),
    ("2", 2),
    ("3", 3),
    ("4", 4),
    ("5", 5),
    ("None", -1),
]
for old_value, new_value in feature_replacements:
    df = df.replace(old_value, new_value)
# Missing entries are filled with the sentinel -1.
df = df.fillna(-1)

# normalize data
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split, and the -1 sentinels take part in the mean/std.
scaler = StandardScaler()
df = scaler.fit_transform(df)  # from here on df is a NumPy array, not a DataFrame
print("Data mean should be near 0:", df.mean())
print("Data standard deviation should be 1:", df.std())
print("-----")
dataDf = pd.DataFrame(df)
targetDf = pd.DataFrame(target)
# Plain arrays consumed by the modelling cells.
data = np.array(df)
target = np.array(target)

Data mean should be near 0: 1.258252761241844e-17
Data standard deviation should be 1: 1.0
-----


In [23]:
#   function to calculate f-measure
def f_measure(pred, y_test):
    """Return the F-measure (F1 score) of binary predictions.

    Class 1 is treated as the positive class and class 0 as the negative
    class; pairs containing any other value are ignored.

    Parameters
    ----------
    pred : sequence
        Predicted labels (0 or 1).
    y_test : sequence
        True labels (0 or 1), same length as ``pred``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        there are no true positives (precision and/or recall would otherwise
        be undefined and the division would fail).

    Raises
    ------
    ValueError
        If ``pred`` and ``y_test`` have different lengths.
    """
    if len(pred) != len(y_test):
        raise ValueError(
            "pred and y_test must have the same length, got {} and {}".format(
                len(pred), len(y_test)
            )
        )

    TP = 0
    FP = 0
    FN = 0
    # zip() pairs the elements positionally, so this also works for inputs
    # whose [] operator is label-based rather than positional.
    for currPred, currTest in zip(pred, y_test):
        if currPred == 1 and currTest == 1:
            TP += 1
        elif currPred == 1 and currTest == 0:
            FP += 1
        elif currPred == 0 and currTest == 1:
            FN += 1
        # True negatives do not enter the F-measure, so they are not counted.

    # With no true positives the score is 0 by convention; returning early
    # also guarantees that every denominator below is non-zero.
    if TP == 0:
        return 0.0

    pre = TP / (TP + FP)
    rec = TP / (TP + FN)
    return (2 * pre * rec) / (pre + rec)


In [24]:
#   (a) Support Vector Machine w/ linear kernel and default parameters

# Hold out 20% of the samples for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=5
)
for split_name, split_values in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} : {split_values.shape}")

# Standardise the features, then fit a linear-kernel SVM on the training split.
clf = make_pipeline(StandardScaler(), SVC(kernel='linear'))
clf.fit(X_train, y_train)

# Predict the held-out samples and score them.
pred = clf.predict(X_test)
print(f"F-Measure: {f_measure(pred, y_test)}")

X_train : (320, 24)
y_train : (320,)
X_test : (80, 24)
y_test : (80,)
F-Measure: 0.9885057471264368


In [25]:
#   (b) Support Vector Machine w/ rbf kernel and default parameters

# Hold out 20% of the samples for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=5
)
for split_name, split_values in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} : {split_values.shape}")

# Standardise the features, then fit an RBF-kernel SVM on the training split.
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
clf.fit(X_train, y_train)

# Predict the held-out samples and score them.
pred = clf.predict(X_test)
print(f"F-Measure: {f_measure(pred, y_test)}")

X_train : (320, 24)
y_train : (320,)
X_test : (80, 24)
y_test : (80,)
F-Measure: 0.9885057471264368


In [26]:
#   (c) Random forest with default parameters (seeded for reproducibility)

#split data
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=.2, random_state=5)
print("X_train : " + str(X_train.shape))
print("y_train : " + str(y_train.shape))
print("X_test : " + str(X_test.shape))
print("y_test : " + str(y_test.shape))

# A random forest is a randomised estimator: without a fixed random_state the
# fitted trees, and therefore the reported F-measure, can differ on every run.
# The seed is the same value used for the train/test split above.
clf = RandomForestClassifier(random_state=5)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

#calculate f-measure
print("F-Measure: {}".format(f_measure(pred, y_test)))

X_train : (320, 24)
y_train : (320,)
X_test : (80, 24)
y_test : (80,)
F-Measure: 1.0
