<a href="https://colab.research.google.com/github/beny2000/HeartDiseaseClassifier/blob/main/FINAL_RBFNN_A%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#connect to anvil 
!pip install anvil-uplink
import anvil.server
anvil.server.connect("2ZZFEIO7RSWJ2NSXVOIJU7NK-PUMVCYIB7OL5ZPSL")


import sys
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import keras
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn import model_selection
from sklearn import preprocessing
from importlib import reload
reload(sns)

#load data set 
# heart.csv is read from the current working directory.
cleveland = pd.read_csv('heart.csv')
data = cleveland  # alias, not a copy: renaming columns below also renames them on `cleveland`
data.columns = ['Age','Sex','CP', 'BP','Cholesterol','BS', 'RestECG_Results','MaxHR', 'exind_Angina', 'ST_Depression','ST_Slope','MajorVessels','Thal_Test',"HD"]

#separating into x and y data sets
# `columns=` is used instead of a positional axis argument: pandas 2.0 made
# every argument of DataFrame.drop except `labels` keyword-only.
preX = np.array(data.drop(columns=['HD'])) #all other data 
y = np.array(data['HD']) 

#preprocess data
# preprocessing.normalize rescales each ROW (one patient) to unit L2 norm by
# default; get_prediction applies the same call to the user's row.
X = preprocessing.normalize(preX)

#create test and train datasets 
# Stratified 80/20 split with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
stratify=y, random_state=0, test_size = 0.2)


#pythagorean calculation
def get_distance(x1, x2):
    """Return the Euclidean distance between two coordinate sequences.

    Only the first ``len(x1)`` coordinates are compared: extra entries in
    ``x2`` are ignored, and an ``x2`` shorter than ``x1`` raises IndexError.
    """
    squared_total = 0
    for idx, coord in enumerate(x1):
        # accumulate the squared per-axis difference
        squared_total += (coord - x2[idx]) ** 2
    return np.sqrt(squared_total)

#modified kmeans, returns the cluster centroids and the standard deviations 
def kmeans(X, k, max_iters):
    """Cluster the rows of ``X`` and return the centroids and per-cluster spreads.

    Parameters
    ----------
    X : 2-D array-like, one data point per row.
    k : number of initial centroids, drawn without replacement from ``X``.
    max_iters : maximum number of assignment/update rounds (must be >= 1).

    Returns
    -------
    (centroids, std_list)
        ``centroids`` is a 2-D ndarray with one row per non-empty cluster.
        Clusters that end up empty are dropped, so there may be fewer than
        ``k`` rows. ``std_list`` is a list holding, for each returned cluster,
        ``np.std`` taken over every coordinate value of every point in it.

    Raises
    ------
    ValueError
        If ``max_iters`` is smaller than 1, or (from numpy) if ``k`` exceeds
        the number of rows in ``X``.
    """
    if max_iters < 1:
        # Without at least one round there are no clusters to report.
        raise ValueError("max_iters must be at least 1")

    X = np.asarray(X)

    # Fixed seed so the initial centroids, and therefore the result, are
    # identical on every call. Note this reseeds numpy's global generator.
    np.random.seed(43)

    centroids = X[np.random.choice(range(len(X)), k, replace=False)] #centroids 
    converged = False

    current_iter = 0

    while (not converged) and (current_iter < max_iters):

        # Distance from every point to every centroid in one broadcasted step:
        # result has shape (n_points, n_centroids).
        distances = np.linalg.norm(
            X[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2)
        nearest = np.argmin(distances, axis=1)

        # Group the points by their nearest centroid and drop empty groups.
        cluster_list = [X[nearest == j] for j in range(len(centroids))]
        cluster_list = [members for members in cluster_list if len(members) > 0]

        prev_centroids = centroids
        centroids = np.array([np.mean(members, axis=0) for members in cluster_list])

        # Converged only when every centroid is unchanged. Comparing the arrays
        # (rather than the sum of all coordinates) avoids stopping early when
        # movements happen to cancel out, and is False when a cluster was
        # dropped and the shapes no longer match.
        converged = np.array_equal(prev_centroids, centroids)

        current_iter += 1

    return centroids, [np.std(members) for members in cluster_list]


##RBF class
class RBF:
    """Radial-basis-function network classifier.

    Hidden units are Gaussian bumps centred on k-means centroids; the output
    weights are found by linear least squares against one-hot class targets.
    """

    def __init__(self, U, X, y, tX, ty, num_of_classes,
                 k, std_from_clusters):
        """Store the data and hyper-parameters; no training happens here.

        U   : rows to score after training (the "user" data).
        X, y   : training features and integer class labels.
        tX, ty : test features and labels used to compute accuracy.
        num_of_classes : number of output classes (width of the one-hot targets).
        k   : number of centroids requested from kmeans.
        std_from_clusters : True to use the per-cluster spreads returned by
            kmeans as the Gaussian widths, False to use one shared width
            derived from the largest centroid-to-centroid distance.
        """
        self.X = X
        self.y = y

        self.tX = tX
        self.ty = ty

        self.number_of_classes = num_of_classes
        self.k = k
        self.std_from_clusters = std_from_clusters
        self.U = U

    #converting array to a one-hot encoded 2D matrix (i.e. 2 = 0 0 1 0)
    def convert_to_one_hot(self, x, num_of_classes):
        """Return a (len(x), num_of_classes) float matrix with a 1 in column x[i] of row i."""
        labels = np.asarray(x).astype(int)
        arr = np.zeros((len(labels), num_of_classes))
        arr[np.arange(len(labels)), labels] = 1
        return arr

    #calculating exponential function ## using beta = 1/std^2
    def rbf(self, x, c, s):
        """Gaussian activation exp(-||x - c||^2 / (2 * s^2)) of point x for centre c and width s."""
        squared_distance = np.sum(
            (np.asarray(x, dtype=float) - np.asarray(c, dtype=float)) ** 2)
        if s == 0:
            # Zero width: use the limiting value instead of dividing by zero
            # (1 exactly at the centre, 0 everywhere else).
            return 1.0 if squared_distance == 0 else 0.0
        return np.exp(-squared_distance / (2 * (s ** 2)))

    # creates the matrix of RBF activations
    def rbf_list(self, X, centroids, std_list):
        """Return the activation matrix: one row per point in X, one column per (centroid, std) pair."""
        return np.array([
            [self.rbf(x, c, s) for (c, s) in zip(centroids, std_list)]
            for x in X
        ])

    def fit(self):
        """Train on (X, y), evaluate on (tX, ty) and score U.

        Returns
        -------
        (pred_y, Accuracy)
            ``pred_y`` holds the raw network outputs for each row of ``U``
            (one column per class, not arg-maxed); ``Accuracy`` is the fraction
            of test rows whose arg-max prediction equals ``ty``.
        """
        #finding the centroids and the standard deviation of datapoints to centroids
        self.centroids, self.std_list = kmeans(self.X, self.k, max_iters=100000)
        self.cent = self.centroids

        #enter if std_from_clusters is false (other method for finding beta, this was determined to be less accurate than the 1/std^2 method)
        if not self.std_from_clusters: ##Calculate betas using beta = sqrt(2*k)/dmax
            dMax = np.max([get_distance(c1, c2) for c1 in self.centroids for c2 in self.centroids]) #dmax = max distance between centroids
            # NOTE(review): the commonly cited shared-width heuristic is
            # sigma = dMax / sqrt(2k); this value is the reciprocal form yet is
            # passed to rbf() as a width. Left unchanged; confirm it is intended.
            # One entry per centroid actually returned (kmeans may drop empty clusters).
            self.std_list = np.repeat(np.sqrt(2 * self.k)/dMax, len(self.centroids))

        RBF_X = self.rbf_list(self.X, self.centroids, self.std_list)

        #optimizing for weights using least squares regression (normal equations with a pseudo-inverse)
        targets = self.convert_to_one_hot(self.y, self.number_of_classes)
        self.w = np.linalg.pinv(RBF_X.T @ RBF_X) @ RBF_X.T @ targets

        #test and calculate for accuracy
        RBF_list_tst = self.rbf_list(self.tX, self.centroids, self.std_list)
        self.pred_ty = RBF_list_tst @ self.w
        self.pred_ty = np.array([np.argmax(row) for row in self.pred_ty])
        diff = self.pred_ty - self.ty
        Accuracy = int(np.count_nonzero(diff == 0)) / len(diff)

        #run user data
        RBF_U = self.rbf_list(self.U, self.centroids, self.std_list)
        self.pred_y = RBF_U @ self.w
        return self.pred_y, Accuracy


Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment (dev)" as SERVER


In [None]:
##callable function 
# Trained classifier and its test accuracy, filled in on the first call.
# Training is deterministic (kmeans reseeds numpy with a fixed value and the
# train/test split is fixed), so retraining on every request would reproduce
# the same model at full cost.
_RBF_CACHE = {}

@anvil.server.callable
def get_prediction(Age, Sex, CP, BP, Cholesterol, BS,RestECG_Results,MaxHR,exind_Angina,ST_Depression,ST_Slope,MajorVessels,Thal_Test):
  """Score one patient record with the RBF network.

  The thirteen arguments are the feature values in the same order as the
  training columns. Returns ``(yn, Acc)``: ``yn`` is the index of the class
  with the largest network output, ``Acc`` is the model's test-set accuracy as
  a percentage rounded to two decimals.
  """
  preUser = [Age, Sex, CP, BP, Cholesterol, BS,RestECG_Results,MaxHR,exind_Angina,ST_Depression,ST_Slope,MajorVessels,Thal_Test]
  #normalize user data 
  User = preprocessing.normalize([preUser])
  #run RBF model 
  if "model" not in _RBF_CACHE:
    # First request: train, then keep the fitted model for later requests.
    RBF_CLASSIFIER = RBF(User,X_train, y_train, X_test, y_test, num_of_classes=max(y_train)+1,
                        k=28, std_from_clusters=True)
    Output = RBF_CLASSIFIER.fit()
    _RBF_CACHE["model"] = RBF_CLASSIFIER
    _RBF_CACHE["accuracy"] = Output[1]
  else:
    # Later requests: reuse the fitted centroids, widths and weights.
    model = _RBF_CACHE["model"]
    scores = model.rbf_list(User, model.centroids, model.std_list) @ model.w
    Output = (scores, _RBF_CACHE["accuracy"])
  print(Output)
  Acc = round(float(Output[1])*100,2)
  # Output[0] has a single row (one user record); pick its highest-scoring class.
  # Converted to a plain int so the value returned over the uplink is not a numpy scalar.
  yn = int(np.argmax(Output[0][0]))
  return yn, Acc




In [None]:
# Keep this cell running so the uplink connection stays open and the
# @anvil.server.callable functions defined above can be invoked remotely.
# This call does not return; interrupt the kernel to stop serving.
anvil.server.wait_forever()

(array([[0.70738059, 0.2978751 ]]), 0.8852459016393442)
(array([[0.07287669, 0.92504605]]), 0.8852459016393442)
(array([[0.14246154, 0.85509569]]), 0.8852459016393442)
(array([[ 1.21303506, -0.21274632]]), 0.8852459016393442)


Overall Findings:

For severity-level classification, the best-performing beta was option 2, with k = 71.

For binary heart-disease classification, the best-performing beta was option 1, with k = 37.
![image.png]()
