In [8]:
import numpy as np
from scipy import stats

def clean_data(line):
    """Split one raw text record into its comma-separated fields.

    Every '(' , ')' and space character is removed from ``line``, the
    remaining leading/trailing whitespace (e.g. the newline) is stripped,
    and the result is split on commas.

    Args:
        line: A single line of text, e.g. ``"(1.0, 2.0, W)\\n"``.

    Returns:
        A list of field strings. An empty or whitespace-only line yields
        ``['']`` because ``''.split(',')`` returns a one-element list.
    """
    # One translation pass deletes all three unwanted characters at once.
    drop_table = str.maketrans('', '', '() ')
    compact = line.translate(drop_table)
    return compact.strip().split(',')

def fetch_data(filename):
    """Read a text data file and parse every record with ``clean_data``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of field strings produced by ``clean_data`` for that line.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is
    # needed. Iterating the handle streams the file line by line.
    with open(filename, 'r') as f:
        parsed_rows = (clean_data(line) for line in f)
        # A blank (or bracket-only) line parses to [''], which would make the
        # row list ragged and break conversion to a 2-D array, so skip it.
        return [row for row in parsed_rows if row != ['']]

def readFile(dataset_path):
    """Load a data file and return its parsed records as a NumPy array.

    Args:
        dataset_path: Path of the text file, passed straight to
            ``fetch_data``.

    Returns:
        ``np.array`` built from the list of field lists that ``fetch_data``
        returns for the file.
    """
    return np.array(fetch_data(dataset_path))
    
class GaussianNBClassifier:
    """Gaussian naive Bayes classifier with maximum-likelihood parameters.

    Each feature is modelled, per class, by an independent univariate normal
    distribution. ``fit`` estimates the class priors and the per-class
    feature means and (biased, divide-by-N) variances; ``predict`` returns
    the class with the largest log posterior score.

    Attributes set by ``fit``:
        classes: Sorted array of the distinct labels seen in ``y``.
        unique_classes: Same array as ``classes`` (kept as an alias).
        num_points, num_features: Shape of the training matrix.
        num_classes: Number of distinct labels.
        prob_y: Prior probability of each class, ordered like ``classes``.
        mean_param, var_param: Arrays of shape (num_classes, num_features).
    """

    def __init__(self, min_var=1e-9):
        """Create an unfitted classifier.

        Args:
            min_var: Lower bound applied to every estimated variance. A
                feature that is constant within a class has variance 0,
                which would otherwise turn every score into NaN.
        """
        self.min_var = min_var
        self.prob_y = None
        self.classes = None
        self.unique_classes = None
        self.num_points = None
        self.num_features = None
        self.num_classes = None
        self.mean_param = None
        self.var_param = None

    def fit(self, X, y):
        """Estimate priors, means and variances from the training data.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` have a different number of rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must contain the same number of samples: "
                f"{X.shape[0]} != {y.shape[0]}"
            )
        # np.unique returns the labels sorted, so the class order (and hence
        # argmax tie-breaking in predict) is the same on every run, unlike
        # iterating a set of strings.
        self.classes = np.unique(y)
        self.unique_classes = self.classes
        self.num_classes = len(self.classes)
        self.num_points, self.num_features = X.shape
        self.prob_y = np.zeros(self.num_classes)
        self.mean_param = np.zeros((self.num_classes, self.num_features))
        self.var_param = np.zeros((self.num_classes, self.num_features))
        self.mle(X, y)

    def mle(self, X, y):
        """Fill ``prob_y``, ``mean_param`` and ``var_param`` class by class.

        Expects ``classes``, ``num_points`` and the parameter arrays to have
        been initialised already (``fit`` does this before calling).
        """
        X = np.asarray(X, dtype=float)
        # A plain list compared with ``==`` yields a single bool, not a mask.
        y = np.asarray(y)
        for idx, cls in enumerate(self.classes):
            members = X[y == cls]
            cls_count = members.shape[0]
            self.prob_y[idx] = cls_count / self.num_points
            self.mean_param[idx, :] = np.sum(members, axis=0) / cls_count
            sq_dev = np.power(members - self.mean_param[idx], 2)
            self.var_param[idx, :] = np.sum(sq_dev, axis=0) / cls_count
        # Floor the variances so constant features do not divide by zero.
        np.maximum(self.var_param, self.min_var, out=self.var_param)

    def gaussian_dist(self, X, mean, var):
        """Return the normal density N(mean, var) evaluated at ``X``.

        All arguments broadcast element-wise; ``var`` is a variance, not a
        standard deviation.
        """
        z_values = np.divide(X - mean, np.sqrt(var))
        dist = (1/np.sqrt(2*np.pi*var)) * np.exp(-0.5 * (z_values**2))
        return dist

    def get_log_bayes_prob(self, X):
        """Return the unnormalised log posterior of every class per sample.

        Args:
            X: 2-D array-like of shape (num_samples, num_features).

        Returns:
            Array of shape (num_samples, num_classes) holding
            ``log P(y) + sum_f log N(x_f; mean, var)``.
        """
        X = np.asarray(X, dtype=float)
        # Broadcast to (samples, classes, features).
        diff = X[:, np.newaxis, :] - self.mean_param[np.newaxis, :, :]
        # Evaluate the log-density directly: taking log() of the density
        # underflows to log(0) = -inf for points far from every class mean.
        log_pdf = -0.5 * (
            np.log(2 * np.pi * self.var_param) + diff**2 / self.var_param
        )
        return np.log(self.prob_y) + np.sum(log_pdf, axis=2)

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        cond_prob = self.get_log_bayes_prob(X)
        max_index = np.argmax(cond_prob, axis=1)
        pred = self.classes[max_index]
        return pred

# Input file locations. Only ``large_120_data`` is read in this section.
training_data = '/content/1a-training.txt'
test_data = '/content/1a-test.txt'
large_120_data = '/content/1c-data.txt'

# Every column except the last is parsed as a float feature; the last column
# is kept as the string class label.
large_np = readFile(large_120_data)
X_large_np = large_np[:, :-1].astype('float')
Y_large_np = large_np[:, -1].astype('str')

# Drop the final feature column (presumably the age variable, going by the
# variable name and the message printed below).
X_woage = X_large_np[:, :-1]

# Leave-one-out evaluation: each row is held out once, a fresh Gaussian
# naive Bayes model is fitted on all remaining rows, and the held-out row is
# classified.
crct = 0
total = 0
for i in range(X_woage.shape[0]):
    tst_list = [i]
    trn_list = [row for row in range(X_woage.shape[0]) if row != i]

    loo_X_train = X_woage[trn_list]
    loo_Y_train = Y_large_np[trn_list]

    loo_X_test = X_woage[tst_list]
    loo_Y_test = Y_large_np[tst_list]

    # Fit the Gaussian naive Bayes classifier on the training split.
    gnb_classifier = GaussianNBClassifier()
    gnb_classifier.fit(loo_X_train, loo_Y_train)
    preds = gnb_classifier.predict(loo_X_test)

    total += 1
    if preds[0] == loo_Y_test[0]:
        crct += 1

# Error rate is the complement of the accuracy, expressed in percent.
err_percentage = 100 - (100 * crct/total)

print("Error percentage for gaussian naive bayes without age variable is: ", err_percentage)

Error percentage for gaussian naive bayes without age variable is:  29.16666666666667
