In [1]:
import numpy as np
from scipy import stats


KeyboardInterrupt



In [None]:
def clean_data(line):
    """Split one raw text record into its comma-separated fields.

    Every '(' , ')' and space character is removed, remaining surrounding
    whitespace (e.g. the trailing newline) is stripped, and the result is
    split on ','.

    Returns:
        list[str]: the field strings of the record.
    """
    # Deleting the three characters in one pass is equivalent to chaining
    # three str.replace(..., '') calls.
    drop_table = str.maketrans('', '', '() ')
    stripped = line.translate(drop_table).strip()
    return stripped.split(',')

In [None]:
def fetch_data(filename):
    """Read a text dataset and return its records as lists of field strings.

    Each non-blank line of the file is passed through ``clean_data``.
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped; otherwise they would become ``['']`` rows and produce a
    ragged array when the result is converted with ``np.array``.

    Args:
        filename: path of the text file to read.

    Returns:
        list[list[str]]: one list of fields per non-blank line.

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
    """
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open(filename, 'r') as f:
        return [clean_data(line) for line in f if line.strip()]

In [None]:
def readFile(dataset_path):
    """Load the dataset at ``dataset_path`` into a NumPy array.

    The records returned by ``fetch_data`` are handed to ``np.array``
    unchanged; no numeric conversion is done here.
    """
    return np.array(fetch_data(dataset_path))

In [None]:
def load_data(file_path):
    """Load a header-less, comma-separated file and split features from target.

    Args:
        file_path: path of the CSV file; every row must have at least two
            columns, the last one being the target.

    Returns:
        tuple: ``(input_data, output_data)`` where ``input_data`` is a NumPy
        array holding all columns except the last and ``output_data`` is the
        pandas Series of the last column.
    """
    # pandas is not imported at the top of this notebook, so import it here;
    # without this the function raised NameError on ``pd``.
    import pandas as pd

    # read_csv opens and closes the file itself when given a path.
    df = pd.read_csv(file_path, sep=',', header=None)
    input_data = df.iloc[:, :-1].to_numpy()
    output_data = df.iloc[:, -1]
    return input_data, output_data

In [None]:
class StandardScaling:
    """Column-wise standardization: subtract the mean, divide by the std.

    The statistics are computed along axis 0 by ``fit_transform`` and reused
    by ``transform``.

    Attributes (set by ``fit_transform``):
        mean: per-column mean of the fitted data.
        std:  per-column population standard deviation (``np.std`` default).
    """

    def __init__(self):
        self.std = None
        self.mean = None

    def fit_transform(self, data):
        """Compute mean/std from ``data`` and return the standardized data."""
        self.std = np.std(data, axis=0)
        self.mean = np.mean(data, axis=0)
        return self.transform(data)

    def transform(self, data):
        """Standardize ``data`` with the statistics stored by ``fit_transform``."""
        # A constant column has std == 0; dividing by it would yield NaN/inf.
        # Use a divisor of 1 for such columns so they map to 0 after centering.
        # ``self.std`` itself keeps the true value.
        safe_std = np.where(self.std == 0, 1.0, self.std)
        transformed_data = np.subtract(data, self.mean)
        return np.divide(transformed_data, safe_std)

# Question 1-b
## KNN Classifier Code

In [None]:
class KNearestNeighborClassifier:
    """Brute-force k-nearest-neighbor classifier with majority voting.

    ``fit`` stores the training set; ``predict`` computes the full
    (queries x training points) distance matrix, takes the ``k`` closest
    training points per query and returns the most frequent label among
    them. Ties are resolved in favour of the smallest label in sort order.

    Args:
        logging: when True, ``predict`` prints the distance matrix, the
            selected neighbor indices and the predictions.
    """

    # Order p of the p-norm used for each supported metric name.
    # 'euclidean' is accepted as an alias of the original 'cartesian' name.
    _NORM_ORDERS = {'manhattan': 1, 'cartesian': 2, 'euclidean': 2, 'minkowski': 3}

    def __init__(self, logging = False):
        self.classes = None
        self.num_points = None
        self.num_features = None
        self.num_classes = None
        self.X_train = None
        self.y_train = None
        self.logging = logging

    def fit(self, X, y):
        """Store the training features ``X`` (2-D) and labels ``y`` (1-D)."""
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        # np.unique gives a deterministic (sorted) class order, unlike set().
        self.classes = np.unique(y)
        self.num_classes = len(self.classes)
        self.num_points, self.num_features = X.shape
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X, distance_metric):
        """Return the pairwise distance matrix between ``X`` and the training set.

        Args:
            X: 2-D array of query points.
            distance_metric: 'manhattan' (p=1), 'cartesian'/'euclidean' (p=2)
                or 'minkowski' (p=3).

        Returns:
            Array of shape (len(X), num_training_points).

        Raises:
            ValueError: if ``distance_metric`` is not a supported name.
        """
        if distance_metric not in self._NORM_ORDERS:
            raise ValueError("Please enter valid distance metric")
        # Broadcasting builds a (queries, train, features) difference tensor
        # whose norm is then taken along the feature axis.
        diffs = X[:, None, :] - self.X_train[None, :, :]
        return np.linalg.norm(diffs, ord=self._NORM_ORDERS[distance_metric], axis=-1)

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value in ``labels`` (smallest on ties)."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X, k, distance_metric='cartesian'):
        """Predict a label for every row of ``X`` from its ``k`` nearest neighbors.

        Args:
            X: 2-D array of query points.
            k: number of neighbors to vote; must be >= 1. Values larger than
                the training set size use every training point.
            distance_metric: see ``compute_distance``.

        Returns:
            1-D array with one predicted label per row of ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D, ``k`` < 1 or the metric is unknown.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if k < 1:
            raise ValueError("k must be at least 1")
        dist_matrix = self.compute_distance(X, distance_metric)
        if self.logging:
            print("\tDistance Matrix is: ", dist_matrix)
        sorted_dist_indices = np.argsort(dist_matrix, axis = 1)[:, :k]
        if self.logging:
            print("\tNeighbor Indices is: ", sorted_dist_indices)
        neighbor_labels = self.y_train[sorted_dist_indices]
        # Vote with np.unique instead of scipy.stats.mode: the labels here are
        # strings, which recent SciPy versions no longer accept in stats.mode.
        preds = np.array([self._majority_label(row) for row in neighbor_labels])
        if self.logging:
            print("\tPredictions are: ", preds)
        return preds

In [None]:
def compute_accuracy(targets, pred):
    """Return the percentage of positions where ``pred`` equals ``targets``.

    The number of element-wise matches is divided by ``len(pred)`` and
    scaled to the 0-100 range.
    """
    n_matches = np.count_nonzero(targets == pred)
    return (n_matches / len(pred)) * 100

## Loading Datasets

In [None]:
# Dataset files, given as paths relative to the notebook's working directory.
training_data = '1a-training.txt'
test_data = '1a-test.txt'
large_120_data = '1c-data.txt'

# Training set: every column except the last is a numeric feature,
# the last column is the class label (kept as a string).
train_np = readFile(training_data)
X_train_np = train_np[:, :-1].astype('float')
Y_train_np = train_np[:, -1].astype('str')

# Test set: all columns are cast to float, i.e. the file is read as
# features only (no label column is split off).
test_np = readFile(test_data)
X_test_np = test_np.astype('float')

# Larger dataset used for the leave-one-out experiments; same layout as
# the training set (features first, label last).
large_np = readFile(large_120_data)
X_large_np = large_np[:, :-1].astype('float')
Y_large_np = large_np[:, -1].astype('str')

FileNotFoundError: ignored

# Question 1-a

## Show the results of the gender prediction for the evaluation data listed below the generated training data, for K values of 1, 3, and 7. Include the intermediate steps (i.e., distance calculation, neighbor selection, and prediction).

In [None]:
#Pre-Processing the Data
# The scaler's mean/std are computed from the training features only and the
# same statistics are then applied to the test features.

scaler = StandardScaling()
X_train = scaler.fit_transform(X_train_np)
X_test = scaler.transform(X_test_np)

### Using Cartesian Distance

In [None]:
# logging=True makes predict() print the distance matrix, the chosen
# neighbor indices and the resulting predictions.
knn_classifier = KNearestNeighborClassifier(logging=True)

#Fit training data to KNN classifier
knn_classifier.fit(X_train, Y_train_np)

## k = 1, 3, 7

In [None]:
# Run the scaled test points through the classifier for every
# (distance metric, k) combination. The return value of predict() is not
# used here; the cell relies on whatever predict() prints.
for d_metric in ["cartesian", "manhattan", "minkowski"]:
  print("Distance metric used is: ", d_metric)
  for k in [1, 3, 7]:
      print("\t Number of Neighbors (k) is: ", k)
      knn_classifier.predict(X_test, k, distance_metric=d_metric) 
      print("\n")


# Question 1-c
## Leave One-Out Evaluation

In [None]:
# Leave-one-out evaluation of the KNN classifier on the large dataset.
# For every candidate k, each sample is held out once, the classifier is
# fitted on the remaining samples and the held-out sample is predicted.
# err_dict maps k -> error percentage (100 - accuracy in percent).
err_dict = {}

for k in [1, 3, 5, 7, 9, 11]:
    crct = 0
    total = 0
    for i in range(X_large_np.shape[0]):
        # Index lists for the single held-out sample and all other samples.
        tst_list = [i]
        trn_list = list(range(X_large_np.shape[0]))
        trn_list.remove(i)

        loo_X_train = np.take(X_large_np, trn_list, axis=0)
        loo_Y_train = np.take(Y_large_np, trn_list, axis=0)

        loo_X_test = X_large_np[tst_list]
        loo_Y_test = Y_large_np[tst_list]

        # Scaling statistics come from the training fold only, so the
        # held-out sample does not influence them.
        scaler = StandardScaling()
        X_train = scaler.fit_transform(loo_X_train)
        X_test = scaler.transform(loo_X_test)

        knn_classifier = KNearestNeighborClassifier()

        #Fit training data to KNN classifier
        knn_classifier.fit(X_train, loo_Y_train)
        preds = knn_classifier.predict(X_test, k, distance_metric='cartesian')
        # predict() returns a 1-D array with one label per query row, so the
        # predicted label is preds[0]. The previous preds[0][0] compared only
        # the first character of that label.
        if preds[0] == loo_Y_test[0]:
            crct += 1
        total += 1
    err_dict[k] = 100 - (100 * crct/total)

In [None]:
# err_dict maps each evaluated k to its leave-one-out error in percent.
print("Error perctange for different k values are: ", err_dict)

## k = 7 gave the lowest leave-one-out error.

# Question 1-d
## Leave One-Out Evaluation removing age variable

In [None]:
# Drop the last feature column of the large dataset
# (presumably the age column, per the section heading -- verify against the data file).
X_woage = X_large_np[:, :-1]

In [None]:
# Leave-one-out evaluation of the KNN classifier on X_woage (the large
# dataset without its last feature column). Overwrites err_dict with
# k -> error percentage for this reduced feature set.
err_dict = {}

for k in [1, 3, 5, 7, 9, 11]:
    crct = 0
    total = 0
    for i in range(X_woage.shape[0]):
        # Index lists for the single held-out sample and all other samples.
        tst_list = [i]
        trn_list = list(range(X_woage.shape[0]))
        trn_list.remove(i)
        
        loo_X_train = np.take(X_woage, trn_list, axis=0)
        loo_Y_train = np.take(Y_large_np, trn_list, axis=0)

        loo_X_test = X_woage[tst_list]
        loo_Y_test = Y_large_np[tst_list]

        # Scaling statistics are computed from the training fold only.
        scaler = StandardScaling()
        X_train = scaler.fit_transform(loo_X_train)
        X_test = scaler.transform(loo_X_test)

        knn_classifier = KNearestNeighborClassifier()

        #Fit training data to KNN classifier
        knn_classifier.fit(X_train, loo_Y_train)
        preds = knn_classifier.predict(X_test, k, distance_metric='cartesian') 
        # One query row was passed in, so preds[0] is its predicted label.
        if preds[0] == loo_Y_test[0]:
            crct += 1
        total += 1
    err_dict[k] = 100 - (100 * crct/total)

In [None]:
# err_dict now holds the leave-one-out error (percent) per k for X_woage.
print("Error percentages for different k values without age variable are: ", err_dict)

## k = 9 gave the lowest leave-one-out error without the age variable. Error percentages increased when the age variable was left out. This implies that age is a necessary variable for predicting the gender accurately.

# Question 2-b
## Gaussian Naive Bayes Code

In [None]:
class GaussianNBClassifier:
    """Gaussian Naive Bayes classifier fitted by maximum likelihood.

    Every feature is modelled per class as an independent normal
    distribution. ``fit`` estimates the class priors and the per-class,
    per-feature means and (population) variances; ``predict`` returns the
    class with the highest log posterior.

    Attributes (set by ``fit``):
        classes: sorted array of the distinct labels; row ``j`` of the
            parameter arrays belongs to ``classes[j]``.
        unique_classes: same array as ``classes`` (kept because the
            attribute was declared by the original constructor).
        prob_y: class priors, shape (num_classes,).
        mean_param: feature means, shape (num_classes, num_features).
        var_param: feature variances, shape (num_classes, num_features).

    Note:
        A feature that is constant within a class has variance 0, for which
        the Gaussian density is undefined; no smoothing is applied here.
    """

    def __init__(self):
        self.prob_y = None
        self.classes = None
        self.unique_classes = None
        self.num_points = None
        self.num_features = None
        self.num_classes = None
        self.mean_param = None
        self.var_param = None

    def fit(self, X, y):
        """Estimate priors, means and variances from features ``X`` and labels ``y``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        # np.unique yields a deterministic, sorted class order; iterating a
        # set of strings can change order between interpreter runs.
        self.classes = np.unique(y)
        self.unique_classes = self.classes
        self.num_classes = len(self.classes)
        self.num_points, self.num_features = X.shape
        self.prob_y = np.zeros(self.num_classes)
        self.mean_param = np.zeros((self.num_classes, self.num_features))
        self.var_param = np.zeros((self.num_classes, self.num_features))
        self.mle(X, y)

    def mle(self, X, y):
        """Fill ``prob_y``, ``mean_param`` and ``var_param`` class by class."""
        for idx, cls in enumerate(self.classes):
            members = X[y == cls]
            cls_count = members.shape[0]
            self.prob_y[idx] = cls_count/self.num_points
            self.mean_param[idx, :] = np.sum(members, axis=0)/cls_count
            # Population variance: squared deviations divided by the class count.
            self.var_param[idx, :] = np.sum(np.power(members - self.mean_param[idx], 2), axis=0)/cls_count

    def gaussian_dist(self, X, mean, var):
        """Return the normal density of ``X`` for the given ``mean`` and ``var``."""
        z_values = np.divide(X - mean, np.sqrt(var))
        dist = (1/np.sqrt(2*np.pi*var)) * np.exp(-0.5 * (z_values**2))
        return dist

    def get_log_bayes_prob(self, X):
        """Return the unnormalized log posterior of every class for each row of ``X``.

        Returns:
            Array of shape (len(X), num_classes) holding
            ``log P(y) + sum_f log N(x_f; mean, var)``.
        """
        X = np.asarray(X, dtype=float)
        # Evaluate the log-density directly instead of log(pdf): for points
        # far from a class mean the pdf underflows to 0 and its log to -inf,
        # which made every class tie and argmax fall back to index 0.
        diff = X[:, None, :] - self.mean_param[None, :, :]
        log_density = -0.5 * (np.log(2*np.pi*self.var_param)[None, :, :]
                              + (diff**2) / self.var_param[None, :, :])
        return np.log(self.prob_y)[None, :] + np.sum(log_density, axis=-1)

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        cond_prob = self.get_log_bayes_prob(X)
        max_index = np.argmax(cond_prob, axis=1)
        pred = self.classes[max_index]
        return pred

## Loading Datasets

In [None]:
# Reload the same three dataset files used in the KNN section, so this
# section does not depend on arrays created by earlier cells.
# Paths are relative to the notebook's working directory.
training_data = '1a-training.txt'
test_data = '1a-test.txt'
large_120_data = '1c-data.txt'

# Training set: every column except the last is a numeric feature,
# the last column is the class label (kept as a string).
train_np = readFile(training_data)
X_train_np = train_np[:, :-1].astype('float')
Y_train_np = train_np[:, -1].astype('str')

# Test set: all columns are cast to float, i.e. the file is read as
# features only (no label column is split off).
test_np = readFile(test_data)
X_test_np = test_np.astype('float')

# Larger dataset used for the leave-one-out experiments; same layout as
# the training set (features first, label last).
large_np = readFile(large_120_data)
X_large_np = large_np[:, :-1].astype('float')
Y_large_np = large_np[:, -1].astype('str')

# Question 2-a

## Learn/derive the parameters for the Gaussian Naive Bayes Classifier for the data from Question 2 a) and apply them to the same target as in problem 1 a).

In [None]:
gnb_classifier = GaussianNBClassifier()

# Fit the Gaussian Naive Bayes classifier on the raw (unscaled) training data.
gnb_classifier.fit(X_train_np, Y_train_np)

### Learned Parameters

In [None]:
# Both arrays have one row per class and one column per feature.
print("Learned Mean Parameters are: ", gnb_classifier.mean_param)
print("Learned Variance Parameters are: ", gnb_classifier.var_param)

Learned Mean Parameters are:  [[ 1.59911758 66.95767358 31.14285714]
 [ 1.6822001  75.8642998  32.14285714]]
Learned Variance Parameters are:  [[4.87896731e-03 1.50004970e+01 1.81224490e+01]
 [6.61414988e-04 1.12548981e+01 6.12244898e+00]]


### Test data predictions

In [None]:
# Predict a label for every row of the (unscaled) test features.
preds = gnb_classifier.predict(X_test_np) 
print("Test data predictions are: ", preds)

Test data predictions are:  ['W' 'W' 'W' 'W']


# Question 2-c

In [None]:
# Leave-one-out evaluation of Gaussian Naive Bayes on the large dataset:
# each sample is held out once, the classifier is fitted on the remaining
# samples and the held-out sample is predicted.
crct = 0
total = 0
for i in range(X_large_np.shape[0]):
    # Index lists for the single held-out sample and all other samples.
    tst_list = [i]
    trn_list = list(range(X_large_np.shape[0]))
    trn_list.remove(i)
    
    loo_X_train = np.take(X_large_np, trn_list, axis=0)
    loo_Y_train = np.take(Y_large_np, trn_list, axis=0)

    loo_X_test = X_large_np[tst_list]
    loo_Y_test = Y_large_np[tst_list]
    
    gnb_classifier = GaussianNBClassifier()

    # Fit Gaussian Naive Bayes on the training fold (features are not scaled here).
    gnb_classifier.fit(loo_X_train, loo_Y_train)
    preds = gnb_classifier.predict(loo_X_test) 
    # One query row was passed in, so preds[0] is its predicted label.
    if preds[0] == loo_Y_test[0]:
        crct += 1
    total += 1
# Error percentage = 100 - accuracy in percent.
err_percentage = 100 - (100 * crct/total)

In [None]:
# err_percentage is the leave-one-out error (percent) computed in the previous cell.
print("Error perctange for Gaussian Naive Bayes is: ", err_percentage)

Error perctange for Gaussian Naive Bayes is:  30.0


# Question 2-d
## Leave One-Out Evaluation removing age variable

In [None]:
# Drop the last feature column of the large dataset
# (presumably the age column, per the section heading -- verify against the data file).
X_woage = X_large_np[:, :-1]

In [None]:
# Leave-one-out evaluation of Gaussian Naive Bayes on X_woage (the large
# dataset without its last feature column).
crct = 0 
total = 0
for i in range(X_woage.shape[0]):
    # Index lists for the single held-out sample and all other samples.
    tst_list = [i]
    trn_list = list(range(X_woage.shape[0]))
    trn_list.remove(i)
    
    loo_X_train = np.take(X_woage, trn_list, axis=0)
    loo_Y_train = np.take(Y_large_np, trn_list, axis=0)

    loo_X_test = X_woage[tst_list]
    loo_Y_test = Y_large_np[tst_list]

    gnb_classifier = GaussianNBClassifier()

    # Fit Gaussian Naive Bayes on the training fold (features are not scaled here).
    gnb_classifier.fit(loo_X_train, loo_Y_train)
    preds = gnb_classifier.predict(loo_X_test) 
    # One query row was passed in, so preds[0] is its predicted label.
    if preds[0] == loo_Y_test[0]:
        crct += 1
    total += 1

# Error percentage = 100 - accuracy in percent.
err_percentage = 100 - (100 * crct/total)

In [None]:
# err_percentage now holds the leave-one-out error (percent) for X_woage.
print("Error percentage for gaussian naive bayes without age variable is: ", err_percentage)

Error percentage for gaussian naive bayes without age variable is:  29.16666666666667


## The error percentage remained similar when the age variable was left out. This implies that age is not a necessary variable for predicting the gender accurately when using Gaussian Naive Bayes.

# Question 2-e

Based on the leave-one-out error, Gaussian Naive Bayes gave better results than the best K-Nearest Neighbor classifier, both when using the height, weight, and age variables and when using only the height and weight variables.