### Q2

In [1]:
import cv2
import numpy as np
from cv2 import VideoWriter, VideoWriter_fourcc
import matplotlib.cm as cm
from collections import Counter
import glob
from matplotlib.image import imread
from enum import Enum
from scipy.linalg import svd
import scipy.stats as ss
import collections

# For testing
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


imp_data = np.genfromtxt('spambase.data', delimiter=',')

def train_test_split(data, train_size, random_state):
    '''Splitting testing and training data'''

    # Resetting random seed
    np.random.seed(random_state)

    n = len(data)

    # Rows shuffled
    np.random.shuffle(data)

    # Calculates array index for splitting
    spltIdx = int(np.ceil((2/3)*n))

    # Training-validation data split
    data_train, data_test = data[:spltIdx,:], data[spltIdx:,:]

    # Training data
    x_tr, y_tr = np.hsplit(data_train, [-1])
    # Testing Data
    x_tt, y_tt = np.hsplit(data_test, [-1])



    # Separating class label from data
    class_label_tr = data_train[:, -1].astype(int)
    dataset_tr = data_train[:, :-1]

    class_label_tt = data_test[:, -1].astype(int)
    dataset_tt = data_test[:, :-1]

    # Filtering features with low std
    # dataset_tr = std_filter1(dataset_tr, 0)
    # dataset_tt = std_filter1(dataset_tt, 0)

    og_mean = np.mean(dataset_tr)
    og_std = np.std(dataset_tr)

    # dataset_tr = (dataset_tr - np.mean(dataset_tr)) / np.std(dataset_tr)
    # dataset_tt = (dataset_tt - np.mean(dataset_tt)) / np.std(dataset_tt)

    dataset_tr = (dataset_tr - og_mean) / og_std
    dataset_tt = (dataset_tt - og_mean) / og_std

    # x_tr = (x_tr - np.mean(x_tr)) / np.std(x_tr)
    # x_tt = (x_tt - np.mean(x_tt)) / np.std(x_tt)

    # return x_tr, y_tr, x_tt, y_tt
    return dataset_tr, class_label_tr, dataset_tt, class_label_tt
    # return dataset_tr, y_tr, dataset_tt, y_tt


def std_filter(data, std_val):
    '''Filters out features with low std'''

    dataset = np.copy(data)
    temp = 0
    while temp < dataset.shape[1]:
        if(np.std(dataset[:,temp]) == 0):
            dataset = np.delete(dataset, temp, 1)
            temp = temp - 1
        else:
            dataset[:,temp] = (dataset[:,temp] - np.mean(dataset[:,temp])) / np.std(dataset[:,temp])
            temp = temp + 1

    return dataset








In [2]:
class ClassifierEvaluation:
    def __init__(self, y_true, y_pred):
        self.y_true = y_true.astype(int)
        self.y_pred = y_pred.astype(int)
        unique, counts = np.unique(y_true, return_counts=True)
        self.y_true_dict = dict(zip(unique, counts))
        unique, counts = np.unique(y_pred, return_counts=True)
        self.y_pred_dict = dict(zip(unique, counts))
        

    def eval(self):
        self.TP = 0
        self.TN = 0
        self.FP = 0
        self.FN = 0

        for i in range(len(self.y_true)):
            if (self.y_true[i] == 1 and self.y_pred[i] == 1):
                self.TP += 1
            elif (self.y_true[i] == 1 and self.y_pred[i] == 0):
                self.FP += 1
            elif (self.y_true[i] == 0 and self.y_pred[i] == 1):
                self.FN += 1
            elif (self.y_true[i] == 0 and self.y_pred[i] == 0):
                self.TN += 1
            

    def get_precision(self):
        '''Precision = TP / (TP + FP)'''
        
        precision = self.TP/(self.TP + self.FP)
        return precision

    def get_recall(self):
        '''Precision = TP / (TP + FN)'''

        recall = self.TP/(self.TP + self.FN)
        return recall

    def get_fmeasure(self):
        '''Recall = (2 * Precision * Recall) / (Precision + Recall)'''

        fmeasure = (2 * self.get_precision() * self.get_recall())/(self.get_precision() + self.get_recall())
        return fmeasure

    def get_accuracy(self):
        '''Accuracy = (TP + TN) / (TP + TN + FP + FN)'''

        accuracy = (self.TP + self.TN) /(self.TP + self.TN + self.FP + self.FN)

        return accuracy

In [22]:
class NaiveBayes:
    def __init__(self, x, y):
        self.n_samples, self.n_features = x.shape
        self.classes = np.unique(y)
        self.n_classes = len(self.classes)
        self.features = x
        self.target = y.flatten()
        
        self.mean_data = np.zeros((self.n_classes, self.n_features), dtype=np.float64)
        self.std_data = np.zeros((self.n_classes, self.n_features), dtype=np.float64)
        self.prior_data = np.zeros(self.n_classes)
        

    def get_target(self):
        '''For debugging purposes'''

        print(np.mean(self.mean_data.flatten()))
        print(np.mean(self.std_data.flatten()))
        print(self.target.shape)
        print(self.features.shape)
        return self.target


    def fit(self):
        '''Separates spam and not spam rows'''

        data = self.features
        label = self.target

        label = label.reshape(label.shape[0], 1)

        spIdx_lst = np.where(~label.any(axis=1))[0]
        notIdx_sp_lst = np.where(label.any(axis=1))[0]

        d_list = data.tolist()
        sp_list = []
        not_sp_list = []

        for index in spIdx_lst:
            sp_list += [d_list[index]]

        for index in notIdx_sp_lst:
            not_sp_list += [d_list[index]]


        sp_data = np.asarray(sp_list)
        not_sp_data = np.asarray(not_sp_list)


        self.mean_data[0, :] = sp_data.mean(axis=0)
        self.std_data[0, :] = sp_data.std(axis=0)
        self.prior_data[0] = sp_data.shape[0] / float(self.n_samples)

        self.mean_data[1, :] = not_sp_data.mean(axis=0)
        self.std_data[1, :] = not_sp_data.std(axis=0)
        self.prior_data[1] = not_sp_data.shape[0] / float(self.n_samples)


    def get_stats(self):
        '''For debugging purposes'''

        return self.mean_data, self.std_data, self.prior_data


    def calc_posterior(self, x):
        '''Chooses the class label based on which class probability is higher'''

        posteriors = []

        for i in range(self.n_classes):
            prior = self.prior_data[i]
            n_pdf = self.norm_pdf(x, i)
            n_pdf = np.prod(np.nan_to_num(n_pdf, nan=10^-8, posinf=10^8, neginf=10^-8))
            posterior = prior * n_pdf
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]


    def predict(self, x):
        '''Gets the predicted target values'''

        preds = [self.calc_posterior(i) for i in x]
        return np.asarray(preds, dtype=np.float64)


    def norm_pdf(self, data, c_idx):
        '''Calculates norm pdf'''

        mean = self.mean_data[c_idx]
        std = self.std_data[c_idx]

        numerator = np.exp(- (data-mean)**2 / (2 * (std**2)))
        denominator = std * np.sqrt(2 * np.pi)

        return numerator / denominator

In [11]:
x_tr, y_tr, x_tt, y_tt = train_test_split(imp_data, train_size=2/3, random_state=0)

# np.set_printoptions(threshold=sys.maxsize)
# np.set_printoptions(threshold = False)

g_nb = NaiveBayes(x_tr, y_tr)
g_nb.fit()
# print(g_nb.get_stats())
predictions = g_nb.predict(x_tt)
gb_ce = ClassifierEvaluation(y_tt, predictions)
gb_ce.eval()
print(f"Precision: {gb_ce.get_precision() * 100}%")
print(f"Recall: {gb_ce.get_recall() * 100}%")
print(f"F-measure: {gb_ce.get_fmeasure() * 100}%")
print(f"Accuracy: {gb_ce.get_accuracy() * 100}%")

Precision: 95.65217391304348%
Recall: 67.85290628707%
F-measure: 79.38931297709924%
Accuracy: 80.62622309197651%


### Q3

In [3]:
import math

class LogisticRegression:
    def __init__(self, lr=1):
        self.lr = lr
        self.precision = 10^-20
        self.max_iters = 100000


    def fit(self, x, y):

        biasF = np.ones(((len(x)), 1))
        x = np.hstack((biasF, x))
        
        self.features = x.T
        self.target = y.T
        self.target = self.target.reshape(1, self.target.shape[0])
        n, m = self.features.shape


        self.W = np.zeros((n, 1))
        cost = 0
        step_size_c = 1
        c_change = 0
        epoch = 0

        # while step_size_c > self.precision:
        for i in range(self.max_iters):
        
            Z = np.dot(self.W.T, self.features)
            A = self.sigmoid(Z)

            prev_cost = cost
            cost = self.cost_func(m, A, self.target)

            prev_change = c_change
            c_change = abs(cost - prev_cost)

            if c_change > prev_change:
                self.lr = self.lr/2

            dW = (1/m)*np.dot(A - self.target, self.features.T)
            
            self.W = self.W - self.lr * dW.T

            step_size_c = abs(c_change)
            epoch += 1

            if epoch % 10000 == 0:
                print("EPOCH:", epoch)


    def gradient_ascent(self, X, h, y):
        return np.dot(X.T, y - h)

    def cost_func(self, m, A, Y):

        Y = np.nan_to_num(Y, nan=10^-8, posinf=10^8, neginf=10^-12)
        A = np.nan_to_num(A, nan=10^-8, posinf=10^8, neginf=10^-12)
        sub_logA = np.nan_to_num(np.log(1-A), nan=10^-8, posinf=10^8, neginf=10^-12)
        sub_Y = np.nan_to_num(1 - Y, nan=10^-8, posinf=10^8, neginf=10^-12)

        cost = -(1/m)*np.sum( Y * np.log(A) + sub_Y * sub_logA)
        return cost


    def predict(self, x):

        biasF = np.ones(((len(x)), 1))
        x = np.hstack((biasF, x))

        x = x.T
        n, m = x.shape
        
        Z = np.dot(self.W.T, x)
        A = self.sigmoid(Z)
        A = A > 0.5
        return np.array(A, dtype = 'int64').flatten()

        
    def sigmoid(self, x):
        return 1/(1 + np.exp(- x))


    def get_stats(self):
        return self.features


x_tr, y_tr, x_tt, y_tt = train_test_split(imp_data, train_size=2/3, random_state=0)

# np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(threshold = False)

g_lr = LogisticRegression()
g_lr.fit(x_tr, y_tr)
predictions = g_lr.predict(x_tt)
lr_ce = ClassifierEvaluation(y_tt, predictions)
lr_ce.eval()
print(f"Precision: {lr_ce.get_precision() * 100}%")
print(f"Recall: {lr_ce.get_recall() * 100}%")
print(f"F-measure: {lr_ce.get_fmeasure() * 100}%")
print(f"Accuracy: {lr_ce.get_accuracy() * 100}%")

  sub_logA = np.nan_to_num(np.log(1-A), nan=10^-8, posinf=10^8, neginf=10^-12)
EPOCH: 10000
EPOCH: 20000
EPOCH: 30000
EPOCH: 40000
EPOCH: 50000
EPOCH: 60000
EPOCH: 70000
EPOCH: 80000
EPOCH: 90000
EPOCH: 100000
Precision: 43.92361111111111%
Recall: 77.37003058103976%
F-measure: 56.03543743078626%
Accuracy: 74.1030658838878%


In [29]:
print("I f****** give up")

I f****** give up


### Q1

In [5]:
imp_data = np.genfromtxt('q1_2.csv', delimiter=',')

x_tr, y_tr = np.hsplit(imp_data, [-1])
og_mean = np.mean(x_tr)
og_std = np.std(x_tr)
x = (x_tr - og_mean) / og_std

label = y_tr
n_samples, n_features = x.shape
classes = np.unique(y_tr)
n_classes = len(classes)

mean_data = np.zeros((n_classes, n_features), dtype=np.float64)
std_data = np.zeros((n_classes, n_features), dtype=np.float64)
prior_data = np.zeros(n_classes)
label = label.reshape(label.shape[0], 1)

noIdx_lst = np.where(~label.any(axis=1))[0]
yesIdx_lst = np.where(label.any(axis=1))[0]


d_list = x.tolist()
no_list = []
yes_list = []

for index in noIdx_lst:
    no_list += [d_list[index]]

for index in yesIdx_lst:
    yes_list += [d_list[index]]


no_data = np.asarray(no_list)
yes_data = np.asarray(yes_list)


mean_data[0, :] = no_data.mean(axis=0)
std_data[0, :] = no_data.std(axis=0)
prior_data[0] = no_data.shape[0] / float(n_samples)

mean_data[1, :] = yes_data.mean(axis=0)
std_data[1, :] = yes_data.std(axis=0)
prior_data[1] = yes_data.shape[0] / float(n_samples)

print(mean_data)
print(std_data)
print(prior_data)
print()
print(x)

x_tt = np.asarray([[242, 4.56]], dtype=np.float32)
x_test = (x_tt - og_mean) / og_std
print(x_test)



# x_tr, y_tr, x_tt, y_tt = train_test_split(imp_data, train_size=3/3, random_state=0)

# # np.set_printoptions(threshold=sys.maxsize)
# # np.set_printoptions(threshold = False)

# g_nb = NaiveBayes(x_tr, y_tr)
# g_nb.fit()
# g_nb.get_stats()
# # print(g_nb.get_stats())
# predictions = g_nb.predict(x_tt)
# gb_ce = ClassifierEvaluation(y_tt, predictions)
# gb_ce.eval()
# print(f"Precision: {gb_ce.get_precision() * 100}%")
# print(f"Recall: {gb_ce.get_recall() * 100}%")
# print(f"F-measure: {gb_ce.get_fmeasure() * 100}%")
# print(f"Accuracy: {gb_ce.get_accuracy() * 100}%")

[[ 1.75949201 -0.74870233]
 [ 0.06547994 -0.73933973]]
[[0.33151634 0.00688534]
 [0.52104341 0.00759711]]
[0.4 0.6]

[[ 0.80137336 -0.7310336 ]
 [-0.26967943 -0.73759107]
 [ 1.42797567 -0.75558767]
 [-0.3352541  -0.74939451]
 [ 2.09100835 -0.74181699]]
[[ 0.9908112  -0.73919404]]
