# HW02 - Naive Bayes' Classifier

### This homework solves multivariate classification data set with Naive Bayes' Classifier

In [1]:
#Import the libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

def safelog(x):
    return(np.log(x + 1e-100))   #we define this in order to get rid of -inf, inf of np.log(0)

### 2 csv files were given which the images file contained 784 pixels (28*28) of images from 5 different classes. Thus, we loaded the csv of pixels of images and their corresponding labels.

In [2]:
#Read the csv files 
images = pd.read_csv('hw02_images.csv', header=None)
labels = pd.read_csv('hw02_labels.csv', header=None, names=['classes'])

In [3]:
#See sample of images data frame
images.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,255,255,255,255,255,255,255,250,255,255,...,255,255,255,225,212,255,255,255,255,255
1,255,255,255,254,253,255,255,255,255,255,...,252,255,255,255,255,254,255,255,255,255
2,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
3,255,255,255,250,251,250,250,252,250,249,...,248,247,248,251,252,248,250,255,255,255
4,255,255,255,255,255,255,255,255,255,255,...,241,255,255,255,255,255,255,255,255,255


### When data was analyzed, there were in total of 5 classes whose numbers were also starting from 1.

In [4]:
#Shows that there are 5 classes present
print('Number of classes are: {}'.format(np.max(labels)))
print('Minimum class is: {}'.format(np.min(labels)))

Number of classes are: classes    5
dtype: int64
Minimum class is: classes    1
dtype: int64


## Question 3:

### The data was splitted into training and test data sets which first 30000 rows were given to training data and the remaining 50000 images were given to test data. Both data sets contained 784 features.

In [5]:
#Splitting data into test and train datasets

x_train = images[:30000]
y_train = labels[:30000]
x_test = images[30000:]
y_test = labels[30000:]

In [6]:
print('Training data of x has {} rows and {} columns'.format(x_train.shape[0], x_train.shape[1]))
print('Training data of y has {} rows and {} columns'.format(y_train.shape[0], y_train.shape[1]))
print('Test data of x has {} rows and {} columns'.format(x_test.shape[0], x_test.shape[1]))
print('Test data of y has {} rows and {} columns'.format(y_test.shape[0], y_test.shape[1]))

Training data of x has 30000 rows and 784 columns
Training data of y has 30000 rows and 1 columns
Test data of x has 5000 rows and 784 columns
Test data of y has 5000 rows and 1 columns


## Question 4

### Number of classes was assigned to variable K in order to parametrize the code.

In [7]:
#Taking number of classes
K = np.max(labels['classes'])

### Parameters for the score function (sample means, sample deviation, and class sizes) were calculated.

In [8]:
#Calculating sample means

sample_means = np.array([x_train[y_train['classes']==i].mean() for i in range(1,K+1)])
print("Sample means: {}".format(sample_means))

Sample means: [[254.99866667 254.98416667 254.85616667 ... 254.679      254.87816667
  254.95933333]
 [254.99733333 254.99733333 254.9965     ... 254.96883333 254.99216667
  254.98866667]
 [254.99933333 254.99933333 254.99233333 ... 251.52483333 254.4725
  254.97483333]
 [254.99666667 254.98983333 254.91416667 ... 252.39516667 254.44166667
  254.93666667]
 [254.999      254.98433333 254.93783333 ... 250.673      253.23333333
  254.79083333]]


In [9]:
#Calculating sample deviations

sample_deviations = np.array([np.sqrt(((x_train[y_train['classes'] == i+1] - sample_means[i])**2).mean()) for i in range(K)])
print("Sample deviations: {}".format(sample_deviations))

Sample deviations: [[ 0.09127736  0.25609108  1.31090756 ...  5.29826629  3.9117332
   1.93959091]
 [ 0.2065419   0.2065419   0.2163818  ...  1.04076669  0.47057267
   0.70062226]
 [ 0.05163547  0.04081939  0.16002465 ... 18.43665868  6.7881694
   1.1061344 ]
 [ 0.18436076  0.21617116  1.81046936 ... 15.67799977  6.34549162
   1.79971911]
 [ 0.04471018  0.64582342  3.03248555 ... 23.62576428 13.9167006
   4.4727787 ]]


In [10]:
#Calculating class priors

class_priors = np.array([np.mean(y_train == (i)) for i in range(1,K+1)])
print("Class priors: {}".format(class_priors))

Class priors: [[0.2]
 [0.2]
 [0.2]
 [0.2]
 [0.2]]


## Question 5:

### With Naive Bayes' Classifier, score function can be calculated as: 
![title](equation.png)

In [11]:
#Turning df into an array
x_train_array = np.array(x_train)
x_test_array = np.array(x_test)

In [12]:
#Calculating the score function for train set

def score_func(x,sample_means,sample_deviations, class_priors):
    return [np.sum((-0.5*safelog(2*np.pi*(sample_deviations[i]**2)))
                  -(((x_train_array[c]-sample_means[i])**2)/(2*(sample_deviations[i]**2))))+ safelog(class_priors[i]) for i in range(K)]

###  Maximum value among the score functions was taken by argmax function.

In [13]:
#Assigning each row of dataset its prediction class with argmax function 

x_train_score = np.zeros((x_train_array.shape[0],K))

for c in range(x_train_array.shape[0]):
    x_train_score[c] = score_func(x_train,sample_means,sample_deviations, class_priors)

y_train_predicted = np.argmax(x_train_score, axis=1)+1
y_train_predicted

array([3, 1, 2, ..., 1, 3, 5])

### The confusion matrix for training data set included predictions for 5 classes and it gave accurate results.

In [14]:
#Creating confusion matrix for train set 
confusion_matrix = pd.crosstab(y_train_predicted, np.array(y_train).reshape(30000,), rownames = ['y_pred'], colnames = ['y_truth'])
confusion_matrix

y_truth,1,2,3,4,5
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3685,49,4,679,6
2,1430,5667,1140,1380,532
3,508,208,4670,2948,893
4,234,60,123,687,180
5,143,16,63,306,4389


## Question 6:

### Score function for test data set were calculated and argmax function was used in order to take the maximum score to determine the class the points belonged to.

In [15]:
#Calculating the score function for test set

def score_func(x,sample_means,sample_deviations, class_priors):
    return [np.sum((-0.5*safelog(2*np.pi*(sample_deviations[i]**2)))
                  -(((x_test_array[c]-sample_means[i])**2)/(2*(sample_deviations[i]**2))))+ safelog(class_priors[i]) for i in range(K)]

In [16]:
#Assigning each row of dataset its prediction class with argmax function 

x_test_score = np.zeros((x_test_array.shape[0],K))

for c in range(x_test_array.shape[0]):
    x_test_score[c] = score_func(x_test,sample_means,sample_deviations, class_priors)

y_test_predicted = np.argmax(x_test_score, axis=1)+1
y_test_predicted

array([1, 2, 5, ..., 3, 5, 5])

### The confusion matrix for test data set included predictions for 5 classes and it gave accurate results.

In [17]:
#Creating confusion matrix for test set
confusion_matrix = pd.crosstab(y_test_predicted, np.array(y_test).reshape(5000,), rownames = ['y_pred'], colnames = ['y_truth'])
confusion_matrix

y_truth,1,2,3,4,5
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,597,6,0,114,1
2,237,955,188,267,81
3,92,25,785,462,167
4,34,11,16,109,29
5,40,3,11,48,722
