# PART I: Theory Questions

<img src="1_2_3.png">
<img src="4_5.png">

In [36]:
import numpy as np


class NeuralNetwork:
    """Fully connected feed-forward network trained with mini-batch gradient descent.

    The same class covers a single-layer network (no hidden layers) and
    networks with any number of hidden layers. Every layer applies the
    configured activation function and ``forward`` finally applies softmax.

    The activation, derivative and loss callables are stored as plain
    functions on the instance and are invoked as ``func(self, ...)``, so they
    can be passed as ``NeuralNetwork.sigmoid`` etc. The derivative callable is
    always given the *output* of the activation function, not its input.
    """

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        return 1.0 / (1 + np.exp(-x))

    def sigmoid_derivation(self, x):
        """Return the sigmoid derivative, given the sigmoid output ``x``."""
        return x * (1 - x)

    def ReLu(self, x):
        """Return ``max(x, 0)`` element-wise without modifying ``x``."""
        return np.maximum(x, 0)

    def ReLu_derivation(self, x):
        """Return 1 where ``x > 0`` and 0 elsewhere, without modifying ``x``.

        Works on the ReLU output as well as on its input, because the output
        is positive exactly where the input is.
        """
        x = np.asarray(x)
        return (x > 0).astype(x.dtype)

    def mse(self, error):
        """Return the sum of squared errors divided by ``len(error)``."""
        return np.sum(np.square(error)) / len(error)

    def softmax(self, x):
        """Return the softmax of ``x``, normalised over all of its elements."""
        x = np.asarray(x)
        if x.size == 0:
            return np.exp(x)
        # Subtracting the maximum leaves the result unchanged mathematically
        # but keeps exp() from overflowing for large inputs.
        exponentials = np.exp(x - np.max(x))
        return exponentials / np.sum(exponentials)

    def neg_log_likelihood(self, output, y):  # cross entropy
        """Return the summed cross entropy between ``output`` and target ``y``.

        Base-10 logarithms are kept so reported loss values stay comparable
        with earlier runs. Probabilities are clipped away from exactly 0 and 1
        so the result is always finite.
        """
        eps = 1e-12
        clipped = np.clip(output, eps, 1 - eps)
        return -(y * np.log10(clipped) + (1 - y) * np.log10(1 - clipped)).sum()

    def der_cross_entropy(self, output, y):
        """Return ``output - y``, the error signal used by back-propagation."""
        return output - y

    def tanh(self, x):
        """Return the hyperbolic tangent of ``x`` element-wise."""
        return np.tanh(x)

    def der_tanh(self, x):
        """Return the tanh derivative, given the tanh output ``x``.

        ``back_propagation`` passes the stored activation output, so the
        derivative is expressed through the output: ``1 - tanh(h)**2``.
        """
        return 1 - np.square(x)

    def __init__(self, input_layers=900, hidden_layers=None, output_layers=6,
                 activation_func=sigmoid, der_func=sigmoid_derivation, loss_func=neg_log_likelihood):
        """Create the network and its randomly initialised parameters.

        Args:
            input_layers: number of input features.
            hidden_layers: sizes of the hidden layers; ``None`` or an empty
                sequence builds a single-layer network.
            output_layers: number of output units.
            activation_func: activation, called as ``func(network, values)``.
            der_func: derivative of the activation expressed in terms of the
                activation output, called as ``func(network, outputs)``.
            loss_func: loss, called as ``func(network, output, target)``.
        """
        hidden_layers = [] if hidden_layers is None else list(hidden_layers)

        self.input_layers = input_layers
        self.hidden_layer = hidden_layers  # empty list means single layer
        self.output_layers = output_layers

        self.activation_func = activation_func
        self.der_activation_func = der_func
        self.loss_func = loss_func

        layer_sizes = [input_layers] + hidden_layers + [output_layers]
        layer_pairs = list(zip(layer_sizes, layer_sizes[1:]))

        # One weight matrix per pair of consecutive layers, e.g. for
        # input=900, hidden=[100], output=6: shapes (900, 100) and (100, 6).
        self.weights = [np.random.rand(n_in, n_out) / 1000 for n_in, n_out in layer_pairs]

        # One bias vector per non-input layer, e.g. shapes (100,) and (6,).
        self.bias = [np.random.rand(n_out) / 1000 for n_out in layer_sizes[1:]]

        # Activation output of every layer (index 0 is the input itself),
        # stored by forward() for use in back-propagation.
        self.activation_outputs = [np.zeros(size) for size in layer_sizes]

        # Weight gradients, same shapes as the weight matrices.
        self.derivation_outputs = [np.zeros((n_in, n_out)) for n_in, n_out in layer_pairs]

        # Bias gradients, same shapes as the bias vectors.
        self.bias_deltas = [np.zeros(n_out) for n_out in layer_sizes[1:]]

    def forward(self, inputs):
        """Propagate one sample through the network and return softmax scores.

        The activation output of every layer is stored in
        ``self.activation_outputs`` for the following back-propagation.
        """
        activations = inputs
        self.activation_outputs[0] = activations  # first activation is the input
        for i in range(len(self.weights)):
            pre_activation = np.dot(activations, self.weights[i])
            pre_activation += self.bias[i]

            activations = self.activation_func(self, pre_activation)
            # weight i produces the activation output of layer i + 1
            self.activation_outputs[i + 1] = activations

        return self.softmax(activations)

    def back_propagation(self, error):
        """Compute weight and bias gradients for the last forwarded sample.

        Args:
            error: output error signal, e.g. ``output - y``.

        Returns:
            The error propagated back to the input layer.

        The gradients are written to ``self.derivation_outputs`` and
        ``self.bias_deltas``; the parameters themselves are not changed.
        """
        for i in range(len(self.derivation_outputs) - 1, -1, -1):
            layer_output = self.activation_outputs[i + 1]
            delta = error * self.der_activation_func(self, layer_output)
            delta_row = delta.reshape(delta.shape[0], -1).T  # shape (1, n_out)
            self.bias_deltas[i] = delta  # bias gradient

            previous_output = self.activation_outputs[i]
            previous_column = previous_output.reshape(previous_output.shape[0], -1)  # (n_in, 1)
            # column x row = outer product = gradient of the weight matrix
            self.derivation_outputs[i] = np.dot(previous_column, delta_row)
            error = np.dot(delta, self.weights[i].T)

        return error

    def gradient_descent(self, learning_rate):  # update values
        """Apply the stored gradients to the weights and biases in place."""
        for i in range(len(self.weights)):
            w = self.weights[i]
            w -= self.derivation_outputs[i] * learning_rate
            self.weights[i] = w

            self.bias[i] -= self.bias_deltas[i] * learning_rate

    def _apply_batch(self, weight_grads, bias_grads, count, learning_rate):
        """Average the accumulated gradients over ``count`` samples, take one
        gradient-descent step and reset the accumulators to zero."""
        for k in range(len(self.weights)):
            self.derivation_outputs[k] = weight_grads[k] / count
            self.bias_deltas[k] = bias_grads[k] / count
            weight_grads[k].fill(0)
            bias_grads[k].fill(0)
        self.gradient_descent(learning_rate)

    def train2(self, train_set, epochs=250, learning_rate=0.05, batch_size=4, decay_rate=0.99):
        """Train the network with mini-batch gradient descent.

        Args:
            train_set: sequence of ``(features, target)`` pairs, where
                ``target`` is the index of the correct output unit.
            epochs: number of passes over ``train_set``.
            learning_rate: initial step size.
            batch_size: number of samples per parameter update.
            decay_rate: factor applied to the learning rate after each epoch.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.

        Every 50 epochs the number and fraction of correctly classified
        samples and the mean loss are printed.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        sample_count = len(train_set)
        if sample_count == 0:
            return  # nothing to learn from; also avoids dividing by zero below

        for epoch in range(1, epochs + 1):
            total_loss = 0
            correct = 0
            pending = 0  # samples accumulated since the last update
            weight_grads = [np.zeros_like(w) for w in self.weights]
            bias_grads = [np.zeros_like(b) for b in self.bias]

            for features, target in train_set:
                output = self.forward(features)
                y = np.array([1. if idx == target else 0. for idx in range(len(output))])
                total_loss += self.loss_func(self, output, y)
                if np.argmax(output) == target:
                    correct += 1

                # Gradients must be computed per sample, while this sample's
                # activations are still stored, and only then averaged.
                self.back_propagation(self.der_cross_entropy(output, y))
                for k in range(len(self.weights)):
                    weight_grads[k] += self.derivation_outputs[k]
                    bias_grads[k] += self.bias_deltas[k]
                pending += 1

                if pending == batch_size:
                    self._apply_batch(weight_grads, bias_grads, pending, learning_rate)
                    pending = 0

            if pending:
                # trailing partial batch: use the samples instead of dropping them
                self._apply_batch(weight_grads, bias_grads, pending, learning_rate)

            learning_rate *= decay_rate  # decay rate
            if epoch % 50 == 0:
                print(correct, correct / sample_count)
                print("Error: {} at epoch {}".format(total_loss / sample_count, epoch))

                
                
                
                
import os
import pickle

import cv2
import numpy as np
from sklearn.preprocessing import StandardScaler

from Layer import NeuralNetwork


def _load_image_vector(image_path, size=(30, 30)):
    """Load one image as a min-max normalised, flattened grayscale vector.

    Returns ``None`` when OpenCV cannot decode the file, so callers can skip
    stray non-image files instead of crashing.
    """
    img = cv2.imread(image_path, 0)  # 0 -> read as grayscale
    if img is None:
        return None
    img = cv2.resize(img.astype(np.uint8), size, interpolation=cv2.INTER_AREA)
    lowest, highest = np.min(img), np.max(img)
    if highest == lowest:
        # constant image: min-max scaling would divide by zero
        return np.zeros(img.size, dtype=float)
    return ((img - lowest) / (highest - lowest)).flatten()  # 1d image array


def read_and_store(path="data/seg_train/seg_train"):
    """Read a labelled image set laid out as one sub-folder per class.

    Sub-folders are visited in sorted order and numbered from 0, so the same
    folder names always map to the same labels (``os.listdir`` alone gives no
    ordering guarantee). Entries of ``path`` that are not directories and
    files that cannot be decoded as images are skipped.

    Returns:
        A list of ``[vector, label]`` items, where ``vector`` is the
        flattened, normalised 30x30 image.
    """
    t_set = []
    label_counter = 0  # 0 -> first folder in sorted order, 1 -> second, ...
    for train_folder in sorted(os.listdir(path)):
        folder = os.path.join(path, train_folder)
        if not os.path.isdir(folder):
            continue
        for filename in os.listdir(folder):
            out = _load_image_vector(os.path.join(folder, filename))
            if out is not None:
                t_set.append([out, label_counter])
        label_counter += 1
    return t_set


def read_test(folder="data/seg_test"):
    """Read the unlabelled test images stored directly in ``folder``.

    Files that cannot be decoded as images are skipped.

    Returns:
        A list of one-element lists ``[vector]``, where ``vector`` is the
        flattened, normalised 30x30 image.
    """
    test = []
    for filename in os.listdir(folder):
        out = _load_image_vector(os.path.join(folder, filename))
        if out is not None:
            test.append([out])
    return test

def calculate_acc(NN, train_set, validation_set):
    """Print the classification accuracy of ``NN`` on both data sets.

    Each sample is indexed as ``sample[0]`` (features passed to
    ``NN.forward``) and ``sample[1]`` (expected class index). A prediction is
    correct when the arg-max of the network output equals the label.
    """

    def percent_correct(samples):
        # share of samples whose predicted class matches the label, in percent
        hits = 0
        for sample in samples:
            prediction = np.argmax(NN.forward(sample[0]))
            if sample[1] - prediction == 0:
                hits += 1
        return hits * 100 / len(samples)

    print("Accuracy of Train set is {:.2f}%".format(percent_correct(train_set)))
    print()

    print("Accuracy of Validation set is {:.2f}%".format(percent_correct(validation_set)))
    

def process(train_set, validation_set):
    """Evaluate every pickled grid-search model on the train and validation sets.

    The grid is hidden-layer layout x activation function x learning rate x
    batch size. The models are expected in the working directory as
    ``model1``, ``model2``, ... in the order the grid is iterated. For each
    model the accuracies are printed, followed by its hyperparameters.

    NOTE: ``pickle.load`` can execute arbitrary code; only load model files
    that were produced by this project.
    """
    act_function = [[NeuralNetwork.sigmoid, NeuralNetwork.sigmoid_derivation, "sigmoid"],
                    [NeuralNetwork.tanh, NeuralNetwork.der_tanh, "tanh"]]
    # first entry: no hidden layer, second: 1 hidden layer, third: 2 hidden layers
    hid_lay_list = [[], [100], [10, 6]]
    lr_list = [0.01, 0.05]
    batch_list = [2, 4]
    counter = 1

    for hidden_size in hid_lay_list:
        # _act / _der_act are only needed by the training recipe below
        for _act, _der_act, a in act_function:
            for lr in lr_list:
                for b_s in batch_list:
                    # The models were originally produced with:
                    #   NN = NeuralNetwork(hidden_layers=hidden_size, activation_func=act,
                    #                      der_func=der_act)
                    #   np.random.shuffle(train_set)
                    #   NN.train2(train_set, 200, lr, batch_size=b_s)
                    #   pickle.dump(NN, open(("model" + str(counter)), "wb"))
                    with open("model" + str(counter), "rb") as model_file:
                        NN = pickle.load(model_file)
                    counter += 1

                    calculate_acc(NN, train_set, validation_set)
                    print(
                        "Model {} Hidden layer {}, Activation function {}, Epoch is 200, Learning Rate is {}, Batch size is {} ".format(
                            counter - 1, hidden_size, a, lr, b_s))
                    print("------------------------------------------------------------------------------")
                    print()


# One-off dataset caching step, kept for reference: it reads the raw images
# and pickles the resulting sets so later runs can load them directly.
"""train_set = read_and_store(path="data/seg_train/seg_train")
test_set = read_test()
validation_set = read_and_store(path="data/seg_dev/seg_dev")

pickle.dump(train_set,open("train_set", 'wb'))
pickle.dump(test_set,open("test_set", 'wb'))
pickle.dump(validation_set,open("validation", 'wb'))
exit(-1)"""

# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this project.
with open("train_set", "rb") as f:
    train_set = pickle.load(f)
with open("test_set", "rb") as f:
    test_set = pickle.load(f)

with open("validation", "rb") as f:
    validation_set = pickle.load(f)

# Named model_files rather than `list`, so the builtin list() stays usable
# for everything else that runs in this namespace.
model_files = ["model1_[]_0.001_2_sig", "model5_[100]_0001_2_sig", "model3_[10_6]_005_2_sig"]
for model_path in model_files:
    with open(model_path, "rb") as f:
        NN = pickle.load(f)
    print()
    calculate_acc(NN, train_set, validation_set)
    print("-------------------------------------------")




Accuracy of Train set is 39.31%

Accuracy of Validation set is 38.67%
-------------------------------------------
Accuracy of Train set is 45.12%

Accuracy of Validation set is 43.80%
-------------------------------------------
Accuracy of Train set is 35.78%

Accuracy of Validation set is 34.00%
-------------------------------------------


In [None]:

# Evaluate every pickled grid-search model ("model1", "model2", ...) on the
# train and validation sets loaded above.
process(train_set,validation_set)

# PART II: Classification of Natural Scenes using Neural Network


- Many models were created with different hyperparameters and stored in pickle files; the code here loads them. A single neural network implementation covers all three network types (single layer, 1 hidden layer, and 2 hidden layers).

## About dataset:

  - Images were read one by one, resized to 30x30, and then normalized with min-max scaling ((X - x_min) / (x_max - x_min)). Finally, each image was flattened into a 900-element vector.
    - The train set contains [vector, label] pairs; each vector has 900 features.
    
    
## About Neural Network:
- By default my code initializes a single-layer neural network. The weights and biases are initialized with small random values. Three arrays used by backpropagation are also defined (activation_outputs, derivation_outputs, bias_deltas).
- Two activation functions were used (tanh and sigmoid), and sigmoid gives better results.

- In general, my code above does not give good results (all three types of network give around 30% - 40% accuracy). Even with normalization and with overflow and underflow avoided, the results are not very good. Nevertheless, my network is learning.
- About the 1-hidden-layer neural network: with 100 nodes in the hidden layer it gives 43.8% accuracy. Of course, when the number of nodes is increased the accuracy also increases, but it is not worth the extra cost.

## About forward propagation:
    - The first activation output is the input; it is stored so that backpropagation can update the weights and biases. At each layer we calculate the next layer's input and take its activation output. At the end we return the softmax of the last activation.
    
## About back propagation:
     - The main objective of the project is here. Updating all the weight and bias values is cumbersome, and it took me a whole week. When we calculate the mini-batch error, we update all the weights using the derivative of the activation function. The activation outputs used in my code are explained in the forward propagation section.
     - After that we update the weights and bias values using the calculated derivation outputs.

#### About activation functions:
 - At first I wanted to use ReLU, and I had already implemented it. But it did not converge, so I looked into it. Many explanations say that ReLU is better than both sigmoid and tanh, but that is only one side of it: for negative inputs it is the worst. Some call this the dying ReLU problem.
 - I chose the sigmoid and tanh activation functions. Sigmoid gives slightly better results with a small batch size (2, 4); with a large batch size such as 128, sigmoid and tanh gave me the same accuracy and loss. After each forward propagation I returned the softmax.
 
 <hr><br>
 
#### Before starting analysis:
 - Cross entropy (negative log likelihood) was used as the loss function. Sum of squared errors was also tried, but it did not give me accurate results, so I removed it. Only the cross-entropy loss will be shown.
    
  - A smaller learning rate such as 0.01 with a decay rate of 0.99 gives better results than the others. This is because of oscillation. The learning rate is decreased after each epoch.
  - The main factor in improving the results is the batch size: when it is 1 or close to 1, it gives the highest accuracy. But in the plots I will show a high batch size. (Because I have no time and other lectures also have projects.)

 - I could not find the optimal hidden layer sizes for the 2-hidden-layer network, so I chose small ones (10 and 6 nodes).
 
- Note: The plots below show the configurations with the best accuracy, for example the best sigmoid model vs. the best tanh model.
    
### Activation functions :
    - Sigmoid:
<img src="m1.png">
    
    - Tanh:
 <img src="m2.png">

As you can see in the graphs above, the only difference is the activation function, and the results are better for sigmoid. Both use softmax, cross entropy, and 100 epochs.
 - The loss with sigmoid converges much faster than with the tanh activation function.
 - As you can see, tanh is affected very much by outliers.
 - If we change the number of epochs and other hyperparameters to optimize tanh, we may obtain better loss and accuracy.
 - With sigmoid, validation converges to good results after the 20th epoch.

But both sigmoid and tanh suffer from the vanishing gradient problem.

###  Layers:
         - Single Layer:
  <img src="single.png">       
         - One hidden layer:
  <img src="1_hid.png">
         - Two hidden layer:
<img src="two_h.png">

All of them use the same decay rate: lr *= 0.99

The single layer has 38% accuracy, which is really good for 100 epochs, and it does not overfit. Maybe it could reach 70% accuracy with some hyperparameter optimization, but a single layer can only solve linearly separable problems. So even if a single layer reached 99% accuracy, that would mean little for real-life problems.

The one-hidden-layer implementation gives the best results. 100 nodes are used here; if we give the hidden layer more nodes, such as 180, the accuracy increases slightly, but that is not worth the computation cost.

The last one contains 2 hidden layers: the first has 10 nodes and the second has 6 nodes. It gives good results and does not overfit.

### About Loss and Learning Rate:
    - With any learning rate, such as 0.1, 0.01, 0.005, 0.05 and so on, the loss always converges to a good or a bad value, but at a different speed.
    
### Finally:
- The results are really bad, but they can be improved by changing the hyperparameters. Choosing a different loss function and using softmax did not change the results much. But the batch size affects everything: a batch size of 1 gives the best accuracy, but I did not use 1 (it is not mini-batch).
- Increasing the number of epochs could be a good option, but I do not think it will reach 70% accuracy.