In [4]:
import pandas as pd
import numpy as np

In [16]:
dataset = pd.read_csv('xor.dat', header = None)

In [19]:
dataset

Unnamed: 0,0,1,2
0,0.0,0.0,0
1,1.0,0.0,1
2,0.0,1.0,1
3,1.0,1.0,0


In [147]:
def get_parameters(dataset):
    train = dataset.sample(frac=1)
    X = train.drop(2, axis=1)
    Y = train[2]

    # One hot encoding for Y
    Y = pd.get_dummies(Y)

    num_classes = Y.shape[1]
    num_features = X.shape[1]
    N = X.shape[0]

    # Weights 
    W = np.random.rand(num_features, num_classes)
    b = np.zeros(num_classes)
    
    return X, Y, W, b, N, num_classes, num_features

In [148]:
def softmax(logits):
    '''
        Computes the softmax function value
    '''
    activation = []
    for index in range(0, len(logits)):
        exp_sum = 0
        for val in logits[index]:
            exp_sum += np.exp(val)
        activation.append([np.exp(value) / exp_sum for value in logits[index]])
        
    return activation

In [149]:
pre_activation.shape

(300, 2)

In [150]:
def compute_pre_act(input_x, weights, bias):
    '''
        Compute pre activation i.e. f = X*W + b
    '''
    
    return np.dot(input_x, weights) + bias

In [164]:
#XT = np.transpose(X)
X, Y, W, b, N, num_classes, num_weights = get_parameters(dataset)

# Compute activation in forward propogation
pre_activation = compute_pre_act(X, W, b)

p = softmax(pre_activation)

p

[[0.2988471024610757, 0.7011528975389243],
 [0.3819850083703233, 0.6180149916296768],
 [0.40813935339684804, 0.5918606466031521],
 [0.5, 0.5]]

In [165]:
# Gradient of loss wrt pre activation Wx + b.
grad_loss = np.subtract(p, Y)
grad_loss

Unnamed: 0,0,1
3,-0.701153,0.701153
2,0.381985,-0.381985
1,0.408139,-0.408139
0,-0.5,0.5


In [166]:
# Gradient for weights = XT * (1/N)grad_loss + learn_rate * W
XT = np.transpose(X)
learn_rate = 0.001

W_grad = np.dot(XT, np.divide(grad_loss, N)) + np.multiply(W, learn_rate)

W_grad

# Gradient for bias = 1/N * summation ()

array([[-0.07265623,  0.07422221],
       [-0.07950053,  0.08056455]])

In [178]:
def compute_loss(post_act, num_instances, learning_rate, Weights, Y_values):
    '''
        Compute loss = -1/N * (summation(log(post_act[i,Yi]))) + (learning_rate / 2) * (summation((Weights[d, k]) ^ 2)
    '''
    # First part without constant
    log_summation = 0
    for i in range(0, len(post_act)):
        log_summation += np.log(post_act[Y_values[i]])
        
    # Second part without constant    
    weights_summation = 0
    for d in range(0, len(Weights)):
        for k in range(0, len(Weights[0])):
            weights_summation += np.square(Weights[d][k])
            
    loss = log_summation * (-1/num_instances) + (learning_rate/2) * weights_summation
    
    return loss

In [168]:
def gradient_checking(analytical_grad):
    '''
        Compute the numerical gradient for each weight and check if difference is < 1e-4 for all.
    '''
    epsilon = 0.001
    
    log_p = np.multiply(Y, np.log(p))
    
    # For each row / data point, get loss with weight + epsilon and weight - epsilon
    for y_ind in range(0, len(log_p.shape[1])):
        L_plus = compute_loss(p[y_ind], N, learn_rate, np.add(W, epsilon))
        L_minus = compute_loss(p[y_ind], N, learn_rate, np.subtract(W, epsilon), Y.iloc[y_ind])
    
        numerical_grad = 0
    
    if np.abs(numerical_grad - analytical_grad) < (1 * np.power(10, -4)):
        return True

In [194]:
np.subtract(W, 0.001)

array([[0.59615905, 0.9678216 ],
       [0.29044213, 0.77157348]])

In [191]:
for y_ind in range(0, len(Y)):
    print(compute_loss(p[y_ind], N, learn_rate, W, Y.iloc[y_ind]))

0.3917016366703075
0.36189263119864157
0.356146180384105
0.34756210153667344


3    1
2    0
1    0
0    1
Name: 0, dtype: uint8

In [154]:
# 1b
dataset_spiral = pd.read_csv('spiral_train.dat', header=None)

In [155]:
X, Y, W, b, N, num_classes, num_weights = get_parameters(dataset_spiral)

# Compute activation in forward propogation
pre_activation = compute_pre_act(X, W, b)

p = softmax(pre_activation)

In [156]:
# Gradient of loss wrt pre activation Wx + b.
grad_loss = np.subtract(p, Y)
grad_loss

Unnamed: 0,0,1,2
24,-0.676437,0.326065,0.350373
278,0.367151,0.361796,-0.728947
93,-0.617728,0.399298,0.218430
159,0.296441,-0.723896,0.427454
18,-0.678117,0.319030,0.359087
5,-0.669465,0.328544,0.340921
235,0.352386,0.366542,-0.718928
286,0.347930,0.324548,-0.672478
299,0.293485,0.257899,-0.551384
181,0.276691,-0.736109,0.459417


In [158]:
# Gradient for weights = XT * (1/N)grad_loss + learn_rate * W
XT = np.transpose(X)
learn_rate = 0.001

W_grad = np.dot(XT, np.divide(grad_loss, N)) + np.multiply(W, learn_rate)

W_grad

array([[-0.0457932 , -0.0532089 ,  0.10069507],
       [ 0.07510256, -0.11419095,  0.04004541]])

In [211]:
np.multiply(Y, np.log(p)).shape[1]

2

In [201]:
np.log(p)

array([[-1.2078232 , -0.3550293 ],
       [-0.96237392, -0.48124256],
       [-0.89614661, -0.52448407],
       [-0.69314718, -0.69314718]])