Loading Iris Dataset from Scikit-learn 

In [202]:
from sklearn import datasets

In [203]:
iris = datasets.load_iris()

In [204]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [205]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [206]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

 Creating X and y variables

In [207]:

X = iris['data'][:,(2,3)] # Petal length and Petal width
y = iris['target']

addind bias term to every instance

In [208]:

import numpy as np

X_with_bias = np.c_[np.ones([len(X),1]),X] 

In [209]:
np.random.seed(2042)

Splitting the data into train, Validation and test set

In [210]:
#Splitting the data into train, Validation and test set
test_ratio = 0.2
val_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(total_size*test_ratio)

val_size = int(total_size*val_ratio)

train_size = total_size-test_size-val_size
train_size

90

In [211]:
random_indexs = np.random.permutation(total_size)

X_train = X_with_bias[random_indexs[:train_size]]
y_train = y[random_indexs[:train_size]]
X_valid = X_with_bias[random_indexs[train_size:-test_size]]
y_valid = y[random_indexs[train_size:-test_size]]
X_test =  X_with_bias[random_indexs[-test_size:]]
y_test =  y[random_indexs[-test_size:]]


function to convert the vector of class indices into a matrix containing a one-hot vector for each instance:

In [212]:
#
def to_one_hot(y):
    n_classes=max(y)+1
    m=len(y)
    y_new = np.zeros((m,n_classes))
    y_new[np.arange(m),y] = 1
    return y_new

In [213]:
to_one_hot(y)

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [214]:
y_train_one_hot = to_one_hot(y_train)
y_valid_one_hot = to_one_hot(y_valid)
y_test_one_hot = to_one_hot(y_test)

Implementing Softmax function

In [215]:
def softmax(logits):
    exps = np.exp(logits)
    exps_sum=np.sum(exps,axis=1,keepdims=True)
    return exps/exps_sum

Training the model

In [216]:
eta = 0.01
n_iterations = 5001
epsilon = 1e-7
m=len(y)

n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)
n_outputs = len(np.unique(y_train))   # == 3 (3 iris classes)

Theta = np.random.randn(n_inputs, n_outputs)
 
for iteration in range(n_iterations):
    logits=X_train.dot(Theta)
    y_prob=softmax(logits)
    loss=-np.mean(np.sum(y_train_one_hot*np.log(y_prob+epsilon),axis=1))
    error=y_prob-y_train_one_hot
    if iteration%500 == 0:
                  print(iteration,loss)
    gradient=1/m*(X_train.T.dot(error))
    Theta=Theta-eta*gradient
    

0 5.446205811872683
500 0.9237846868880052
1000 0.7982644354884092
1500 0.711013522022268
2000 0.6482508900193986
2500 0.6012514121730151
3000 0.5646978938728316
3500 0.5353227419185083
4000 0.5110622700454638
4500 0.49056904383315625
5000 0.472933297880902


model parameters

In [217]:
Theta

array([[ 2.57558577, -0.64284974, -2.26169882],
       [-1.03422341, -0.00578853,  0.09030258],
       [-0.414509  ,  0.128879  ,  0.96674767]])

Predictions on Validation data

In [218]:
logits = X_valid.dot(theta)
y_proba = softmax(logits)
y_predict = np.argmax(y_proba,axis=1) 
accuracy = np.mean(y_predict==y_valid)

In [219]:
accuracy

0.8

Adding l2 regularization to the Algorithm

In [220]:
eta = 0.01
n_iterations = 5001
epsilon = 1e-7
m=len(y)
alpha=0.1 #Reguralization hyperparameter

n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)
n_outputs = len(np.unique(y_train))   # == 3 (3 iris classes)

Theta = np.random.randn(n_inputs, n_outputs)
 
for iteration in range(n_iterations):
    logits=X_train.dot(Theta)
    y_prob=softmax(logits)
    entropy_loss=-np.mean(np.sum(y_train_one_hot*np.log(y_prob+epsilon),axis=1))
    l2_loss=1/2*np.sum(np.square(Theta[1:]))
    loss=entropy_loss+alpha*l2_loss
    error=y_prob-y_train_one_hot
    if iteration%500 == 0:
                  print(iteration,loss)
    gradient=1/m*(X_train.T.dot(error))+np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta=Theta-eta*gradient
    

0 6.629842469083912
500 1.0265116608936835
1000 0.8561252304832059
1500 0.7624404750776342
2000 0.7053236628856812
2500 0.6673625976657003
3000 0.6404085039083317
3500 0.6203157319624816
4000 0.6047893204262049
4500 0.5924574286127462
5000 0.582446258278903


Prediction on validation data and accuracy

In [221]:
logits = X_valid.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.8666666666666667

Adding early stopping to the model

In [222]:
eta = 0.1
n_iterations = 5001
epsilon = 1e-7
m=len(X_train)
alpha=0.1
best_loss = np.infty

n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)
n_outputs = len(np.unique(y_train))   # == 3 (3 iris classes)

Theta = np.random.randn(n_inputs, n_outputs)
 
for iteration in range(n_iterations):
    logits=X_train.dot(Theta)
    y_prob=softmax(logits)
    entropy_loss=-np.mean(np.sum(y_train_one_hot*np.log(y_prob+epsilon),axis=1))
    l2_loss=1/2*np.sum(np.square(Theta[1:]))
    loss=entropy_loss+alpha*l2_loss
    error=y_prob-y_train_one_hot
    gradient=1/m*X_train.T.dot(error)+np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta=Theta-eta*gradient
    
    
    logits=X_valid.dot(Theta)
    y_prob=softmax(logits)
    entropy_loss=-np.mean(np.sum(y_valid_one_hot*np.log(y_prob+epsilon),axis=1))
    l2_loss=1/2*np.sum(np.square(Theta[1:]))
    loss=entropy_loss+alpha*l2_loss
    if iteration%500 == 0:
                  print(iteration,loss)
    if loss < best_loss:
        best_loss=loss
        
    else:
        print(iteration - 1, best_loss)
        print(iteration, loss, "early stopping!")
        break

0 4.7096017363419875
500 0.5739711987633519
1000 0.5435638529109127
1500 0.5355752782580262
2000 0.5331959249285544
2500 0.5325946767399383
2765 0.5325460966791898
2766 0.5325460971327975 early stopping!


prediction on validation data and accuracy/

In [223]:
logits = X_valid.dot(Theta)
y_proba = softmax(logits)

y_predict = np.argmax(y_proba,axis=1)
accuracy = np.mean(y_predict == y_valid)

In [224]:
accuracy

1.0

Final predictions on test data and accuracy

In [226]:
logits = X_test.dot(Theta)
y_proba = softmax(logits)

y_predict = np.argmax(y_proba,axis=1)
accuracy = np.mean(y_predict == y_test)

In [227]:
accuracy

0.9333333333333333