# Import

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Implement

In [2]:
class Softmax_Regression:
    def __init__(self, alpha=0.0001, max_iter=1000, l2=0.01, tol = 1e-4):
        ''' 
        Class constructor
        
        Parameters
        ----------
        alpha: the learning rate determines how big the step would be on each iteration.
        max_iter: number of times update weight
        l2: l2 regularization
        tol: weight threshold changes to stopping.
        '''
        self.alpha = alpha
        self.max_iter = max_iter
        self.l2 = l2
        self.tol = tol
        self.weight = None
    
    def softmax(self, z):
        '''
        Computes softmax function for each row of array S.
        '''
        A = np.exp(z - np.max(z, axis = 1, keepdims = True))
        A /= A.sum(axis = 1, keepdims=True)

        return A
    
    def one_hot_encoding(self, Y, num_classes):
        '''
        Y to one hot encoding
        
        Parameters
        ---------
        y: numpy array, shape (m, 1) 
        The vector of outputs.
        
        Return
        y: numpy array, shape (m, num_classes) 
        '''
        one_hot_Y = np.zeros((len(Y), num_classes))
        one_hot_Y[np.arange(len(Y)), Y.reshape(-1)] = 1
        return one_hot_Y
    
    def fit(self, X, y):
        '''
        Trains Softmax Regression on the dataset (X, y) using Stochastic Gradient Descent.
        
        Parameters
        ----------
        X : numpy array, shape (m, n)
        The matrix of inputs
        y : numpy array, shape (m, 1) 
        The vector of outputs.
        '''
            
        # First column of this matrix is all ones (corresponding to x_0).
        X = np.append(np.ones((X.shape[0], 1)), X, axis=1)
        m, n = X.shape
        
        # Initialize weights with shape (n + 1, num_classes) 
        np.random.seed(0) 
        limit = 1 / math.sqrt(n)
        W = [np.random.uniform(-limit, limit, (n, len(np.unique(y))))]
        
        # Checking for changes in weight after 20 iter 
        check_w_after = 20
        
        y = self.one_hot_encoding(y, len(np.unique(y)))

        for iter in range(1, self.max_iter + 1):
            # mix data 
            mix_id = np.random.permutation(m)
            for i in mix_id:
                w_new = W[-1]
                error = self.softmax(X[i].reshape(1,-1) @ W[-1]) - y[i]
                grad = X[i].reshape(1,-1).T.dot(error) + self.l2 * W[-1]
                w_new -= self.alpha * grad

                #Early stopping
                if iter%check_w_after == 0:                
                    if np.linalg.norm(w_new - W[-check_w_after]) < self.tol:
                        break 
                        
                W.append(w_new)
        self.weight = W[-1]
        
    def predict(self, X):
        '''
        Predict using the Softmax Regression model.
        
        Parameters
        ----------
        X : numpy array, shape (m, n)
        The matrix of inputs
        
        Return
        ----------
        Returns predicted values.
        '''
        # First column of this matrix is all ones (corresponding to x_0).
        X = np.append(np.ones((X.shape[0], 1)), X, axis = 1)    
        return np.argmax(self.softmax(X @ self.weight), axis = 1).reshape(-1,1)

In [3]:
def standardScaler(X):
    return (X - np.mean(X)) / np.std(X)

# Test

In [4]:
iris = datasets.load_iris()

X = iris.data
y = iris.target.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
X_test.shape

(45, 4)

In [6]:
model = Softmax_Regression()

X_train = standardScaler(X_train)
model.fit(X_train, y_train)

X_test = standardScaler(X_test)
y_pred = model.predict(X_test)

In [7]:
# No using l2
model = Softmax_Regression(alpha=0.1, max_iter=5000, l2=0)

X_train = standardScaler(X_train)
model.fit(X_train, y_train)

X_test = standardScaler(X_test)
y_pred = model.predict(X_test)

In [8]:
# Using l2 = 0.1
model = Softmax_Regression(alpha=0.1, max_iter=5000, l2=0.1)

model.fit(X_train, y_train)

y_pred1 = model.predict(X_test)

In [9]:
# Using l2 = 1
model = Softmax_Regression(alpha=0.1, max_iter=5000, l2=1)

model.fit(X_train, y_train)

y_pred2 = model.predict(X_test)

In [10]:
print('Score of using l2 = 0:', np.mean(y_pred == y_test))
print('Score of using l2 = 0.1:', np.mean(y_pred1 == y_test))
print('Score of using l2 = 1:', np.mean(y_pred1 == y_test))

Score of using l2 = 0: 1.0
Score of using l2 = 0.1: 0.9555555555555556
Score of using l2 = 1: 0.9555555555555556
