# Import libraries

In [None]:
# Import libraries
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 1. Load MNIST dataset

In [None]:
# Load the MNIST dataset
mnist = fetch_openml('mnist_784')

  warn(



# 2. Subset your data to use only class 0 and class 1 for the next steps.

In [None]:
# Convert data and labels to numpy arrays
X = np.array(mnist.data, dtype='float32')
y = np.array(mnist.target, dtype='int64')

# Subset the data to include only class 0 and class 1
X = X[(y == 0) | (y == 1)]
y = y[(y == 0) | (y == 1)]


#3. Standardize your dataset

In [None]:
def standardize_dataset(data):
    # Calculate the mean and standard deviation of each feature
    means = np.mean(data)
    stds = np.std(data)

    # Subtract the mean from each feature and divide by its standard deviation
    standardized_data = (data - means) / stds

    return standardized_data

In [None]:
# Standardize the dataset
X = standardize_dataset(X)

#4. Divide data into training and validation set

In [None]:
def train_val_test_split(X, y, train_size=0.6, val_size=0.2, test_size=0.2):

    # Convert proportions to sizes
    train_size = int(train_size * X.shape[0])
    val_size = int(val_size * X.shape[0])
    test_size = int(test_size * X.shape[0])

    # Split the data and labels into training, validation, and test sets
    X_train = X[:train_size]
    y_train = y[:train_size]
    X_val = X[train_size:train_size+val_size]
    y_val = y[train_size:train_size+val_size]
    X_test = X[train_size+val_size:]
    y_test = y[train_size+val_size:]

    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X, y, train_size=0.6, val_size=0.2, test_size=0.2)

In [None]:
def accuracy(y_pred, y_test):
    return (np.sum(y_pred==y_test)/len(y_test))*100

In [None]:
def f1_score(y_pred, y_test):
    # Calculate true positives, false positives, and false negatives
    tp = sum((a == 1 and b == 1) for (a, b) in zip(y_test, y_pred))
    fp = sum((a == 0 and b == 1) for (a, b) in zip(y_test, y_pred))
    fn = sum((a == 1 and b == 0) for (a, b) in zip(y_test, y_pred))

    # Calculate precision and recall
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall)

    return f1_score


In [None]:
X.shape

(14780, 784)

# 5. Implement logistic regression

In [None]:
class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iterations=30):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.n_iterations):
            # Calculate the model
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear_model)

            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear_model)
        y_pred_cls = [1 if i > 0.5 else 0 for i in y_pred]
        return np.array(y_pred_cls)

    def _sigmoid(self, x):
        return 1/(1 + np.exp(-x))

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_train_pred_lr = log_reg.predict(X_train)
y_val_pred_lr = log_reg.predict(X_val)

print(f'Training score:   {str(f1_score(y_train_pred_lr, y_train))}%')
print(f'Validation score: {str(f1_score(y_val_pred_lr, y_val))}%')

Training score:   0.9960075646144149%
Validation score: 0.9970995810505962%


# 6. Use L1 regularization with gradient descent optimizer.

In [None]:
class LogisticRegressionL1:
    def __init__(self, learning_rate=0.1, n_iterations=10, l1_lambda = 0.1):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.l1_lambda = l1_lambda

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for i in range(self.n_iterations):
            # Calculate the model
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear_model)

            dw = (2 * np.dot(X.T, (y_pred - y))) + self.l1_lambda * np.sign(self.weights)
            db = 2 * np.sum(y_pred - y) + self.l1_lambda * np.sign(self.bias)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            return self.cost_l1(X, y, y_pred)

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear_model)
        y_pred_cls = [1 if i > 0.5 else 0 for i in y_pred]
        return np.array(y_pred_cls)

    def _sigmoid(self, x):
        return 1/(1 + np.exp(-x))

    def cost_l1(self, X, y, y_pred):

      cost = np.mean((y - y_pred)**2) + self.l1_lambda * np.sum(np.abs(self.weights))
      return cost


In [None]:
lambdas = [0.1, 1]

for i in lambdas:
  log_reg_l1 = LogisticRegressionL1(l1_lambda = i)
  cost = log_reg_l1.fit(X_train, y_train)
  y_train_pred_l1 = log_reg_l1.predict(X_train)
  y_val_pred_l1 = log_reg_l1.predict(X_val)

  print(f' -- For lambda {i}')
  print(f'Training score:   {str(f1_score(y_train_pred_l1, y_train))}%')
  print(f'Validation score: {str(f1_score(y_val_pred_l1, y_val))}%')
  print(f'Cost:', cost)
  print(f'-----------------------------')

  return 1/(1 + np.exp(-x))


 -- For lambda 0.1
Training score:   0.9957877000842459%
Validation score: 0.9974176888315042%
Cost: 18595.52482217892
-----------------------------
 -- For lambda 1
Training score:   0.9957877000842459%
Validation score: 0.9974176888315042%
Cost: 185952.9982217892
-----------------------------


# 7. Use mini-batch gradient descent optimizer.

In [None]:
class LogisticRegressionMiniBatch:
    def __init__(self, learning_rate=0.001, n_iterations=10, batch_size=10):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.batch_size = batch_size

    def mini_batch_fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        num_batches = n_samples // self.batch_size

        for i in range(self.n_iterations):
            for j in range(num_batches):
                # Select a mini-batch
                start_idx = j * self.batch_size
                end_idx = start_idx + self.batch_size
                X_batch = X[start_idx:end_idx]
                y_batch = y[start_idx:end_idx]

                # Calculate the model
                linear_model = np.dot(X_batch, self.weights) + self.bias
                y_pred = self._sigmoid(linear_model)

                dw = (1/self.batch_size) * np.dot(X_batch.T, (y_pred - y_batch))
                db = (1/self.batch_size) * np.sum(y_pred - y_batch)

                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear_model)
        y_pred_cls = [1 if i > 0.5 else 0 for i in y_pred]
        return np.array(y_pred_cls)

    def _sigmoid(self, x):
        return 1/(1 + np.exp(-x))


In [None]:
batch_sizes = [5, 50, 500, 5000]
for b in batch_sizes:
  mini_batch = LogisticRegressionMiniBatch(batch_size=b)
  mini_batch.mini_batch_fit(X_train, y_train)
  y_train_pred_mb = mini_batch.predict(X_train)
  y_val_pred_mb = mini_batch.predict(X_val)
  print(f' -- For batch size {b}')
  print(f' Training score:   {str(f1_score(y_train_pred_mb, y_train))}%')
  print(f' Validation score: {str(f1_score(y_val_pred_mb, y_val))}%')
  print(f'-----------------------------')

 -- For batch size 5
 Training score:   0.9989478114478114%
 Validation score: 0.9993535875888817%
-----------------------------
 -- For batch size 50
 Training score:   0.9974763406940063%
 Validation score: 0.9987080103359173%
-----------------------------
 -- For batch size 500
 Training score:   0.9961130370837273%
 Validation score: 0.9970995810505962%
-----------------------------
 -- For batch size 5000
 Training score:   0.9960042060988433%
 Validation score: 0.9970977104159949%
-----------------------------


**Conclustion**
- With smaller batch size, the number of updates of the model parameters increases during each iteration, so the model converges to the optimal parameters faster during each iteration.

# 8. RMS Prob optimizer and Adam optimizer

In [None]:
class LogisticRegressionRMS:
    def __init__(self, learning_rate=0.001, n_iterations=30, beta=0.9, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.beta = beta
        self.epsilon = epsilon

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.cache_w = np.zeros(n_features)
        self.cache_b = 0

        for i in range(self.n_iterations):
            # Calculate the model
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear_model)

            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            self.cache_w = self.beta * self.cache_w + (1 - self.beta) * np.power(dw, 2)
            self.cache_b = self.beta * self.cache_b + (1 - self.beta) * np.power(db, 2)

            self.weights -= self.learning_rate * dw / (np.sqrt(self.cache_w) + self.epsilon)
            self.bias -= self.learning_rate * db / (np.sqrt(self.cache_b) + self.epsilon)

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear_model)
        y_pred_cls = [1 if i > 0.5 else 0 for i in y_pred]
        return np.array(y_pred_cls)

    def _sigmoid(self, x):
        return 1/(1 + np.exp(-x))


In [None]:
rms = LogisticRegressionRMS()
rms.fit(X_train, y_train)
y_train_pred_rms = rms.predict(X_train)
y_val_pred_rms = rms.predict(X_val)
print(f'-----------------------------')
print(f' Training score:   {str(f1_score(y_train_pred_rms, y_train))}%')
print(f' Validation score: {str(f1_score(y_val_pred_rms, y_val))}%')
print(f'-----------------------------')

-----------------------------
 Training score:   0.9956928248765627%
 Validation score: 0.9970995810505962%
-----------------------------


In [None]:
class LogisticRegressionADAM:
    def __init__(self, learning_rate=0.001, n_iterations=30, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.m_w = np.zeros(n_features)
        self.v_w = np.zeros(n_features)
        self.m_b = 0
        self.v_b = 0
        t = 0

        for i in range(self.n_iterations):
            # Calculate the model
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear_model)

            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            t += 1
            self.m_w = self.beta1 * self.m_w + (1 - self.beta1) * dw
            self.v_w = self.beta2 * self.v_w + (1 - self.beta2) * np.power(dw, 2)
            self.m_b = self.beta1 * self.m_b + (1 - self.beta1) * db
            self.v_b = self.beta2 * self.v_b + (1 - self.beta2) * np.power(db, 2)

            m_w_hat = self.m_w / (1 - np.power(self.beta1, t))
            v_w_hat = self.v_w / (1 - np.power(self.beta2, t))
            m_b_hat = self.m_b / (1 - np.power(self.beta1, t))
            v_b_hat = self.v_b / (1 - np.power(self.beta2, t))

            self.weights -= self.learning_rate * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
            self.bias -= self.learning_rate * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear_model)
        y_pred_cls = [1 if i > 0.5 else 0 for i in y_pred]
        return np.array(y_pred_cls)

    def _sigmoid(self, x):
        return 1/(1 + np.exp(-x))


In [None]:
adam = LogisticRegressionADAM()
adam.fit(X_train, y_train)
y_train_pred_adam = adam.predict(X_train)
y_val_pred_adam = adam.predict(X_val)
print(f'-----------------------------')
print(f' Training score:   {str(f1_score(y_train_pred_adam, y_train))}%')
print(f' Validation score: {str(f1_score(y_val_pred_adam, y_val))}%')
print(f'-----------------------------')

-----------------------------
 Training score:   0.9947512072223389%
 Validation score: 0.9964573268921095%
-----------------------------
