**Submission By: Ayesh Ahmad**

**CMS: 365966**

In [54]:
import numpy as np
import time

#1. Logistic Regression Single Prediction

In [55]:
def logistic_regression_single_prediction(w, x):
    """
    Predict the binary class of one sample with a logistic regression model.

    Args:
      w (ndarray): Weight vector (1-D array-like)
      x (ndarray): Feature vector (1-D array-like)

    Returns:
      y_pred (int): Binary prediction (0 or 1) for the given input.

    Raises:
      ValueError: If 'w' or 'x' is not one-dimensional.
    """
    w = np.asarray(w)
    x = np.asarray(x)
    if w.ndim != 1 or x.ndim != 1:
        raise ValueError("'w' and 'x' must be vectors.")

    z = np.dot(w, x)

    # Evaluate the sigmoid with a non-positive exponent in both branches so
    # that np.exp can never overflow, whatever the magnitude of z.
    if z >= 0:
        y_pred_prob = 1.0 / (1.0 + np.exp(-z))
    else:
        exp_z = np.exp(z)
        y_pred_prob = exp_z / (1.0 + exp_z)

    return 1 if y_pred_prob >= 0.5 else 0

# Quick check: identical weight and feature vectors give z = 1 + 4 + 9 = 14,
# so the predicted class is 1.
w = np.array([1, 2, 3])
x = w.copy()

test = logistic_regression_single_prediction(w=w, x=x)
print(test)

1


#2. Logistic Regression Vector Prediction

In [56]:
def logistic_regression_vector_prediction(w, X):
    """
    Compute the predicted probability of the positive class for every row of X.

    Args:
      w (ndarray): Weight vector (shape: (n_features,))
      X (ndarray): Feature matrix (shape: (n_samples, n_features))

    Returns:
      y_pred_prob (ndarray): Probabilistic prediction vector for the given input (shape: (n_samples,)).

    Raises:
      ValueError: If 'w' is not 1-D, 'X' is not 2-D, or their feature counts differ.
    """
    w = np.asarray(w)
    X = np.asarray(X)
    if w.ndim != 1 or X.ndim != 2 or w.shape[0] != X.shape[1]:
        raise ValueError("'w' must be a weight vector (shape: (n_features,)) and 'X' must be a feature matrix (shape: (n_samples, n_features)).")

    z = np.dot(X, w)

    # exp(-|z|) always lies in (0, 1], so it cannot overflow. For z >= 0 it
    # equals exp(-z), giving the usual 1 / (1 + exp(-z)); for z < 0 it equals
    # exp(z), and the algebraically identical form exp(z) / (1 + exp(z)) is used.
    exp_neg_abs = np.exp(-np.abs(z))
    y_pred_prob = np.where(z >= 0, 1.0 / (1.0 + exp_neg_abs), exp_neg_abs / (1.0 + exp_neg_abs))
    return y_pred_prob

# Score three samples at once; their logits are 0.5, 2.3 and 4.1.
w = np.array([0.5, 0.3, -0.2])
X = np.arange(1, 10).reshape(3, 3)
predictions = logistic_regression_vector_prediction(w=w, X=X)
print(predictions)

[0.62245933 0.90887704 0.9836975 ]


#3. Logistic Loss

In [57]:
def logistic_loss(w, X, y):
    """
    Compute the mean logistic (binary cross-entropy) loss of weights w on (X, y).

    The loss is evaluated directly from the logits z = X @ w rather than from
    the predicted probabilities, so it stays finite even when the predictions
    saturate at exactly 0 or 1.

    Args:
      w (ndarray): Weight vector (shape: (n_features,))
      X (ndarray): Feature matrix (shape: (n_samples, n_features))
      y (ndarray): Label vector (shape: (n_samples,))

    Returns:
      loss (float): Logistic Loss for the given input.

    Raises:
      ValueError: If 'w' is not 1-D, 'X' is not 2-D, or their feature counts differ.
    """
    w = np.asarray(w)
    X = np.asarray(X)
    y = np.asarray(y)
    if w.ndim != 1 or X.ndim != 2 or w.shape[0] != X.shape[1]:
        raise ValueError("'w' must be a weight vector (shape: (n_features,)) and 'X' must be a feature matrix (shape: (n_samples, n_features)).")

    z = np.dot(X, w)

    # With s = sigmoid(z):  -[y*log(s) + (1-y)*log(1-s)] == log(1 + exp(z)) - y*z.
    # np.logaddexp(0, z) evaluates log(1 + exp(z)) without overflow and without
    # ever taking log(0).
    loss = np.mean(np.logaddexp(0.0, z) - y * z)
    return loss

# Evaluate the loss on the three-sample toy problem.
w = np.array([0.5, 0.3, -0.2])
X = np.arange(1, 10).reshape(3, 3)
y = np.array([1, 0, 1])
loss = logistic_loss(w=w, X=X, y=y)
print(loss)

0.9620197653436594


#4. Gradient of Logistic Loss

In [58]:
def logistic_loss_gradient(w, X, y):
    """
    Gradient of the mean logistic loss with respect to the weights.

    Args:
      w (ndarray): Weight vector (shape: (n_features,))
      X (ndarray): Feature matrix (shape: (n_samples, n_features))
      y (ndarray): Label vector (shape: (n_samples,))

    Returns:
      gradient (ndarray): Gradient of Logistic Loss with respect to the weight vector (shape: (n_features,)).
    """
    # Per-sample error between predicted probability and true label.
    residual = logistic_regression_vector_prediction(w, X) - y
    n_samples = len(y)
    # Project the errors back onto the features and average over the samples.
    return X.T.dot(residual) / n_samples

# Evaluate the gradient on the three-sample toy problem.
w = np.array([0.5, 0.3, -0.2])
X = np.arange(1, 10).reshape(3, 3)
y = np.array([1, 0, 1])
gradient = logistic_loss_gradient(w=w, X=X, y=y)
print(gradient)

[1.04795    1.21962795 1.39130591]


#5. Gradient Descent Algorithm


In [59]:
def gradient_descent_logistic_regression(w, X, y, learning_rate, stopping_criterion, batch_size=None, max_iters=20):
    """
    Fit logistic regression weights with (mini-batch) gradient descent.

    Each iteration takes one gradient step computed on a batch of samples,
    then prints the loss measured on the full (X, y) in a small table.

    Args:
      w (ndarray): Initial weight vector (shape: (n_features,)). It is not modified.
      X (ndarray): Feature matrix (shape: (n_samples, n_features))
      y (ndarray): Label vector (shape: (n_samples,))
      learning_rate (float): Learning rate for gradient descent
      stopping_criterion (float): Stop once the absolute change in full-data loss
        between two consecutive iterations falls below this threshold
      batch_size (int): Size of the mini-batches for stochastic gradient descent. Default is None (vanilla gradient descent).
      max_iters (int): Maximum number of iterations. Default is 20.

    Returns:
      w_optimized (ndarray): Optimized weight vector.

    Raises:
      ValueError: If batch_size is not between 1 and n_samples.
    """
    n_samples = len(y)
    if batch_size is None:
        batch_size = n_samples
    if not 1 <= batch_size <= n_samples:
        raise ValueError(f"'batch_size' must be between 1 and the number of samples ({n_samples}), got {batch_size}.")

    # Work on a private copy. Integer weights are promoted to float because
    # the in-place update below cannot store float steps in an integer array.
    w_optimized = np.array(w, copy=True)
    if not np.issubdtype(w_optimized.dtype, np.floating):
        w_optimized = w_optimized.astype(float)
    prev_loss = np.inf

    # A full batch always contains every sample, so drawing (and copying) a
    # random permutation of the data on each iteration would be wasted work.
    use_full_batch = batch_size == n_samples

    print("\n+---------------+-------------------------------+")
    print(f"|\tEpoch\t|\t\tLoss\t\t|")
    print("+---------------+-------------------------------+")

    for epoch in range(1, max_iters + 1):
        if use_full_batch:
            X_batch, y_batch = X, y
        else:
            # Sample a fresh mini-batch without replacement for this step.
            indices = np.random.choice(n_samples, batch_size, replace=False)
            X_batch, y_batch = X[indices], y[indices]

        gradient = logistic_loss_gradient(w_optimized, X_batch, y_batch)
        w_optimized -= learning_rate * gradient

        # Progress is always reported on the full data set, not on the batch.
        current_loss = logistic_loss(w_optimized, X, y)
        print(f"|\t{epoch}\t|\t{current_loss}\t|")

        if abs(prev_loss - current_loss) < stopping_criterion:
            break
        prev_loss = current_loss

    print("+---------------+-------------------------------+")
    return w_optimized

# Toy problem and hyper-parameters shared by both optimizer demos
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
y = np.array([1, 0, 1])
learning_rate = 0.01
stopping_criterion = 1e-6

# Vanilla Gradient Descent: every update uses all samples
w_initial = np.array([0.5, 0.3, -0.2])
w_optimized = gradient_descent_logistic_regression(
    w_initial, X, y, learning_rate, stopping_criterion
)
print("Vanilla Gradient Descent - Final Weights:", w_optimized, "\n\n")

# Stochastic Gradient Descent: every update uses one randomly drawn sample
batch_size = 1
w_initial = np.array([0.5, 0.3, -0.2])
w_optimized = gradient_descent_logistic_regression(
    w_initial, X, y, learning_rate, stopping_criterion, batch_size=batch_size
)
print("Stochastic Gradient Descent - Final Weights:", w_optimized)


+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	0.9177997620048218	|
|	2	|	0.8776322597924787	|
|	3	|	0.8417320992010113	|
|	4	|	0.8102285185170045	|
|	5	|	0.7831385499494887	|
|	6	|	0.7603493923032522	|
|	7	|	0.7416155999228593	|
|	8	|	0.7265740855060026	|
|	9	|	0.7147752814186551	|
|	10	|	0.7057241847611401	|
|	11	|	0.6989225630912159	|
|	12	|	0.6939043447145256	|
|	13	|	0.6902594378088655	|
|	14	|	0.6876451162496515	|
|	15	|	0.6857870560354592	|
|	16	|	0.6844734413465866	|
|	17	|	0.6835455177141858	|
|	18	|	0.6828871770397896	|
|	19	|	0.6824151855124928	|
|	20	|	0.6820708422140909	|
+---------------+-------------------------------+
Vanilla Gradient Descent - Final Weights: [ 0.39098838  0.18247049 -0.3260474 ] 



+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	0.8090238895803048	|
|	2	|	0.8203225592970829	|
|	3	|	0.83094449812132

#6. Application of Logistic Regression

##i. Loading dataset and normalizing

In [62]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download (or read from the local cache) the 784-feature MNIST dataset.
# as_frame=False makes fetch_openml return NumPy arrays, not pandas objects.
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

# Feature matrix, and labels converted to integers
X, y = mnist.data, mnist.target.astype(int)

# Hold out everything beyond the first 50,000 shuffled examples as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=50000, random_state=42
)

# Standardize the features using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

print("Training set shape:", X_train_normalized.shape, y_train.shape)
print("Test set shape:", X_test_normalized.shape, y_test.shape)

  warn(


Training set shape: (50000, 784) (50000,)
Test set shape: (20000, 784) (20000,)


##ii. Creating a Logistic Regression Model

In [69]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelBinarizer

def logistic_regression_one_vs_rest(X_train, y_train, X_test, y_test, learning_rate, stopping_criterion, batch_size=None, max_iters=20):
    """
    Train one binary logistic regression per class (one-vs-rest) and evaluate each on the test set.

    The classes are the distinct labels found in y_train, in sorted order, so
    the function is not tied to ten classes or to labels that equal their index.

    Args:
      X_train (ndarray): Training feature matrix (shape: (n_train_samples, n_features))
      y_train (ndarray): Training label vector (shape: (n_train_samples,))
      X_test (ndarray): Test feature matrix (shape: (n_test_samples, n_features))
      y_test (ndarray): Test label vector (shape: (n_test_samples,))
      learning_rate (float): Learning rate for gradient descent
      stopping_criterion (float): Value indicating the threshold for stopping the algorithm
      batch_size (int): Size of the mini-batches for stochastic gradient descent. Default is None (vanilla gradient descent).
      max_iters (int): Maximum number of iterations. Default is 20.

    Returns:
      accuracies (list): List of accuracies for each class
      precisions (list): List of precisions for each class
      recalls (list): List of recalls for each class
      f1_scores (list): List of F-1 scores for each class
    """
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    # Sorted distinct labels; each one defines a "this class vs. the rest" problem.
    classes = np.unique(y_train)

    # Per-class evaluation metrics, in the same order as `classes`
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for cls in classes:
        print(f"\n\t\tTraining for class {cls}")

        # Binary targets: 1 for the current class, 0 for every other class
        y_train_class = (y_train == cls).astype(int)
        w_initial = np.zeros(X_train.shape[1])
        w_optimized = gradient_descent_logistic_regression(w_initial, X_train, y_train_class, learning_rate, stopping_criterion, batch_size, max_iters)

        # Predictions for current class, thresholded at probability 0.5
        y_pred_prob = logistic_regression_vector_prediction(w_optimized, X_test)
        y_pred_class = (y_pred_prob >= 0.5).astype(int)

        # Evaluation metrics for current class against the matching test labels
        y_test_class = y_test == cls
        accuracies.append(accuracy_score(y_test_class, y_pred_class))
        precisions.append(precision_score(y_test_class, y_pred_class))
        recalls.append(recall_score(y_test_class, y_pred_class))
        f1_scores.append(f1_score(y_test_class, y_pred_class))

    return accuracies, precisions, recalls, f1_scores

##iii. Logistic Regression using Vanilla Gradient Descent

In [76]:
# Hyper-parameters: batch_size=None selects full-batch (vanilla) gradient descent
learning_rate = 0.01
stopping_criterion = 1e-6
batch_size = None
max_iters = 20

# Time the complete one-vs-rest training and evaluation run
start_time = time.time()
accuracies, precisions, recalls, f1_scores = logistic_regression_one_vs_rest(
    X_train_normalized, y_train, X_test_normalized, y_test,
    learning_rate, stopping_criterion, batch_size, max_iters,
)
end_time = time.time()
execution_time = end_time - start_time

# Display results as one table row per class
print("\nExecution Time: ", execution_time, "\n")
print("+-------+------------+-----------+--------+-----------+")
print("| Class |  Accuracy  | Precision | Recall |  F1-Score |")
print("+-------+------------+-----------+--------+-----------+")
for i, (acc, prec, rec, f1) in enumerate(zip(accuracies, precisions, recalls, f1_scores)):
    print(f"|   {i}   |  {acc:.4f}    |  {prec:.4f}   | {rec:.4f} |  {f1:.4f}   |")
print("+-------+------------+-----------+--------+-----------+")


		Training for class 0

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	0.6771124251481307	|
|	2	|	0.6637359671670245	|
|	3	|	0.6524949581048886	|
|	4	|	0.6429595878897816	|
|	5	|	0.6347873860706502	|
|	6	|	0.6277098994911612	|
|	7	|	0.6215180113150275	|
|	8	|	0.6160487947420292	|
|	9	|	0.6111747831715824	|
|	10	|	0.6067956081431392	|
|	11	|	0.6028316296850545	|
|	12	|	0.5992191320981775	|
|	13	|	0.5959067090818533	|
|	14	|	0.592852538202656	|
|	15	|	0.5900223163273051	|
|	16	|	0.587387686087994	|
|	17	|	0.5849250282269338	|
|	18	|	0.582614527930618	|
|	19	|	0.5804394476093262	|
|	20	|	0.5783855562877406	|
+---------------+-------------------------------+

		Training for class 1

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	0.6805177569600246	|
|	2	|	0.6696428145028277	|
|	3	|	0.6602276771781133	|
|	4	|	0.652027571100754	|
|	5	|	0

##iv. Logistic Regression using Stochastic Gradient Descent

In [77]:
# Hyper-parameters: each update uses a random batch of 1000 training samples
learning_rate = 0.01
stopping_criterion = 1e-6
batch_size = 1000
max_iters = 20

# Time the complete one-vs-rest training and evaluation run
start_time = time.time()
accuracies, precisions, recalls, f1_scores = logistic_regression_one_vs_rest(
    X_train_normalized, y_train, X_test_normalized, y_test,
    learning_rate, stopping_criterion, batch_size, max_iters,
)
end_time = time.time()
execution_time = end_time - start_time

# Display results as one table row per class
print("\nExecution Time: ", execution_time, "\n")
print("+-------+------------+-----------+--------+-----------+")
print("| Class |  Accuracy  | Precision | Recall |  F1-Score |")
print("+-------+------------+-----------+--------+-----------+")
for i, (acc, prec, rec, f1) in enumerate(zip(accuracies, precisions, recalls, f1_scores)):
    print(f"|   {i}   |  {acc:.4f}    |  {prec:.4f}   | {rec:.4f} |  {f1:.4f}   |")
print("+-------+------------+-----------+--------+-----------+")


		Training for class 0

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	0.6777574451143046	|
|	2	|	0.6631691853061964	|
|	3	|	0.6504047092357471	|
|	4	|	0.6411404234361422	|
|	5	|	0.6327874479916622	|
|	6	|	0.626314230205495	|
|	7	|	0.6208479139342598	|
|	8	|	0.6156811573205712	|
|	9	|	0.6104755324909369	|
|	10	|	0.6063022830941204	|
|	11	|	0.6023525451539274	|
|	12	|	0.5987786259366692	|
|	13	|	0.5955858850621808	|
|	14	|	0.592603988216997	|
|	15	|	0.5895727337708618	|
|	16	|	0.5870638024119704	|
|	17	|	0.5843161452459757	|
|	18	|	0.5819338245503629	|
|	19	|	0.5798823441695566	|
|	20	|	0.5779653291389499	|
+---------------+-------------------------------+

		Training for class 1

+---------------+-------------------------------+
|	Epoch	|		Loss		|
+---------------+-------------------------------+
|	1	|	0.6800299257736239	|
|	2	|	0.66930088699236	|
|	3	|	0.6597198231750777	|
|	4	|	0.6523286850259823	|
|	5	|	0

#7. Results

Using a **learning rate of 0.01** over **20 Epochs** for both vanilla and stochastic gradient descent, the comparison is as follows:

| Gradient Descent Type | Execution Time (s) | Class | Accuracy | Precision | Recall | F1-Score |
|---|---|---|---|---|---|---|
| Vanilla | 62.28 |   0   |  0.7483    |  0.2790   | 0.9847 |  0.4348   |
|  |  |   1   |  0.6854    |  0.2622   | 0.9973 |  0.4152   |
|  |  |   2   |  0.7116    |  0.2582   | 0.9765 |  0.4084   |
|  |  |   3   |  0.7114    |  0.2596   | 0.9728 |  0.4099   |
|  |  |   4   |  0.6783    |  0.2210   | 0.9956 |  0.3617   |
|  |  |   5   |  0.6614    |  0.2088   | 0.9737 |  0.3438   |
|  |  |   6   |  0.7522    |  0.2862   | 0.9935 |  0.4444   |
|  |  |   7   |  0.7211    |  0.2772   | 0.9875 |  0.4329   |
|  |  |   8   |  0.6673    |  0.2180   | 0.9587 |  0.3552   |
|  |  |   9   |  0.6915    |  0.2406   | 0.9833 |  0.3866   |
| Stochastic | 11.35 |   0   |  0.7452    |  0.2764   | 0.9837 |  0.4316   |
|  |  |   1   |  0.6793    |  0.2584   | 0.9969 |  0.4105   |
|  |  |   2   |  0.7088    |  0.2563   | 0.9760 |  0.4060   |
|  |  |   3   |  0.7120    |  0.2602   | 0.9738 |  0.4107   |
|  |  |   4   |  0.6778    |  0.2207   | 0.9951 |  0.3613   |
|  |  |   5   |  0.6597    |  0.2079   | 0.9737 |  0.3427   |
|  |  |   6   |  0.7490    |  0.2836   | 0.9940 |  0.4413   |
|  |  |   7   |  0.7147    |  0.2728   | 0.9884 |  0.4275   |
|  |  |   8   |  0.6647    |  0.2168   | 0.9597 |  0.3537   |
|  |  |   9   |  0.6952    |  0.2427   | 0.9818 |  0.3892   |

**Execution Time**: SGD is significantly faster than vanilla GD (11.35 s versus 62.28 s). Both runs perform the same 20 updates per class, but each SGD update computes its gradient on a random batch of 1,000 samples instead of all 50,000 training samples, so every iteration is much cheaper.

**Accuracy**: The accuracy values for both vanilla GD and SGD are similar across different classes, with SGD sometimes performing slightly better. This suggests that both methods are effective for this classification task.

**Precision, Recall, and F1-Score**: The precision, recall, and F1-score values for both vanilla GD and SGD are consistent across different classes. There is no significant difference in performance between the two methods in terms of these metrics.

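**Overall Performance**: Both vanilla GD and SGD reach very similar results on the MNIST dataset. Accuracy is moderate (roughly 0.66–0.75) and recall is high (above 0.95), but precision is low (roughly 0.21–0.29), which pulls the F1-scores down to about 0.34–0.44; after only 20 iterations the one-vs-rest classifiers still produce many false positives. SGD reaches this level in far less time, making it the more efficient choice for large datasets.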

##Conclusion

In summary, based on this data, we can conclude that SGD is a preferable choice over vanilla GD for logistic regression on the MNIST dataset: it needs far less execution time while delivering similar accuracy and other metrics.