In [54]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [55]:
def normalize_feats(train_features, some_features):
    """
    Standardizes features using the per-feature mean and standard deviation of the training set.

    The statistics are always computed from ``train_features`` so that any other split
    (validation, test, a single sample) is scaled in exactly the same way as the training data.

    Parameters
    ----------
    train_features: A numpy array with the shape (N_train, d), where d is the number of features and N_train is the number of training samples.
    some_features: A numpy array with the shape (N_some, d), where d is the number of features and N_some is the number of samples to be normalized.

    Returns
    -------
    some_features_normalized: A numpy array with shape (N_some, d).
        Features that are constant in the training set (zero standard deviation) are only
        mean-centered, so the result never contains inf/NaN caused by a division by zero.
    """

    mean_train = train_features.mean(axis=0)
    std_train = train_features.std(axis=0)

    # A constant training feature has std == 0; dividing by it would produce inf/NaN.
    # Use a unit divisor for those columns so they are centered but left unscaled.
    safe_std = np.where(std_train == 0, 1.0, std_train)

    some_features_normalized = (some_features - mean_train) / safe_std

    return some_features_normalized


# Sanity check for normalize_feats on deterministic data.
# 29**13 and larger values do not fit in a 64-bit integer, so the result of the power depends
# on integer wrap-around. The dtype is pinned to int64 (as the other sanity-check cells do) so
# the fixture does not change with the platform's default integer width.
X_train = (np.arange(35, dtype='int64').reshape(5,7) ** 13) % 20
# A single (1, 7) row to be normalized with the training statistics.
X_some = np.arange(7).reshape(1,7) * 10
X_norm_some = normalize_feats(X_train, X_some)
print(X_norm_some)
assert np.array_equal(X_norm_some.round(3), np.array([[-1.081, 1.638, 1.809, 3.595, 21.39, 6.722, 26.03 ]]))

[[-1.08068645  1.6377625   1.8093114   3.5947353  21.38956553  6.72244562
  26.03046111]]


In [56]:
def e_term(x_batch, y_batch, a, b):
    """
    Evaluates the hinge-loss argument e_k = 1 - y_k * (a . x_k + b) for every sample in the batch.

    Parameters
    ----------
    x_batch: A numpy array with the shape (N, d), where d is the number of features and N is the batch size.
    y_batch: A numpy array with the shape (N, 1), where N is the batch size.
    a: A numpy array with the shape (d, 1), where d is the number of features. This is the weight vector.
    b: A scalar bias.

    Returns
    -------
    e_batch: A numpy array with shape (N, 1); entry k is positive when sample k contributes to the hinge loss.
    """

    # Raw classifier output for each sample, shape (N, 1).
    scores = x_batch @ a + b

    return 1 - y_batch * scores

# Sanity check for e_term on a small deterministic batch.
# Features: a (5, 7) matrix; the dtype is pinned to int64 before the power is taken.
x_batch_ = ((np.arange(35, dtype='int64').reshape(5,7) ** 13) % 20) / 7.
# NOTE(review): `x` is not referenced again in the visible cells and, unlike x_batch_, it does
# not pin the integer dtype -- confirm whether a later cell needs it.
x = np.arange(35).reshape(5,7) ** 13
# Labels as a (5, 1) column: -1 for the first three samples, +1 for the last two.
y_batch_ = (2. * (np.arange(5)>2) - 1.).reshape(-1,1)
# Weights as a (7, 1) column: 0.0, 0.2, ..., 1.2.
a_ = (np.arange(7)* 0.2).reshape(-1,1)
b_ = 0.1
e_batch_ = e_term(x_batch_, y_batch_, a_, b_)
print(e_batch_)

# Compare against the reference values after rounding to three decimals.
assert np.array_equal(e_batch_.round(3), np.array([[ 5.986],[ 7.043],[ 7.529],[-4.014],[-1.7  ]]))

[[ 5.98571429]
 [ 7.04285714]
 [ 7.52857143]
 [-4.01428571]
 [-1.7       ]]


In [86]:
def loss_terms_ridge(e_batch, a, lam):
    """
    Computes the two terms of the ridge-regularized hinge objective.

    Parameters
    ----------
    e_batch: A numpy array with the shape (N, 1), where N is the batch size. This is the output of the e_term function, and its kth element is e_k = 1 − y_k(a*x_k+b).
    a: A numpy array with the shape (d, 1), where d is the number of features. This is the weight vector.
    lam: A scalar representing the regularization coefficient 𝜆.

    Returns
    -------
    A numpy array of length 2 holding, in order:
    hinge_loss: The sum of the positive entries of e_batch divided by the total number of entries.
    ridge_loss: lam / 2 times the sum of the squared weights.
    """

    # Only samples with e_k > 0 contribute; the average is still taken over every entry.
    positive_part = e_batch[e_batch > 0]
    hinge_loss = positive_part.sum() / e_batch.size

    squared_norm = (a * a).sum()
    ridge_loss = squared_norm * lam / 2

    return np.array([hinge_loss, ridge_loss])

# Sanity check for loss_terms_ridge: a (35, 1) column of non-negative e values and a (7, 1) weight column.
e_batch_ = ((np.arange(35, dtype='int64').reshape(-1,1) ** 13) % 20) / 7.
a_ = (np.arange(7)* 0.2).reshape(-1,1)
lam_ = 10.

hinge_loss_1, reg_loss_1 = tuple(loss_terms_ridge(e_batch_, a_, lam_))
assert np.round(hinge_loss_1,3) == 1.114 and np.round(reg_loss_1,3) == 18.2

# Shifting every e value down by 1 changes only the hinge term; the ridge term depends on a_ and lam_ alone.
hinge_loss_2, reg_loss_2 = tuple(loss_terms_ridge(e_batch_-1., a_, lam_))
assert np.round(hinge_loss_2,3) == 0.412 and np.round(reg_loss_2,3) == 18.2

# Second fixture for loss_terms_ridge with hard-coded values: an (8, 1) weight column ...
a_1=np.array([[-0.84862344],
       [ 0.1468467 ],
       [ 0.59857371],
       [ 0.28044845],
       [-0.39028563],
       [ 0.09438289],
       [-0.06230917],
       [ 0.19013069]])
# ... and a 2-D (14, 8) e array rather than an (N, 1) column. loss_terms_ridge divides by
# e_batch.size, so the hinge term here is averaged over all 112 entries.
e_batch_1=np.array([[  78.69078901,   11.54573106, -147.75362779, -159.58174745,
        -154.77509551, -187.45958187,  126.09360714, -118.50997555],
       [  72.87386923,  249.41084942,  226.50586665,  -62.46264488,
         293.53719711,  259.44748645,  256.83737789,  158.72302443],
       [ 231.48074986,  115.94234727,  256.60601673,   21.12917349,
        -174.29941841,  -62.70852331, -195.02513899, -163.03887267],
       [ 268.2489063 , -186.96698792, -179.1372718 ,  229.16104564,
        -148.37690665, -171.63841744, -109.62947691,  196.52564384],
       [ 183.22640848,  -41.12448044, -104.36992412, -141.74909545,
         -45.33461071,  292.89233735,  172.36608688,   47.71746109],
       [ 165.96268017,  248.28000276, -165.22200816,  292.65417449,
        -191.53259981,    7.32711771,    7.87010528,   26.30780551],
       [ 235.63002344,   60.90012659, -196.92631629,  -80.84847228,
        -174.94622075,  -66.7009531 ,   -0.31161369,    0.76540232],
       [-178.95460674,   68.15733965,  -59.42549974,  297.16120174,
        -166.81883028,  200.27044571,  251.82169371,  -47.62192147],
       [  48.92229377,  263.01703237, -163.52243429,  172.92660644,
        -166.37022705,  241.10065631, -185.67946236,  -78.9472858 ],
       [ -73.45896188,  199.32790555,  161.7626694 ,  193.86516518,
         173.74507505,  284.47381358,  147.51900352,  256.08136337],
       [ -64.98602759, -124.46846695, -163.39067543,   70.50457939,
         283.0198504 ,   16.58967044,   31.75113891, -189.86563728],
       [ 210.90682666,  -85.88673686,  293.61977045,  -36.16159631,
          59.12057467,   51.90213059,   -1.069974  ,  189.14035868],
       [ 130.51098329,    0.40754881,  -98.84234826,   97.32848935,
         136.20810796,  139.03284829, -150.84256303, -133.68335755],
       [ 279.14315026,  -18.71370934,  144.8956974 ,  231.9627674 ,
         247.9520764 ,  107.11673333,   98.17318232,  141.54285806]])
# The regularization coefficient is passed as a 0-d numpy array instead of a Python float.
lam_1=np.array(7.39343468)
# Note: this rebinds hinge_loss_1 / reg_loss_1 from the previous check.
hinge_loss_1, reg_loss_1 = tuple(loss_terms_ridge(e_batch_1, a_1, lam_1))

# Printed for inspection only; no assertion is made on these values.
print(hinge_loss_1, reg_loss_1)

95.67536626785716 5.101213489707721


In [102]:
def a_gradient_ridge(x_batch, y_batch, e_batch, a, lam):
    """
    Gradient of the ridge-regularized hinge loss with respect to the weight vector.

    Parameters
    ----------
    x_batch: A numpy array with the shape (N, d), where d is the number of features and N is the batch size.
    y_batch: A numpy array with the shape (N, 1), where N is the batch size.
    e_batch: A numpy array with the shape (N, 1), where N is the batch size. This is the output of the e_term function, and its kth element is e_k = 1 − y_k(a*x_k+b).
    a: A numpy array with the shape (d, 1), where d is the number of features. This is the weight vector.
    lam: A scalar representing the regularization coefficient 𝜆.

    Returns
    -------
    grad_a: A numpy array with shape (d, 1): lam * a plus the batch-averaged hinge gradient.
    """

    batch_size = x_batch.shape[0]

    # Samples with e_k <= 0 contribute nothing; the others contribute -y_k * x_k.
    violating = e_batch > 0
    per_sample = -1 * y_batch * x_batch * violating

    # Average over the batch as a (1, d) row, then flip it into a (d, 1) column.
    hinge_part = per_sample.sum(axis=0, keepdims=True) / batch_size

    return (lam * a) + np.transpose(hinge_part)


# Performing sanity checks on your implementation
# Same deterministic (5, 7) features, (5, 1) labels and (7, 1) weights as the e_term check.
x_batch_ = ((np.arange(35, dtype='int64').reshape(5,7) ** 13) % 20) / 7.
y_batch_ = (2. * (np.arange(5)>2) - 1.).reshape(-1,1)
a_ = (np.arange(7)* 0.2).reshape(-1,1)
b_ = 0.1
lam_ = 10.
# Relies on e_term from an earlier cell being defined in the kernel.
e_batch_ = e_term(x_batch_, y_batch_, a_, b_)

grad_a_ = a_gradient_ridge(x_batch_, y_batch_, e_batch_, a_, lam_)
print(grad_a_)

# Compare against the reference gradient after rounding to three decimals.
assert np.array_equal(grad_a_.round(3), np.array([[ 0.314],[ 2.686],[ 5.057],[ 6.571],[ 8.657],[11.029],[12.829]]))

[[ 0.31428571]
 [ 2.68571429]
 [ 5.05714286]
 [ 6.57142857]
 [ 8.65714286]
 [11.02857143]
 [12.82857143]]


In [106]:
def b_derivative(y_batch, e_batch):
    """
    Derivative of the batch-averaged hinge loss with respect to the bias b.

    Parameters
    ----------
    y_batch: A numpy array with the shape (N, 1), where N is the batch size.
    e_batch: A numpy array with the shape (N, 1), where N is the batch size. This is the output of the e_term function, and its kth element is e_k = 1 − y_k(a*x_k+b).

    Returns
    -------
    der_b: A scalar; the sum of -y_k over the samples with e_k > 0, divided by N.
    """

    batch_size = y_batch.shape[0]

    # Only samples with a positive e term contribute to the derivative.
    violating = e_batch > 0
    total = np.sum(-1 * y_batch * violating)

    return total / batch_size

# Performing sanity checks on your implementation
# Same features, labels and weights as the earlier checks, but with the bias set to -5.
x_batch_ = ((np.arange(35, dtype='int64').reshape(5,7) ** 13) % 20) / 7.
y_batch_ = (2. * (np.arange(5)>2) - 1.).reshape(-1,1)
a_ = (np.arange(7)* 0.2).reshape(-1,1)
b_ = -5.
# Relies on e_term from an earlier cell being defined in the kernel.
e_batch_ = e_term(x_batch_, y_batch_, a_, b_)

grad_b_ = b_derivative(y_batch_, e_batch_)
print(grad_b_)

assert np.round(grad_b_, 3) == 0.2

0.2


In [108]:
def loss_terms_lasso(e_batch, a, lam):
    """
    Computes the two terms of the lasso-regularized hinge objective.

    Parameters
    ----------
    e_batch: A numpy array with the shape (N, 1), where N is the batch size. This is the output of the e_term function, and its kth element is e_k = 1 − y_k(a*x_k+b).
    a: A numpy array with the shape (d, 1), where d is the number of features. This is the weight vector.
    lam: A scalar representing the regularization coefficient 𝜆.

    Returns
    -------
    A numpy array of length 2 holding, in order:
    hinge_loss: The sum of the positive entries of e_batch divided by the total number of entries.
    lasso_loss: lam times the sum of the absolute values of the weights.
    """

    # Only samples with e_k > 0 contribute; the average is still taken over every entry.
    positive_part = e_batch[e_batch > 0]
    hinge_loss = positive_part.sum() / e_batch.size

    l1_norm = np.absolute(a).sum()
    lasso_loss = lam * l1_norm

    return np.array([hinge_loss, lasso_loss])

# Performing sanity checks on your implementation
# A (35, 1) column of non-negative e values and a (7, 1) weight column.
e_batch_ = ((np.arange(35, dtype='int64').reshape(-1,1) ** 13) % 20) / 7.
a_ = (np.arange(7)* 0.2).reshape(-1,1)
lam_ = 10.

# On failure the assertion message shows the rounded regularization term.
hinge_loss_1, reg_loss_1 = tuple(loss_terms_lasso(e_batch_, a_, lam_))
assert np.round(hinge_loss_1,3) == 1.114 and np.round(reg_loss_1,3) == 42.0, np.round(reg_loss_1,3)

# Shifting every e value down by 1 changes only the hinge term; the lasso term depends on a_ and lam_ alone.
hinge_loss_2, reg_loss_2 = tuple(loss_terms_lasso(e_batch_-1., a_, lam_))
assert np.round(hinge_loss_2,3) == 0.412 and np.round(reg_loss_2,3) == 42.0, np.round(reg_loss_2,3)

In [133]:
def a_gradient_lasso(x_batch, y_batch, e_batch, a, lam):
    """
    Computes the lasso-regularized loss sub-gradient w.r.t the weights vector.

    The L1 penalty is not differentiable at zero; the sub-gradient chosen there is 0,
    which is the value ``np.sign`` returns for a zero weight.

    Parameters
    ----------
    x_batch: A numpy array with the shape (N, d), where d is the number of features and N is the batch size.
    y_batch: A numpy array with the shape (N, 1), where N is the batch size.
    e_batch: A numpy array with the shape (N, 1), where N is the batch size. This is the output of the e_term function, and its kth element is e_k = 1 − y_k(a*x_k+b).
    a: A numpy array with the shape (d, 1), where d is the number of features. This is the weight vector.
    lam: A scalar representing the regularization coefficient 𝜆.

    Returns
    -------
    grad_a: A numpy array with shape (d, 1) and defined as the sub-gradient of the lasso-regularized loss function w.r.t the weights vector.
        A NaN weight yields a NaN entry instead of being silently treated as a zero weight.
    """

    N = x_batch.shape[0]

    # Hinge part: samples with e_k > 0 contribute -y_k * x_k, averaged over the batch
    # and transposed from a (1, d) row into a (d, 1) column.
    g = np.sum(-1 * y_batch * x_batch * (e_batch > 0), axis=0, keepdims=True).T / N

    # Sub-gradient of lam * |a|: -1, 0 or +1 per weight (NaN propagates).
    grad_a = (lam * np.sign(a)) + g

    return grad_a

# Performing sanity checks on your implementation
# Same deterministic (5, 7) features, (5, 1) labels and (7, 1) weights as the ridge gradient check.
x_batch_ = ((np.arange(35, dtype='int64').reshape(5,7) ** 13) % 20) / 7.
y_batch_ = (2. * (np.arange(5)>2) - 1.).reshape(-1,1)
a_ = (np.arange(7)* 0.2).reshape(-1,1)
b_ = 0.1
lam_ = 10.
# Relies on e_term from an earlier cell being defined in the kernel.
e_batch_ = e_term(x_batch_, y_batch_, a_, b_)

grad_a_lasso_ = a_gradient_lasso(x_batch_, y_batch_, e_batch_, a_, lam_)

# The first weight is exactly 0, so its entry carries no lam_ contribution.
assert np.array_equal(grad_a_lasso_.round(3), np.array([[ 0.314], [10.686], [11.057],[10.571], [10.657], [11.029], [10.829]]))