# Ridge Regression {-}

## 3.1 {-}

In [None]:
class L2NormPenaltyNode(object):
    """Graph node whose output is l2_reg * ||w||^2 (scalar l2_reg, vector w)."""

    def __init__(self, l2_reg, w, node_name):
        """
        Parameters:
        l2_reg: non-negative scalar regularization strength (a plain value, not a node)
        w: node whose w.out is a numpy vector
        node_name: node's name (a string)
        """
        self.w = w
        self.l2_reg = np.array(l2_reg)
        self.node_name = node_name
        # Both are filled in by forward(); backward() reads d_out.
        self.out = None
        self.d_out = None

    def forward(self):
        """Compute the penalty, reset this node's gradient accumulator to zeros
        of the same shape, and return the penalty."""
        weights = self.w.out
        squared_norm = weights @ weights
        self.out = self.l2_reg * squared_norm
        self.d_out = np.zeros(self.out.shape)
        return self.out

    def backward(self):
        """Add d_out * 2 * l2_reg * w to w.d_out (in place) and return d_out."""
        upstream_scale = self.d_out * 2 * self.l2_reg
        self.w.d_out += upstream_scale * self.w.out
        return self.d_out

    def get_predecessors(self):
        """Return the nodes this node reads its input from."""
        return [self.w]

\newpage

## 3.2 {-}

In [None]:
class SumNode(object):
    """Graph node whose output is a + b for two numpy arrays a and b."""

    def __init__(self, a, b, node_name):
        """
        Parameters:
        a: node whose a.out is a numpy array
        b: node whose b.out is a numpy array with the same shape as a.out
        node_name: node's name (a string)
        """
        self.node_name = node_name
        self.a, self.b = a, b
        # Both are filled in by forward(); backward() reads d_out.
        self.out = None
        self.d_out = None

    def forward(self):
        """Compute a.out + b.out, reset the gradient accumulator to zeros of
        the same shape, and return the sum."""
        total = self.a.out + self.b.out
        self.out = total
        self.d_out = np.zeros(total.shape)
        return total

    def backward(self):
        """Add d_out unchanged to both a.d_out and b.d_out (in place) and
        return d_out; the derivative of a sum w.r.t. each operand is 1."""
        upstream = self.d_out
        for operand in (self.a, self.b):
            operand.d_out += upstream
        return upstream

    def get_predecessors(self):
        """Return the two operand nodes, a first."""
        return [self.a, self.b]

\newpage

## 3.3 {-}

In [None]:
class RidgeRegression(BaseEstimator, RegressorMixin):
    """ Ridge regression with computation graph """
    def __init__(self, l2_reg=1, step_size=.005,  max_num_epochs = 5000):
        """
        Parameters:
        l2_reg: scalar regularization strength handed to the L2 penalty node
        step_size: stored on the estimator (its use is outside this block)
        max_num_epochs: stored on the estimator (its use is outside this block)
        """
        # Every constructor argument is kept as a same-named attribute.
        # Assuming BaseEstimator is scikit-learn's, get_params()/clone()/repr()
        # read each __init__ argument back via getattr, so a missing
        # self.l2_reg would make them fail.
        self.l2_reg = l2_reg
        self.max_num_epochs = max_num_epochs
        self.step_size = step_size

        # Build computation graph
        self.x = nodes.ValueNode(node_name="x") # to hold a vector input
        self.y = nodes.ValueNode(node_name="y") # to hold a scalar response
        self.w = nodes.ValueNode(node_name="w") # to hold the parameter vector
        self.b = nodes.ValueNode(node_name="b") # to hold the bias parameter (scalar)
        self.prediction = nodes.VectorScalarAffineNode(x=self.x, w=self.w, b=self.b,
                                                 node_name="prediction")
        self.square_loss = nodes.SquaredL2DistanceNode(a=self.prediction, b=self.y,
                                                 node_name="square loss")
        self.reg = nodes.L2NormPenaltyNode(l2_reg=l2_reg, w=self.w, node_name='l2 regularization')
        # objective = square loss + l2 penalty
        self.objective = nodes.SumNode(a = self.square_loss, b=self.reg, node_name = 'objective function')

        # Group nodes into types to construct computation graph function
        self.inputs = [self.x]
        self.outcomes = [self.y]
        self.parameters = [self.w, self.b]

        self.graph = graph.ComputationGraphFunction(self.inputs, self.outcomes,
                                                          self.parameters, self.prediction,
                                                          self.objective)

\newpage

<img src="3.1-ridge.png">

No regularization avg training loss:  0.031891987351410654

Regularization avg training loss:  0.20162615691882524

# MLP Implementation {-}

## 4.2.1 {-}

In [None]:
class AffineNode(object):
    """Node implementing affine transformation (W,x,b)-->Wx+b, where W is a matrix,
    and x and b are vectors
        Parameters:
        W: node for which W.out is a numpy array of shape (m,d)
        x: node for which x.out is a numpy array of shape (d)
        b: node for which b.out is a numpy array of shape (m) (i.e. vector of length m)

    backward() also accepts a W whose W.out is a 1-D array of shape (d); in
    that case W.out @ x.out is a scalar and the gradient for W is 1-D as well.
    """
    def __init__(self, W, x, b, node_name):
        """
        Parameters:
        W: node for which W.out is a numpy array of shape (m,d) (or shape (d))
        x: node for which x.out is a numpy array of shape (d)
        b: node for which b.out is a numpy array of shape (m)
        node_name: node's name (a string)
        """
        self.W = W
        self.x = x
        self.b = b
        self.node_name = node_name
        # Both are filled in by forward(); backward() reads d_out.
        self.out = None
        self.d_out = None

    def forward(self):
        """Compute W.out @ x.out + b.out, reset the gradient accumulator to
        zeros of the same shape, and return the result."""
        self.out = self.W.out @ self.x.out + self.b.out
        self.d_out = np.zeros(self.out.shape)
        return self.out

    def backward(self):
        """Accumulate gradients into W.d_out, x.d_out and b.d_out (in place)
        and return d_out.

        With g = d_out: dJ/dW = outer(g, x), dJ/dx = W^T g, dJ/db = g.
        """
        upstream = self.d_out
        weights = self.W.out

        # Both shape decisions below key off W's own shape, so the gradient
        # for W always has exactly the shape of W.out.
        if weights.ndim == 1:
            # W is a vector: the output is a single value, so W^T g reduces
            # to scaling W by that one upstream value.
            d_x = weights * upstream
        else:
            d_x = weights.T @ upstream

        # np.outer always returns a 2-D array; bring it back to W's shape
        # (this collapses the (1, d) result to (d) when W is a vector).
        d_W = np.outer(upstream, self.x.out).reshape(weights.shape)

        self.W.d_out += d_W
        self.x.d_out += d_x
        self.b.d_out += upstream
        return self.d_out

    def get_predecessors(self):
        """Return the nodes this node reads from, in the order W, x, b."""
        return [self.W, self.x, self.b]

\newpage

## 4.2.2 {-}

In [None]:
class TanhNode(object):
    """Graph node whose output is tanh(a), applied elementwise to the array a.
        Parameters:
        a: node whose a.out is a numpy array
    """

    def __init__(self, a, node_name):
        """
        Parameters:
        a: node whose a.out is a numpy array
        node_name: node's name (a string)
        """
        self.node_name = node_name
        self.a = a
        # Both are filled in by forward(); backward() reads out and d_out.
        self.out = None
        self.d_out = None

    def forward(self):
        """Compute tanh(a.out), reset the gradient accumulator to zeros of the
        same shape, and return the activations."""
        activations = np.tanh(self.a.out)
        self.out = activations
        self.d_out = np.zeros(activations.shape)
        return activations

    def backward(self):
        """Add d_out * (1 - tanh(a)^2) to a.d_out (in place) and return d_out."""
        # d/da tanh(a) = 1 - tanh(a)^2, and tanh(a) is cached in self.out.
        local_slope = 1 - self.out**2
        self.a.d_out += self.d_out * local_slope
        return self.d_out

    def get_predecessors(self):
        """Return the node this node reads its input from."""
        return [self.a]

\newpage

## 4.2.3 {-}

In [None]:
class MLPRegression(BaseEstimator, RegressorMixin):
    """ MLP regression with computation graph

    One hidden layer: prediction = w2 . tanh(W1 x + b1) + b2, trained by
    stochastic gradient descent on the squared distance between y and the
    prediction.
    """
    def __init__(self, num_hidden_units=10, step_size=.005, init_param_scale=0.01, max_num_epochs = 5000):
        """
        Parameters:
        num_hidden_units: number of rows of W1 / length of b1 and w2
        step_size: multiplier applied to each gradient in an SGD step
        init_param_scale: factor applied to the standard-normal draws used to
            initialize the parameters in fit()
        max_num_epochs: number of passes over the training set made by fit()
        """
        self.num_hidden_units = num_hidden_units
        # Honor the caller's value; this used to be hard-coded to 0.01, which
        # silently ignored the constructor argument.
        self.init_param_scale = init_param_scale
        self.max_num_epochs = max_num_epochs
        self.step_size = step_size

        # Build computation graph
        self.x = nodes.ValueNode(node_name="x") # to hold a vector input
        self.y = nodes.ValueNode(node_name="y") # to hold a scalar response
        self.b1 = nodes.ValueNode(node_name='b1')
        self.b2 = nodes.ValueNode(node_name='b2')
        self.W1 = nodes.ValueNode(node_name='W1')
        self.w2 = nodes.ValueNode(node_name='w2')
        self.L = nodes.AffineNode(W=self.W1, x=self.x, b=self.b1, node_name='L')
        # Give the activation node its own name; it used to reuse 'L', the
        # name of the affine node feeding it.
        self.h = nodes.TanhNode(a=self.L, node_name='h')
        self.prediction = nodes.AffineNode(W=self.w2, x=self.h, b=self.b2, node_name='prediction')
        self.objective = nodes.SquaredL2DistanceNode(self.y, self.prediction, node_name='objective')

        self.inputs = [self.x]
        self.outcomes = [self.y]
        self.parameters = [self.W1, self.b1, self.w2, self.b2]

        self.graph = graph.ComputationGraphFunction(self.inputs, self.outcomes,
                                                          self.parameters, self.prediction,
                                                          self.objective)

    def fit(self, X, y):
        """Train with SGD, one instance per step, for max_num_epochs epochs.

        Parameters:
        X: 2-D numpy array, one row per training instance
        y: numpy array of responses; it is flattened to 1-D

        Prints the average objective and average training loss every 50
        epochs. Returns self.
        """
        num_instances, num_ftrs = X.shape
        y = y.reshape(-1)

        # Small random initial values (not all 0) so the hidden units do not
        # start out identical.
        s = self.init_param_scale
        init_values = {"W1": s*np.random.randn(self.num_hidden_units,num_ftrs),
                       "b1": s*np.random.randn(self.num_hidden_units),
                       "b2": s*np.random.randn(1),
                       "w2": s*np.random.randn(self.num_hidden_units)
                        }

        self.graph.set_parameters(init_values)

        for epoch in range(self.max_num_epochs):
            shuffle = np.random.permutation(num_instances)
            epoch_obj_tot = 0.0
            for j in shuffle:
                obj, grads = self.graph.get_gradients(input_values = {"x": X[j]},
                                                    outcome_values = {"y": np.array([y[j]])})
                epoch_obj_tot += obj
                # Take step in negative gradient direction
                steps = {param_name: -self.step_size * grads[param_name]
                         for param_name in grads}
                # Apply the step exactly once per instance. This call used to
                # sit inside the loop that built `steps`, so parameters added
                # early were incremented again on every later iteration.
                self.graph.increment_parameters(steps)

            if epoch % 50 == 0:
                train_loss = np.sum((y - self.predict(X)) ** 2) / num_instances
                print("Epoch ",epoch,": Ave objective=",epoch_obj_tot/num_instances," Ave training loss: ",train_loss)

        return self

    def predict(self, X, y=None):
        """Return a 1-D array with one prediction per row of X.

        Parameters:
        X: 2-D numpy array, one row per instance
        y: ignored

        Raises:
        RuntimeError: if the estimator has no `graph` attribute
        """
        if not hasattr(self, "graph"):
            raise RuntimeError("You must train classifer before predicting data!")

        num_instances = X.shape[0]
        preds = np.zeros(num_instances)
        for j in range(num_instances):
            # The graph's prediction is presumably a size-1 array (b2 is
            # initialized with shape (1,)); .item() extracts the scalar
            # explicitly instead of relying on NumPy's deprecated implicit
            # size-1-array-to-scalar conversion on assignment.
            preds[j] = np.asarray(self.graph.get_prediction(input_values={"x":X[j]})).item()

        return preds

\newpage

<img src="4.2.3-mlp.png">

No features avg training loss:  0.21752394252822882

Featurized avg training loss:  0.10853926132523047

\newpage