<a href="https://colab.research.google.com/github/aethelind/notebooks-misc/blob/main/most_simplified_aaai_melding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## submodular.py

In [187]:
import torch
from torch.autograd import Variable
import numpy as np
import scipy as sp
import scipy.sparse
import scipy.linalg


class ContinuousOptimizer(torch.autograd.Function):
    """
    pytorch module for differentiable submodular maximization. The forward pass
    computes the optimal x for given parameters. The backward pass differentiates
    that optimal x wrt the parameters.

    The budget (k=2) and the per-coordinate cap (0.95) are hard-coded: forward
    passes c=0.95 to the optimizer and backward passes 0.95 as max_x to
    get_dxdr, so the two literals have to be kept in sync by hand.
    """

    @staticmethod
    def forward(ctx, params):
        """
        Computes the optimal x using the supplied optimizer.

        params: tensor handed to optimize_coverage_multilinear as P.

        Returns x.data, i.e. the solution without autograd history.
        """
        # autograd.Function.forward runs with grad mode off; the inner
        # optimizer calls loss.backward() on x, so grad mode is re-enabled here.
        with torch.enable_grad():
            x = optimize_coverage_multilinear(P=params, verbose=True, k=2, c=0.95, minibatch_size=None)
        # x is saved as returned by the optimizer; backward reads x.grad from it
        ctx.save_for_backward(params, x)
        return x.data

    @staticmethod
    def backward(ctx, grad_output):
        """
        Differentiates the optimal x returned by the forward pass with respect
        to the ratings matrix that was given as input.

        grad_output: gradient of the downstream loss wrt x.

        Returns dxdr^T @ grad_output reshaped like params.
        """
        print("df(x(theta_hat), theta) / CO grad_output", grad_output) ###
        params, x = ctx.saved_tensors
        print("theta_hat / params", params) ###
        print("x(theta_hat) / x", x) ###
        # x.grad was left by the last backward() of the inner optimization loop,
        # which differentiates the *negated* objective (evaluated before that
        # iteration's step and projection); hence the sign flip below.
        xgrad = x.grad.data
        print("dx(theta_hat) / xgrad", xgrad) ###
        dxdr = ContinuousOptimizer.get_dxdr(x.data.detach().numpy(), -xgrad.detach().numpy(), params.detach().numpy(), dgrad_coverage, 0.95)
        print("dx(theta_hat)/dtheta_hat / dxdr", dxdr.shape) ###
        dxdr_t = torch.from_numpy(np.transpose(dxdr))
        print("dxdr_t", dxdr_t.shape) ###
        # chain rule: (num_params x n) @ (n x 1)
        out = torch.mm(dxdr_t.float(), grad_output.view(len(x), 1))
        print("dfdx*dxdr / CO out", out.view_as(params))
        return out.view_as(params)

    @staticmethod
    def get_dxdr(x, grad, params, get_dgradf_dparams, max_x):
        '''
        Returns the derivative of the optimal solution in the region around x in
        terms of the rating matrix r.

        The derivative is obtained by solving the linear system that results
        from differentiating the KKT conditions of
        max f(x) s.t. sum(x) <= k, 0 <= x <= max_x.

        x: an optimal solution (1-d numpy array of length n)

        grad: df/dx at x (1-d numpy array of length n)

        params: the current parameter settings; only forwarded to
            get_dgradf_dparams

        get_dgradf_dparams: callable (x, params, num_samples=...) returning an
            array of shape (n, d1, d2) with the partial derivatives of grad f
            wrt the parameters

        max_x: upper bound on every coordinate of x

        Returns a numpy array of shape (n, d1*d2).
        '''
        n = len(x)
        # first get the optimal dual variables via the KKT conditions
        # dual variable for constraint sum(x) <= k: average gradient over the
        # coordinates that are strictly between their bounds
        interior = np.logical_and(x > 0, x < max_x)
        if interior.any():
            lambda_sum = np.mean(grad[interior])
        else:
            lambda_sum = 0
        reduced_grad = grad - lambda_sum
        # dual variable for constraint x <= max_x (non-zero only where tight)
        lambda_upper = np.where(np.abs(x - max_x) < 0.000001, reduced_grad, 0)
        # dual variable for constraint x >= 0 (non-zero only where x is not > 0)
        lambda_lower = np.where(x > 0, 0, reduced_grad)
        # number of constraints
        m = 2*n + 1
        # collect value of dual variables
        lam = np.zeros(m)
        lam[0] = lambda_sum
        lam[1:(n+1)] = lambda_upper
        lam[n+1:] = lambda_lower
        diag_lambda = np.diag(lam)
        # collect value of constraints
        g = np.zeros(m)
        # TODO: replace the second x.sum() with k so that this is actually generally correct
        # (as written the budget constraint is always treated as tight)
        g[0] = x.sum() - x.sum()
        g[1:(n+1)] = x - max_x
        g[n+1:] = -x
        diag_g = np.diag(g)
        # gradient of constraints wrt x
        dgdx = np.zeros((m, n))
        # gradient of constraint sum(x) <= k
        dgdx[0, :] = 1
        coords = np.arange(n)
        # gradient of constraints x <= max_x
        dgdx[1 + coords, coords] = 1
        # gradient of constraints x >= 0 <--> -x <= 0
        dgdx[n + 1 + coords, coords] = -1
        # the Hessian matrix -- all zeros for now
        H = np.zeros((n, n))
        # coefficient matrix for the linear system (plain ndarrays; np.matrix
        # and np.bmat are no longer recommended by NumPy)
        A = np.block([[H, dgdx.T], [diag_lambda @ dgdx, diag_g]])
        # add 0.01*I to improve conditioning
        A = A + 0.01*np.eye(n+m)
        # RHS of the linear system, mostly partial derivative of grad f wrt params
        dgradf_dparams = get_dgradf_dparams(x, params, num_samples=1000)
        reshaped = np.zeros(
            (dgradf_dparams.shape[0], dgradf_dparams.shape[1]*dgradf_dparams.shape[2]))
        for i in range(n):
            reshaped[i] = dgradf_dparams[i].flatten()
        b = np.vstack([reshaped, np.zeros((m, reshaped.shape[1]))])
        # solution to the system
        derivatives = sp.linalg.solve(A, b)
        if np.isnan(derivatives).any():
            # diagnostic dump: which ingredient of the system contains NaNs
            print('report')
            print(np.isnan(A).any())
            print(np.isnan(b).any())
            print(np.isnan(dgdx).any())
            print(np.isnan(diag_lambda).any())
            print(np.isnan(diag_g).any())
            print(np.isnan(dgradf_dparams).any())
        # first n are derivatives of primal variables
        derivatives = derivatives[:n]
        return derivatives


## coverage.py

In [106]:
import torch
import numpy as np
from numba import jit


@jit
def gradient_coverage(x, P):
    """
    Gradient of the multilinear coverage objective with respect to x.

    x: 1-d array of length m (one decision variable per row of P).
    P: 2-d array with m rows and n columns; P[j, i] multiplies x[j] in the
       failure probability of column i.

    Returns a float32 array of length m with
    grad[j] = sum_i P[j, i] * prod_{l != j} (1 - x[l]*P[l, i]).
    """
    n = P.shape[1]
    m = len(x)
    grad = np.zeros(m, dtype=np.float32)
    for i in range(n):
        p_fail = 1 - x*P[:, i]
        p_all_fail = np.prod(p_fail)
        for j in range(m):
            if p_fail[j] != 0:
                # leave-one-out product via division (cheap path)
                grad[j] += P[j, i] * p_all_fail/p_fail[j]
            else:
                # x[j]*P[j, i] == 1 makes the division 0/0; multiply the
                # remaining factors explicitly instead
                others_fail = 1.0
                for other in range(m):
                    if other != j:
                        others_fail *= p_fail[other]
                grad[j] += P[j, i] * others_fail
    return grad


@jit
def objective_coverage(x, P):
    """
    Value of the multilinear coverage objective.

    For every column of P the term 1 - prod_j (1 - x[j]*P[j, column]) is
    added up; the sum over all columns is returned.
    """
    num_columns = P.shape[1]
    covered = 0
    for column in range(num_columns):
        miss_each = 1 - x*P[:, column]
        miss_all = np.prod(miss_each)
        covered += (1 - miss_all)
    return covered


class CoverageInstanceMultilinear(torch.autograd.Function):
    """
    Autograd wrapper around the numpy coverage objective for coverage
    probabilities P.

    forward evaluates objective_coverage(x, P); backward returns the gradient
    with respect to the decision variables x only (P receives None).
    """
    @staticmethod
    def forward(ctx, x, P):
        # keep both inputs around for the gradient evaluation in backward
        ctx.save_for_backward(x, P)
        x_np = x.detach().numpy()
        P_np = P.detach().numpy()
        value = objective_coverage(x_np, P_np)
        return torch.tensor(value).float()

    @staticmethod
    def backward(ctx, grad_in):
        x, P = ctx.saved_tensors
        x_np = x.detach().numpy()
        P_np = P.detach().numpy()
        dvalue_dx = torch.from_numpy(gradient_coverage(x_np, P_np))
        # scale by the incoming gradient; no gradient is propagated to P
        return dvalue_dx.float()*grad_in.float(), None


def optimize_coverage_multilinear(P, verbose=True, k=10, c=1., minibatch_size=None,
                                  num_iters=10, lr=0.1, momentum=0.9):
    '''
    Run projected gradient ascent (SGD with Nesterov momentum) for the
    coverage problem with given coverage probabilities P.

    P: tensor of coverage probabilities; one decision variable is created per
        row (P.shape[0]).
    verbose: if true, print the iteration index and objective value each step.
    k: budget; after every step x is projected so that its entries sum to k.
    c: per-coordinate cap; 1/c is forwarded as the scaling argument of
        project_uniform_matroid_boundary, which bounds every entry by c.
    minibatch_size: accepted for interface compatibility; not used here.
    num_iters: number of gradient steps (default 10, the previously
        hard-coded value). Callers that read x.grad afterwards need at least
        one iteration.
    lr, momentum: SGD hyper-parameters (defaults are the previously
        hard-coded 0.1 and 0.9). Nesterov momentum is enabled whenever
        momentum > 0, since torch rejects nesterov=True with zero momentum.

    Returns the leaf tensor x (requires_grad=True); x.grad holds the gradient
    of the negated objective from the last iteration.
    '''
    # decision variables
    x = torch.zeros(P.shape[0], requires_grad=True)
    # set up the optimizer
    optimizer = torch.optim.SGD([x], momentum=momentum, lr=lr, nesterov=momentum > 0)
    # take projected gradient steps
    for t in range(num_iters):
        # objective which will provide gradient evaluations (negated: SGD minimizes)
        loss = -CoverageInstanceMultilinear.apply(x, P)
        if verbose:
            print(t, -loss.item())
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        # project back onto the feasible set without recording the operation
        x.data = torch.from_numpy(project_uniform_matroid_boundary(x.data.numpy(), k, 1/c)).float()
        ### print("x", x) ###
    return x


@jit
def dgrad_coverage(x, P, num_samples):
    """
    Partial derivatives of the coverage gradient with respect to P.

    x: 1-d array of length m.
    P: 2-d array with m rows and n columns.
    num_samples: accepted for interface compatibility; not used here.

    Returns a float32 array of shape (m, m, n) where entry [j, k, i] is the
    derivative of grad[j] (see gradient_coverage) with respect to P[k, i]:
    prod_{l != j} (1 - x[l]*P[l, i]) for j == k, and
    -x[k] * P[j, i] * prod_{l != j, k} (1 - x[l]*P[l, i]) otherwise.
    """
    n = P.shape[1]
    m = len(x)
    dgrad = np.zeros((m, m, n), dtype=np.float32)
    for i in range(n):
        p_fail = 1 - x*P[:,i]
        p_all_fail = np.prod(p_fail)
        for j in range(m):
            for k in range(m):
                if j == k:
                    if p_fail[j] != 0:
                        dgrad[j, k, i] = p_all_fail/p_fail[j]
                    else:
                        # 0/0 otherwise: multiply the remaining factors explicitly
                        rest = 1.0
                        for other in range(m):
                            if other != j:
                                rest *= p_fail[other]
                        dgrad[j, k, i] = rest
                else:
                    denom = p_fail[j] * p_fail[k]
                    if denom != 0:
                        dgrad[j, k, i] = -x[k] * P[j, i] * p_all_fail/denom
                    else:
                        # a zero factor makes the division undefined; build the
                        # product over l not in {j, k} directly
                        rest = 1.0
                        for other in range(m):
                            if other != j and other != k:
                                rest *= p_fail[other]
                        dgrad[j, k, i] = -x[k] * P[j, i] * rest
    return dgrad

## utils.py

In [107]:
def project_uniform_matroid_boundary(x, k, c=1):
    '''
    Exact projection of Karimi et al. onto the scaled uniform-matroid boundary
    {y: 0 <= y <= 1/c, ||y||_1 = k}.

    The projection has the form clip(x - alpha*c, 0, 1/c); the routine sweeps
    the sorted breakpoints of that piecewise-linear map until the interval
    containing the right shift alpha is found.

    x: 1-d numpy array (left unmodified)
    k: required sum of the result
    c: inverse of the per-coordinate upper bound

    Raises Exception('projection did not terminate') when no interval matches.
    '''
    import numpy as np
    k *= c
    size = len(x)
    point = x.copy()
    # shift at which a coordinate reaches 0 / leaves the cap 1/c
    zero_knots = point/c
    cap_knots = (point*c - 1)/c**2
    knots = np.unique(np.concatenate((cap_knots, zero_knots)))
    level = size
    prev_knot = min(knots) - 1
    slope = 0
    for knot in knots:
        next_level = level + (knot - prev_knot)*slope
        if next_level < k <= level:
            # the target lies in this linear piece: interpolate the shift
            shift = (knot - prev_knot)*(level - k)/(level - next_level) + prev_knot
            projected = np.zeros((size))
            for j, (cap_at, zero_at) in enumerate(zip(cap_knots, zero_knots)):
                if cap_at > shift:
                    projected[j] = 1./c
                elif zero_at >= shift:
                    projected[j] = point[j] - shift*c
            return projected
        slope -= (cap_knots == knot).sum()*(c**2)
        slope += (zero_knots == knot).sum()*(c**2)
        level = next_level
        prev_knot = knot
    raise Exception('projection did not terminate')

In [1]:
# Clear out directory
# WARNING: this deletes every non-hidden file and folder in the current working
# directory. Only run it in a throw-away workspace (e.g. a fresh Colab session).
!rm -rf *
# Download data_decisions_benchmarks.zip and unzip diverse_recommendation_data.pickle
# (the archive is streamed from curl into `jar xv`, which extracts only the named member)
!curl https://bryanwilder.github.io/files/data_decisions_benchmarks.zip | jar xv benchmarks_release/diverse_recommendation_data.pickle
# Move diverse_recommendation_data.pickle to current directory
!mv benchmarks_release/diverse_recommendation_data.pickle .
# Remove empty directory
!rm -rf benchmarks_release
# Download hetrec2011-movielens-2k-v2.zip and unzip movie_actors.dat and user_ratedmovies.dat
!curl https://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-2k-v2.zip | jar xv movie_actors.dat user_ratedmovies.dat

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 83.0M  100 83.0M    0     0  20.8M      0  0:00:03  0:00:03 --:--:-- 20.8M
 inflated: benchmarks_release/diverse_recommendation_data.pickle
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0 inflated: movie_actors.dat
 20 17.9M   20 3808k    0     0  6201k      0  0:00:02 --:--:--  0:00:02 6201k inflated: user_ratedmovies.dat
100 17.9M  100 17.9M    0     0  15.5M      0  0:00:01  0:00:01 --:--:-- 15.5M


In [31]:
## recommendation_nn_decision.py
## movie problem
import numpy as np
import torch
import pickle
from functools import partial
import torch.nn as nn
import random

# Experiment configuration for the movie-recommendation problem.
# These are notebook-level globals read by the training cell.
num_layers = 1
activation = 'relu'
#k = 20
#use_hessian = False
num_iters = 100
instance_sizes = [100]
learning_rate = 1e-4

# Per-size containers, keyed by the entries of instance_sizes.
Ps = {}
data = {}
f_true = {}
for num_items in instance_sizes:
    # NOTE(security): pickle.load executes arbitrary code from the file; this
    # file is downloaded from the internet, so only use a trusted source.
    with open('diverse_recommendation_data' + '.pickle', 'rb') as f:
        # Ps_size takes on the actors 100x500 and data_size takes the users 100x2113
        Ps_size, data_size = pickle.load(f)

    # output / input widths of the network are taken from the first instance
    num_targets = Ps_size[0].shape[1]
    num_features = data_size[0].shape[1]
    Ps[num_items] = [torch.from_numpy(P).long() for P in Ps_size]
    data[num_items] = [torch.from_numpy(x).float() for x in data_size]
    # unit weight per target; stored next to each P but unpacked as `_` by the
    # training cell
    w = np.ones(num_targets, dtype=np.float32)
    f_true[num_items] = [(P, w) for P in Ps[num_items]]


num_repetitions = 0

train = {}
test = {}
for size in instance_sizes:
    # NOTE(review): this re-reads the same pickle as above, so train[size] and
    # test[size] receive the same two objects that were unpacked as
    # (Ps_size, data_size) -- they are not a train/test split of instances.
    with open('diverse_recommendation_data' + '.pickle', 'rb') as f:
        train[size], test[size] = pickle.load(f)


In [None]:
class CoverageInstanceMultilinear(torch.autograd.Function):
    """
    Coverage objective for coverage probabilities P as an autograd function
    (this cell re-declares the class from the coverage.py cell).

    forward returns objective_coverage(x, P) as a float tensor; backward
    returns the gradient with respect to x and None for P.
    """
    @staticmethod
    def forward(ctx, x, P):
        # stash the inputs; backward re-evaluates the gradient from them
        ctx.save_for_backward(x, P)
        objective = objective_coverage(x.detach().numpy(), P.detach().numpy())
        as_tensor = torch.tensor(objective)
        return as_tensor.float()

    @staticmethod
    def backward(ctx, grad_in):
        saved_x, saved_P = ctx.saved_tensors
        grad_np = gradient_coverage(saved_x.detach().numpy(), saved_P.detach().numpy())
        grad_x = torch.from_numpy(grad_np).float()*grad_in.float()
        # second slot is the (absent) gradient for P
        return grad_x, None

# Train

In [180]:
# Decision-focused training loop, currently wired up as a single-step debugging
# run: the `break` statements below stop after one optimisation step of one
# network, so the save / evaluation code after them is never reached.

# vals[idx, train_idx, test_idx]: evaluation result per repetition and size pair
vals = np.zeros((num_repetitions+30, len(instance_sizes), len(instance_sizes)))

for idx in range(num_repetitions, num_repetitions + 30):

    intermediate_size = 200
    def make_fc():
        """Build the prediction network from the notebook-level settings.

        With num_layers > 1: Linear -> activation, (num_layers-2) hidden
        Linear -> activation pairs, then Linear -> Sigmoid. Otherwise a single
        bias-free Linear layer with no output non-linearity.
        """
        if num_layers > 1:
            if activation == 'relu':
                activation_fn = nn.ReLU
            elif activation == 'sigmoid':
                activation_fn = nn.Sigmoid
            else:
                raise Exception(
                    'Invalid activation function: ' + str(activation))
            net_layers = [
                nn.Linear(num_features, intermediate_size), activation_fn()]
            for hidden in range(num_layers-2):
                net_layers.append(
                    nn.Linear(intermediate_size, intermediate_size))
                net_layers.append(activation_fn())
            net_layers.append(nn.Linear(intermediate_size, num_targets))
            net_layers.append(nn.Sigmoid())
            return nn.Sequential(*net_layers)
        else:
            # the sigmoid/bias variant is kept in the trailing comment
            return nn.Sequential(nn.Linear(num_features, num_targets, bias=False)) ##nn.Sequential(nn.Linear(num_features, num_targets), nn.Sigmoid())

    # runs the given net on instances of a given size
    def eval_opt(net, instances, size):
        """Return the mean true objective of the optimizer's solutions.

        Only len(instances) is used from `instances`; features and true
        coverage matrices are read from the globals data[size] and f_true[size].
        """
        net.eval()
        val = 0.
        for i in range(len(instances)):
            pred = net(data[size][i])
            x = ContinuousOptimizer.apply(pred)
            pp, _ = f_true[size][i] #audrey fix
            val += objective_coverage(x.detach().numpy(), pp.detach().numpy()) #audrey fix
        net.train()
        return val/len(instances)

    # train a network for each size, and test on each sizes
    for train_idx, train_size in enumerate(instance_sizes):
        net = make_fc()
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
        # training
        for t in range(num_iters):
            print(f"Iteration {t}")

            print("Get the model predictions...") ###
            # NOTE(review): i is drawn but never used -- instance 0 is always
            # selected below.
            i = random.randint(0, 80)
            y = data[train_size][0]
            ####
            init_weights = net.get_submodule("0").get_parameter('weight') ###
            #init_bias = net.get_submodule("0").get_parameter('bias') ###
            print("w / weights", init_weights)###
            #print("w / bias", init_bias)###
            ####
            pred = net(y)
            print("theta_hat / pred", pred) ###
            print("\n")###

            print("Get the optimal solution to the continous problem...")
            x = ContinuousOptimizer.apply(pred)
            print("x(theta_hat) / train x", x)###
            print("\n")###

            print("Get the objective value and set as the loss...")
            pp, _ = f_true[train_size][0]
            # negated because Adam minimizes while the objective is maximized
            loss = -CoverageInstanceMultilinear.apply(x, pp)
            print("f(x(theta_hat), theta) / train loss", loss)###
            print("\n")###

            print("Update model weights based on the computed gradients...")
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            # printed after backward but before the step: values are still the
            # pre-update weights, .grad is freshly populated
            print("initial weights", init_weights)###
            print("weights.grad", init_weights.grad)###
            #print("initial bias", init_bias)###
            #print("bias.grad", init_bias.grad)###
            print("\n")###

            print("Call optimizer.step()...")
            optimizer.step()
            weights = net.get_submodule("0").get_parameter('weight') ###
            #bias = net.get_submodule("0").get_parameter('bias') ###
            print("updated weights", weights)###
            print("weights.grad", weights.grad)###
            #print("updated bias", bias)###
            #print("bias.grad", bias.grad)###
            print("\n")###
            # debugging: stop after the first training iteration
            break
        # debugging: leave the size loop here; everything below in this loop
        # body is unreachable while this break is present
        break
        # save learned network state
        savepath = '/tmp/net_diffopt_smalllr_{0}_{1}_{2}_{3}.pt'.format(  #audrey fixes /tmp/
            train_size, 2, num_layers, idx)
        torch.save(net.state_dict(), savepath)
        # test on different sizes
        for test_idx, test_size in enumerate(instance_sizes):
            # NOTE(review): `test` is the dict keyed by size, so eval_opt would
            # iterate len(test) entries rather than the instances of test_size
            vals[idx, train_idx, test_idx] = eval_opt(
                net, test, test_size)
            print(vals[idx, train_idx, test_idx])
        # save out values
        print(idx, train_size, vals[idx, train_idx])
        with open('results_recommendation_' + str(num_layers) + '.pickle', 'wb') as f:
            pickle.dump(vals, f)
    # debugging: run a single repetition only
    break


Iteration 0
Get the model predictions...
w / weights Parameter containing:
tensor([[ 0.6078],
        [ 0.1487],
        [-0.2502],
        [-0.4783]], requires_grad=True)
theta_hat / pred tensor([[ 2.2490,  0.5502, -0.9257, -1.7698],
        [ 1.8235,  0.4461, -0.7506, -1.4350],
        [ 4.1940,  1.0260, -1.7264, -3.3004],
        [ 2.1274,  0.5204, -0.8757, -1.6741],
        [ 1.3372,  0.3271, -0.5504, -1.0523],
        [ 1.7019,  0.4164, -0.7006, -1.3393],
        [ 1.4588,  0.3569, -0.6005, -1.1480],
        [ 2.7352,  0.6691, -1.1259, -2.1525],
        [ 2.6137,  0.6394, -1.0759, -2.0568],
        [ 1.5804,  0.3866, -0.6505, -1.2436],
        [ 1.9451,  0.4758, -0.8006, -1.5306],
        [ 3.2823,  0.8030, -1.3511, -2.5829],
        [ 0.7294,  0.1784, -0.3002, -0.5740],
        [ 2.4921,  0.6097, -1.0258, -1.9611],
        [ 2.0666,  0.5056, -0.8507, -1.6263],
        [ 2.6137,  0.6394, -1.0759, -2.0568],
        [ 2.4313,  0.5948, -1.0008, -1.9133],
        [ 2.0058,  0.4907, -0

In [181]:
# Check the remainder of the chain rule by hand: multiply a gradient with
# respect to the predictions by the input features and compare the result with
# weights.grad (done in the following cells).
# NOTE(review): dfdx_dxdr is a hard-coded 26x4 tensor, presumably pasted from
# the "dfdx*dxdr / CO out" print of the training cell -- confirm before reuse.

y = data[train_size][0]
dfdx_dxdr = torch.tensor([[-2.2791e-03, -5.5432e-03, -9.9315e-03, -1.3192e-02],
        [-1.4681e-03, -3.5707e-03, -6.3976e-03, -8.4981e-03],
        [-1.6610e-03, -4.0398e-03, -7.2380e-03, -9.6145e-03],
        [-1.2060e-03, -2.9331e-03, -5.2552e-03, -6.9807e-03],
        [ 5.5862e-05,  1.3586e-04,  2.4343e-04,  3.2335e-04],
        [ 3.9903e-05,  9.7050e-05,  1.7388e-04,  2.3097e-04],
        [ 4.9290e-05,  1.1988e-04,  2.1479e-04,  2.8531e-04],
        [-1.7993e-03, -4.3762e-03, -7.8408e-03, -1.0415e-02],
        [-1.8993e-03, -4.6193e-03, -8.2763e-03, -1.0994e-02],
        [ 4.4102e-05,  1.0726e-04,  1.9218e-04,  2.5528e-04],
        [-1.3507e-03, -3.2851e-03, -5.8858e-03, -7.8183e-03],
        [-2.1911e-03, -5.3291e-03, -9.5480e-03, -1.2683e-02],
        [ 7.8698e-02,  6.4586e-02, -1.5494e-03, -6.6927e-02],
        [-2.0110e-03, -4.8911e-03, -8.7632e-03, -1.1640e-02],
        [-1.2506e-03, -3.0418e-03, -5.4498e-03, -7.2392e-03],
        [-1.8993e-03, -4.6193e-03, -8.2763e-03, -1.0994e-02],
        [-2.0719e-03, -5.0393e-03, -9.0287e-03, -1.1993e-02],
        [-1.2987e-03, -3.1587e-03, -5.6594e-03, -7.5176e-03],
        [-1.2506e-03, -3.0418e-03, -5.4498e-03, -7.2392e-03],
        [ 4.9290e-05,  1.1988e-04,  2.1479e-04,  2.8531e-04],
        [ 7.8698e-02,  6.4586e-02, -1.5495e-03, -6.6927e-02],
        [-1.7993e-03, -4.3762e-03, -7.8408e-03, -1.0415e-02],
        [-2.0719e-03, -5.0393e-03, -9.0287e-03, -1.1993e-02],
        [-2.4519e-03, -5.9635e-03, -1.0685e-02, -1.4193e-02],
        [ 5.2697e-02,  3.7223e-02, -9.4989e-02, -2.4443e-01],
        [-2.1367e-03, -5.1967e-03, -9.3108e-03, -1.2368e-02]])

In [184]:
# Element-wise product of the pasted gradient with the features.
# NOTE(review): presumably y is (26, 1) here and broadcasts over the 4 columns
# of dfdx_dxdr -- confirm against the data cell that was active for this run.
yo = dfdx_dxdr*y
yo

tensor([[-0.0084, -0.0205, -0.0367, -0.0488],
        [-0.0044, -0.0107, -0.0192, -0.0255],
        [-0.0115, -0.0279, -0.0499, -0.0663],
        [-0.0042, -0.0103, -0.0184, -0.0244],
        [ 0.0001,  0.0003,  0.0005,  0.0007],
        [ 0.0001,  0.0003,  0.0005,  0.0006],
        [ 0.0001,  0.0003,  0.0005,  0.0007],
        [-0.0081, -0.0197, -0.0353, -0.0469],
        [-0.0082, -0.0199, -0.0356, -0.0473],
        [ 0.0001,  0.0003,  0.0005,  0.0007],
        [-0.0043, -0.0105, -0.0188, -0.0250],
        [-0.0118, -0.0288, -0.0516, -0.0685],
        [ 0.0944,  0.0775, -0.0019, -0.0803],
        [-0.0082, -0.0201, -0.0359, -0.0477],
        [-0.0043, -0.0103, -0.0185, -0.0246],
        [-0.0082, -0.0199, -0.0356, -0.0473],
        [-0.0083, -0.0202, -0.0361, -0.0480],
        [-0.0043, -0.0104, -0.0187, -0.0248],
        [-0.0043, -0.0103, -0.0185, -0.0246],
        [ 0.0001,  0.0003,  0.0005,  0.0007],
        [ 0.0944,  0.0775, -0.0019, -0.0803],
        [-0.0081, -0.0197, -0.0353

In [185]:
# Sum over dim 0 (rows): the four totals displayed below equal the entries of
# weights.grad shown in the next cell.
yo.sum(dim=0)

tensor([ 0.0675, -0.1630, -0.6139, -1.0007])

In [186]:
# Gradient autograd stored on the layer weight, for comparison with the
# hand-computed column sums of `yo`.
weights.grad

tensor([[ 0.0675],
        [-0.1630],
        [-0.6139],
        [-1.0007]])

In [176]:
# Inspect the Adam optimizer: per-parameter state (step, exp_avg, exp_avg_sq)
# and the param-group hyper-parameters.
optimizer.state_dict()

{'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 0.0031],
           [-0.0094],
           [-0.0034],
           [-0.0029]]), 'exp_avg_sq': tensor([[9.4479e-07],
           [8.8546e-06],
           [1.1807e-06],
           [8.5244e-07]])}},
 'param_groups': [{'lr': 0.0001,
   'betas': (0.9, 0.999),
   'eps': 1e-08,
   'weight_decay': 0,
   'amsgrad': False,
   'maximize': False,
   'foreach': None,
   'capturable': False,
   'params': [0]}]}

In [138]:
# Display the current predictions.
# NOTE(review): the values shown all lie in (0, 1), which suggests this output
# comes from a run whose network ended in a Sigmoid, not from the bias-free
# linear layer built by the training cell -- confirm before relying on it.
pred

tensor([[0.3678, 0.7545, 0.9585, 0.3749],
        [0.3766, 0.7160, 0.9286, 0.4366],
        [0.3290, 0.8838, 0.9969, 0.1566],
        [0.3703, 0.7439, 0.9514, 0.3922],
        [0.3867, 0.6678, 0.8709, 0.5095],
        [0.3791, 0.7044, 0.9169, 0.4547],
        [0.3841, 0.6802, 0.8882, 0.4912],
        [0.3579, 0.7940, 0.9780, 0.3091],
        [0.3604, 0.7846, 0.9742, 0.3249],
        [0.3816, 0.6924, 0.9035, 0.4729],
        [0.3741, 0.7274, 0.9387, 0.4187],
        [0.3470, 0.8326, 0.9894, 0.2434],
        [0.3994, 0.6023, 0.7482, 0.5997],
        [0.3629, 0.7749, 0.9697, 0.3412],
        [0.3716, 0.7385, 0.9475, 0.4009],
        [0.3604, 0.7846, 0.9742, 0.3249],
        [0.3641, 0.7699, 0.9672, 0.3495],
        [0.3728, 0.7330, 0.9433, 0.4098],
        [0.3716, 0.7385, 0.9475, 0.4009],
        [0.3841, 0.6802, 0.8882, 0.4912],
        [0.3994, 0.6023, 0.7482, 0.5997],
        [0.3579, 0.7940, 0.9780, 0.3091],
        [0.3641, 0.7699, 0.9672, 0.3495],
        [0.3530, 0.8119, 0.9841, 0

In [126]:
# Raw gradient tensor accumulated on the layer weight.
init_weights.grad.data

tensor([[-5.7900],
        [-0.9961],
        [-0.6959],
        [-2.0019]])

In [132]:
# Attempt to walk the autograd graph starting from the weight parameter.
# As the output below shows, init_weights.grad_fn is None (the parameter is a
# leaf, not the result of an operation), so the .next_functions accesses fail
# with AttributeError.
# NOTE(review): to inspect the graph, start from a computed tensor such as the
# loss instead -- confirm which tensor was intended.
print('init_weights:')
print(init_weights.grad_fn)
print(init_weights.grad_fn.next_functions)
print(init_weights.grad_fn.next_functions[0][0].next_functions)
print(init_weights.grad_fn.next_functions[0][0].next_functions[0][0].next_functions)
print(init_weights.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0])

init_weights:
None


AttributeError: ignored

In [129]:
# Evaluates to None for the weight parameter, hence no output is displayed.
init_weights.grad_fn

In [101]:
##predictions match calculation
weights = net.get_submodule("0").get_parameter('weight')
bias = net.get_submodule("0").get_parameter('bias')
print("weights", weights)###
print("bias", bias)###
print("does prediction match pred sigmoid", torch.sigmoid(data[train_size][0]*weights[0]+bias[0]))
#print("does prediction match pred", data[train_size][0]*weights[0]+bias[0])
pred = net(data[train_size][0])
print("pred", pred)

weights Parameter containing:
tensor([[ 0.1862],
        [ 0.3591],
        [-0.9775],
        [-0.8398]], requires_grad=True)
bias Parameter containing:
tensor([-0.5743,  0.1472,  0.8371, -0.5253], requires_grad=True)
does prediction match pred sigmoid tensor([[0.5286],
        [0.4961],
        [0.6705],
        [0.5193],
        [0.4589],
        [0.4868],
        [0.4682],
        [0.5655],
        [0.5563],
        [0.4775],
        [0.5054],
        [0.6061],
        [0.4132],
        [0.5471],
        [0.5147],
        [0.5563],
        [0.5425],
        [0.5100],
        [0.5147],
        [0.4682],
        [0.4132],
        [0.5655],
        [0.5425],
        [0.5837],
        [0.3710],
        [0.5379]], grad_fn=<SigmoidBackward0>)
pred tensor([[0.5286, 0.8140, 0.0584, 0.0258],
        [0.4961, 0.7729, 0.1095, 0.0454],
        [0.6705, 0.9325, 0.0027, 0.0018],
        [0.5193, 0.8028, 0.0702, 0.0303],
        [0.4589, 0.7185, 0.2119, 0.0853],
        [0.4868, 0.7600, 0.1301, 0

In [68]:
# NOTE(review): pred is the output of the network, i.e. a non-leaf tensor; the
# truncated output below looks like the tail of PyTorch's warning about reading
# .grad on a non-leaf tensor. Call pred.retain_grad() before backward() if this
# gradient is needed -- confirm.
pred.grad

  return self._grad


In [2]:
## recommendation_nn_decision.py
## synthetic problem
import numpy as np
import torch
import pickle
from functools import partial
import torch.nn as nn
import random

# load probability matrix
# Hard-coded 0/1 coverage matrix with 26 rows (one per item, aligned with
# circuit_km) and 4 columns (targets).
P_list = [
[	0	,	1	,	0	,	1	],
[	0	,	1	,	0	,	0	],
[	1	,	1	,	0	,	1	],
[	0	,	1	,	0	,	0	],
[	0	,	0	,	0	,	0	],
[	0	,	0	,	0	,	0	],
[	0	,	0	,	0	,	0	],
[	0	,	1	,	0	,	1	],
[	0	,	1	,	0	,	1	],
[	0	,	0	,	0	,	0	],
[	0	,	1	,	0	,	0	],
[	1	,	1	,	0	,	1	],
[	0	,	0	,	1	,	0	],
[	0	,	1	,	0	,	1	],
[	0	,	1	,	0	,	0	],
[	0	,	1	,	0	,	1	],
[	0	,	1	,	0	,	1	],
[	0	,	1	,	0	,	0	],
[	0	,	1	,	0	,	0	],
[	0	,	0	,	0	,	0	],
[	0	,	0	,	1	,	0	],
[	0	,	1	,	0	,	1	],
[	0	,	1	,	0	,	1	],
[	1	,	1	,	0	,	1	],
[	0	,	0	,	1	,	0	],
[	0	,	1	,	0	,	1	],
]

# load features
# One scalar feature per item (26 values, same order as the rows of P_list).
circuit_km = [
3.7, 
3, 
6.9, 
3.5, 
2.2, 
2.8, 
2.4, 
4.5, 
4.3, 
2.6, 
3.2, 
5.4, 
1.2, 
4.1, 
3.4, 
4.3, 
4, 
3.3, 
3.4, 
2.4, 
1.2, 
4.5, 
4, 
4.9, 
0.25, 
3.9, 
]
# Wrap every value in its own list so that y becomes a 26 x 1 feature matrix.
y = []
for i in circuit_km:
  y.append([i])

# Experiment configuration for the synthetic problem; same global names as the
# movie-problem cell, so running this cell overrides that setup.
num_layers = 1
activation = 'relu'
#k = 2
#use_hessian = False
num_iters = 100
# a single pseudo "size" 0 is used as the dictionary key for the one instance
instance_sizes = [0]
learning_rate = 1e-4

Ps = {}
data = {}
f_true = {}

for num_items in instance_sizes:
    Ps_size = np.array(P_list)
    data_size = np.array(y)

    num_targets = Ps_size.shape[1] #500 --> 4
    num_features = data_size.shape[1] #2113 --> 1
    # single-instance lists, mirroring the list-of-instances layout of the
    # movie problem
    Ps[num_items] = [torch.from_numpy(Ps_size).long()]
    data[num_items] = [torch.from_numpy(data_size).float()]
    # unit weight per target, stored alongside each P
    w = np.ones(num_targets, dtype=np.float32)
    f_true[num_items] = [(P, w) for P in Ps[num_items]]
  
num_repetitions = 0

train = {}
test = {}
for size in instance_sizes:
  # NOTE(review): train receives the coverage matrix and test the feature
  # matrix -- these are not a train/test split of instances.
  train[size], test[size] = np.array(P_list), np.array(y)

In [5]:
# Sanity check: the single synthetic coverage matrix is 26 items x 4 targets.
Ps[0][0].shape

torch.Size([26, 4])