<a href="https://colab.research.google.com/github/arunraja-hub/Preference_Extraction/blob/code_de_dup/find_subnets_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
%tensorflow_version 2.x

!git clone https://github.com/arunraja-hub/Preference_Extraction.git

fatal: destination path 'Preference_Extraction' already exists and is not an empty directory.


In [0]:
from __future__ import print_function
import argparse
import os
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms.functional as TF
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.autograd as autograd
from torchsummary import summary

from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import concurrent.futures
import itertools
import os
import random
import sys
import time
import re
import io

import sys

sys.path.append('Preference_Extraction')
from imports_data import all_load_data

In [0]:
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Subnets Methods

In [0]:
"""
    Original code from What's hidden in a randomly weighted neural network? paper
    Implemented at https://github.com/allenai/hidden-networks
    Remove weigths-initialisation since it is not relevant for us
"""

class GetSubnet(autograd.Function):
    @staticmethod
    def forward(ctx, scores, k):
        # Get the supermask by sorting the scores and using the top k%
        out = scores.clone()
        _, idx = scores.flatten().sort()
        j = int((1 - k) * scores.numel())

        # flat_out and out access the same memory.
        flat_out = out.flatten()
        flat_out[idx[:j]] = 0
        flat_out[idx[j:]] = 1

        return out

    @staticmethod
    def backward(ctx, g):
        # send the gradient g straight-through on the backward pass.
        return g, None

class SupermaskConv(nn.Conv2d):
    def __init__(self, *args, k, **kwargs):
        super().__init__(*args, **kwargs)
        self.k = k

        # initialize the scores
        self.scores = nn.Parameter(torch.Tensor(self.weight.size()))
        nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5))

        # initialize the weights
        nn.init.uniform_(self.weight)
        
        # NOTE: turn the gradient on the weights off
        self.weight.requires_grad = False

    def forward(self, x):
        subnet = GetSubnet.apply(self.scores.abs(), self.k)
        w = self.weight * subnet
        x = F.conv2d(
            x, w, self.bias, self.stride, self.padding, self.dilation, self.groups
        )
        return x

class SupermaskLinear(nn.Linear):
    def __init__(self, *args, k, **kwargs):
        super().__init__(*args, **kwargs)
        self.k = k

        # initialize the scores
        self.scores = nn.Parameter(torch.Tensor(self.weight.size()))
        nn.init.kaiming_uniform_(self.scores, a=math.sqrt(5))

        nn.init.uniform_(self.weight)

        # NOTE: turn the gradient on the weights off
        self.weight.requires_grad = False

    def forward(self, x):
        subnet = GetSubnet.apply(self.scores.abs(), self.k)
        w = self.weight * subnet
        return F.linear(x, w, self.bias)
        return x

# NOTE: not used here but we use NON-AFFINE Normalization!
# So there is no learned parameters for your nomralization layer.
class NonAffineBatchNorm(nn.BatchNorm2d):
    def __init__(self, dim):
        super(NonAffineBatchNorm, self).__init__(dim, affine=False)

## Define Supermask Network

In [5]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device

device(type='cuda')

## Load Data

In [0]:
all_raw_data = all_load_data("Preference_Extraction/data/simple_env_1/")

In [0]:
activations = []
observations = []
preferences = []

for data in all_raw_data:
    for i in range(data.observation.shape[0]):
        observations.append(np.copy(data.observation[i]))
        activations.append(np.copy(data.policy_info["activations"][i]))
        preferences.append((data.policy_info['satisfaction'].as_list()[i] > -6).astype(int))

activations = np.array(activations)
observations = np.array(observations)
preferences = np.array(preferences)

## Define architecture

In [0]:
"""
    Class modified from above to become enseble of Qhead subnetwrosk
"""

class SuperMaskQNets(nn.Module):
    def __init__(self, k, q_head_index, q_means_stds):
        super(SuperMaskQNets, self).__init__()
        self.conv1 = SupermaskConv(in_channels=5, out_channels=16, kernel_size=3, stride=1, bias=True, k=k)
        self.conv2 = SupermaskConv(in_channels=16, out_channels=32, kernel_size=3, stride=2, bias=True, k=k)
        self.fc1 = SupermaskLinear(in_features=960, out_features=64, bias=True, k=k)
        self.fc2 = SupermaskLinear(in_features=64, out_features=3, bias=True, k=k)

        assert q_head_index < 3, 'Model has only 3 qHeads'
        self.qix = q_head_index
        self.qu_mu_s = q_means_stds

    def fwd_conv1(self, x):
        x = self.conv1(x)
        return F.relu(x)

    def fwd_conv2(self, x):
        x = self.fwd_conv1(x)
        x = self.conv2(x)
        return F.relu(x)

    def fwd_flat(self, x):
        x = self.fwd_conv2(x)
        return torch.flatten(torch.transpose(x, 1, 3), 1) # Pre-flattening transpose is necessary for TF-Torch conversion

    def fwd_fc1(self, x):
        x = self.fwd_flat(x)
        x = self.fc1(x)
        return F.relu(x)
    
    def fwd_fc2(self, x):
        x = self.fwd_fc1(x)
        return self.fc2(x)

    def forward(self, x):
        x = self.fwd_fc2(x)[: ,self.qix]
        x -= self.qu_mu_s[self.qix][0]
        x /= self.qu_mu_s[self.qix][1]
        return torch.sigmoid(x)

## Loading Weights

In [9]:
new_save_path = "Preference_Extraction/saved_model2"
restored_model = tf.keras.models.load_model(new_save_path)
restored_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncodingNetwork/conv2d (Conv (None, 12, 14, 16)        736       
_________________________________________________________________
EncodingNetwork/conv2d_1 (Co (None, 5, 6, 32)          4640      
_________________________________________________________________
flatten (Flatten)            (None, 960)               0         
_________________________________________________________________
EncodingNetwork/dense (Dense (None, 64)                61504     
_________________________________________________________________
dense (Dense)                (None, 3)                 195       
Total params: 67,075
Trainable params: 67,075
Non-trainable params: 0
_________________________________________________________________


In [0]:
original_weights=restored_model.get_weights()

In [0]:
def load_weights(model):
  model.conv1.weight.data = torch.from_numpy(np.transpose(original_weights[0]))
  model.conv1.bias.data = torch.from_numpy(original_weights[1])
  model.conv2.weight.data = torch.from_numpy(np.transpose(original_weights[2]))
  model.conv2.bias.data = torch.from_numpy(original_weights[3])
  model.fc1.weight.data = torch.from_numpy(np.transpose(original_weights[4]))
  model.fc1.bias.data = torch.from_numpy(original_weights[5])
  model.fc2.weight.data = torch.from_numpy(np.transpose(original_weights[6]))
  model.fc2.bias.data = torch.from_numpy(original_weights[7])
  model.to(device)

In [0]:
supermask_test_model = SuperMaskQNets(k=1, q_head_index=0, q_means_stds=[0, 1]).to(device)
load_weights(supermask_test_model)

## Test the weights loaded properly

In [0]:
# Comparing that the models have identical observations for identical images
tf_conv1_fn = tf.keras.models.Model(inputs=restored_model.input, outputs=restored_model.layers[0].output)
tf_conv2_fn = tf.keras.models.Model(inputs=restored_model.input, outputs=restored_model.layers[1].output)
tf_flt_fn = tf.keras.models.Model(inputs=restored_model.input, outputs=restored_model.layers[2].output)
tf_fc1_fn = tf.keras.models.Model(inputs=restored_model.input, outputs=restored_model.layers[3].output)

for i in range(len(all_raw_data[0].observation)):

    single_observation = np.array([all_raw_data[0].observation[i]])
    single_observation_torch = torch.Tensor(np.array([np.transpose(all_raw_data[0].observation[i])]))
    single_observation_torch = single_observation_torch.to(device)
    
    conv1_torch_out = np.transpose(supermask_test_model.fwd_conv1(single_observation_torch).detach().cpu().numpy())
    conv1_torch_out = conv1_torch_out.reshape(conv1_torch_out.shape[:-1])
    conv1_tf_out = tf_conv1_fn(single_observation)[0].numpy()
    np.testing.assert_allclose(conv1_torch_out, conv1_tf_out, rtol=.1)

    conv2_torch_out = np.transpose(supermask_test_model.fwd_conv2(single_observation_torch).detach().cpu().numpy())
    conv2_torch_out = conv2_torch_out.reshape(conv2_torch_out.shape[:-1])
    conv2_tf_out = tf_conv2_fn(single_observation)[0].numpy()
    np.testing.assert_allclose(conv2_torch_out, conv2_tf_out, rtol=.1)

    flt_torch_out = np.transpose(supermask_test_model.fwd_flat(single_observation_torch).detach().cpu().numpy())
    flt_torch_out = flt_torch_out.reshape(flt_torch_out.shape[:-1])
    tf_flt_out = tf_flt_fn(single_observation)[0].numpy()
    np.testing.assert_allclose(flt_torch_out, tf_flt_out, rtol=.1)

    fc1_torch_out = np.transpose(supermask_test_model.fwd_fc1(single_observation_torch).detach().cpu().numpy())
    fc1_torch_out = fc1_torch_out.reshape(fc1_torch_out.shape[:-1])
    fc1_tf_out = tf_fc1_fn(single_observation)[0].numpy()
    
    old_activations = all_raw_data[0].policy_info["activations"][i]

    np.testing.assert_allclose(fc1_torch_out, fc1_tf_out, rtol=.1)
    np.testing.assert_allclose(fc1_torch_out, old_activations, rtol=.1)
    np.testing.assert_allclose(old_activations, fc1_tf_out, rtol=.1)

## Create models for each q net head. And load weights

In [0]:
# Create dataset iterators
num_train = 50
num_val = 400
batch_size = 10
val_batch_size = 10

xs = np.rollaxis(observations, 3, 1) # Torch wants channel-first
ys = preferences
xs, ys = shuffle(xs, ys)

xs_tr = xs[:num_train]
ys_tr = ys[:num_train]
xs_val = xs[num_train:num_train+num_val]
ys_val = ys[num_train:num_train+num_val]

tr_data_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.Tensor(xs_tr), torch.Tensor(ys_tr)),
    batch_size=batch_size)

val_data_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.Tensor(xs_val), torch.Tensor(ys_val)),
    batch_size=val_batch_size)

In [15]:
def get_q_heads_mu_and_sigma(model, all_obs, num_obs):
    
    model.eval()

    all_obs = shuffle(all_obs)
    obs_to_pass = all_obs[:num_obs]

    obs_tensor = torch.Tensor(obs_to_pass)
    obs_tensor = obs_tensor.to(device)
    qheads_values = model.fwd_fc2(obs_tensor).detach().cpu().numpy()

    mu = qheads_values.mean(axis=0)
    s = qheads_values.std(axis=0)

    qheads_mu_s = {}
    for qix in range(len(mu)):
        qheads_mu_s[qix] = (mu[qix], s[qix])
    
    return qheads_mu_s

q_mu_s = get_q_heads_mu_and_sigma(supermask_test_model, xs, 10000)
q_mu_s

{0: (92.63736, 47.492996), 1: (68.10719, 50.8589), 2: (138.20757, 77.337135)}

In [0]:
K = 0.5

spmsk_model_q0 = SuperMaskQNets(k=K, q_head_index=0, q_means_stds=q_mu_s).to(device)
load_weights(spmsk_model_q0)

spmsk_model_q1 = SuperMaskQNets(k=K, q_head_index=1, q_means_stds=q_mu_s).to(device)
load_weights(spmsk_model_q1)

spmsk_model_q2 = SuperMaskQNets(k=K, q_head_index=2, q_means_stds=q_mu_s).to(device)
load_weights(spmsk_model_q1)

## Train models

In [0]:
"""
    Train/Test function for Randomly Weighted Hidden Neural Networks Techniques
    Adapted from https://github.com/NesterukSergey/hidden-networks/blob/master/demos/mnist.ipynb
"""

def train(model, device, train_loader, optimizer, criterion, verbose=False):
    
    train_loss = 0
    true_labels = []
    predictions = [] # labels
    outputs = [] # probabilities

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss
        output_value = output.detach().cpu().numpy()
        outputs.append(output)
        pred = (output_value > 0.5).astype(float)
        predictions.extend(pred)
        true_labels.extend(target.detach().cpu().numpy())

    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    outputs = np.array(outputs)
    train_loss /= len(train_loader.dataset)
    accuracy = np.sum(np.equal(predictions, true_labels)) / len(true_labels)
    auc = roc_auc_score(true_labels, predictions)

    return train_loss.item(), accuracy, auc


def test(model, device, criterion, test_loader):
    true_labels = []
    predictions = [] # labels
    outputs = [] # probabilities

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += criterion(output, target)

            output_value = output.detach().cpu().numpy()
            outputs.append(output)
            pred = (output_value > 0.5).astype(float)
            predictions.extend(pred)
            true_labels.extend(target.detach().cpu().numpy())

    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    outputs = np.array(outputs)
    test_loss /= len(test_loader.dataset)
    accuracy = np.sum(np.equal(predictions, true_labels)) / len(true_labels)
    auc = roc_auc_score(true_labels, predictions)

    return test_loss.item(), accuracy, auc

def run_model(model, num_epochs, verbose=False):
  # NOTE: only pass the parameters where p.requires_grad == True to the optimizer! Important!
  optimizer = optim.SGD(
      [p for p in model.parameters() if p.requires_grad],
      lr=0.1,
      momentum=0.9,
      weight_decay=0.0005,
  )

  criterion = nn.BCELoss().to(device)
  scheduler = CosineAnnealingLR(optimizer, T_max=14)

  train_accs = []
  train_aucs = []
  test_accs = []
  test_aucs = []

  for epoch in tqdm(range(num_epochs)):
      train_loss, train_accuracy, train_auc = train(model, device, tr_data_loader, optimizer, criterion, verbose=False)
      test_loss, test_accuracy, test_auc = test(model, device, criterion, val_data_loader)
      if verbose:
        print(f'Epoch {epoch}: train loss - {train_loss} / test loss {test_loss}')
      scheduler.step()

      train_accs.append(train_accuracy)
      train_aucs.append(train_auc)
      test_accs.append(test_accuracy)
      test_aucs.append(test_auc)

  print('Train accuracy: ', train_accs[-1])
  print('Test accuracy: ', test_accs[-1])

  print('Train AUC: ', train_aucs[-1])  
  print('Test AUC: ', test_aucs[-1])

num_epochs = 100
print()
run_model(spmsk_model_q0, num_epochs=num_epochs)
print()
run_model(spmsk_model_q1, num_epochs=num_epochs)
print()
run_model(spmsk_model_q2, num_epochs=num_epochs)

  0%|          | 0/100 [00:00<?, ?it/s]




100%|██████████| 100/100 [00:18<00:00,  5.28it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Train accuracy:  0.98
Test accuracy:  0.6825
Train AUC:  0.9821428571428572
Test AUC:  0.68919779286927



100%|██████████| 100/100 [00:18<00:00,  5.30it/s]
  1%|          | 1/100 [00:00<00:18,  5.29it/s]

Train accuracy:  1.0
Test accuracy:  0.745
Train AUC:  1.0
Test AUC:  0.6975806451612904



 25%|██▌       | 25/100 [00:04<00:13,  5.36it/s]