In [1]:
# Print the working directory: the relative paths used further down
# (../autotrain, ./data, ./autotrain-run) are resolved against it.
!pwd

/home/jupyter-skenjeye@broadinst-05974/AutoTrain/experiments


In [9]:
# buggy, some kernel issue
!python -m pip install -e ../autotrain

Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///home/jupyter-skenjeye%40broadinst-05974/AutoTrain/autotrain
Installing collected packages: autotrain
  Attempting uninstall: autotrain
    Found existing installation: autotrain 0.0.1
    Uninstalling autotrain-0.0.1:
      Successfully uninstalled autotrain-0.0.1
  Running setup.py develop for autotrain
Successfully installed autotrain


In [1]:
import matplotlib
import matplotlib.pyplot as plt

import torch

import numpy as np
import pandas as pd

import gym

import autotrain
import autotrain.gym_env 
import autotrain.agent.replay_memory as replay_memory

# Device used for both the classifier and the agent networks.
# Prefer GPU index 3 (the original hard-coded choice); fall back to the CPU
# when CUDA is unavailable or the host exposes fewer than four GPUs, so the
# notebook still runs on other machines.
DEVICE = torch.device(
    "cuda:3" if torch.cuda.is_available() and torch.cuda.device_count() > 3 else "cpu"
)

ModuleNotFoundError: No module named 'autotrain'

In [31]:
# agent import
import random
import math
from itertools import count
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms

import autotrain.agent.dqn as dqn
import autotrain.agent.replay_memory as replay_memory

from tqdm.notebook import tqdm

# Agent Definitions

In [15]:
class DQN_MLP(nn.Module):
    """Three-layer MLP mapping a flat observation to one Q-value per action.

    Args:
        inputs: size of the flat observation vector.
        outputs: number of discrete actions.

    The head is a plain linear layer. Q-values are unbounded expected
    returns, so they must not be passed through a softmax: that would force
    them into [0, 1] summing to 1 and make the TD target unrepresentable.
    Softmax has no parameters, so the state_dict layout is unchanged.
    """

    def __init__(self, inputs, outputs):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(inputs, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, outputs),
        )

    def forward(self, x):
        """Return raw Q-values of shape (batch, outputs) for input (batch, inputs)."""
        return self.net(x)

In [16]:
# DQN hyper-parameters read by AutoTrainAgent below.
BATCH_SIZE = 128     # transitions sampled from replay memory per optimisation step
GAMMA = 0.999        # discount applied to the bootstrapped next-state value
EPS_START = 0.9      # exploration probability before any action has been selected
EPS_END = 0.05       # exploration probability the schedule decays towards
EPS_DECAY = 200      # decay constant (in action selections) of the exponential schedule
TARGET_UPDATE = 10   # copy policy_net into target_net every this many episodes

"""
agent TODO:
     - saving agent state
"""

class AutoTrainAgent:
    """DQN agent (policy/target network pair + replay memory) for an AutoTrain env.

    Args:
        env: environment exposing ``observation_space_dim``,
            ``action_space_dim``, ``reset()`` and ``step(action)``.
        device: torch device on which networks and transitions live.
    """

    def __init__(self, env, device):

        self.env = env
        self.device = device

        self.observation_dim = env.observation_space_dim

        # Get number of actions from gym action space
        self.n_actions = env.action_space_dim

        self.policy_net = dqn.DQN_MLP(self.observation_dim, self.n_actions).to(device)
        self.target_net = dqn.DQN_MLP(self.observation_dim, self.n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = replay_memory.ReplayMemory(10000)

        # Number of actions selected so far; drives the epsilon schedule.
        self.steps_done = 0

    def _as_state_tensor(self, observation):
        """Flatten an environment observation into a (1, dim) float32 tensor.

        Replay memory entries are later joined with ``torch.cat`` and fed to
        the networks, so every stored state must be a batched tensor on
        ``self.device``.

        NOTE(review): assumes the observation is either a tensor or a
        sequence of array-likes (the original code concatenated it with
        ``np.concatenate``) -- confirm against the environment.
        """
        if torch.is_tensor(observation):
            return observation.detach().to(device=self.device, dtype=torch.float32).reshape(1, -1)
        flat = np.concatenate([np.ravel(part) for part in observation], axis=0)
        return torch.as_tensor(flat, dtype=torch.float32, device=self.device).unsqueeze(0)

    def select_action(self, state):
        """Pick an action epsilon-greedily; returns a (1, 1) long tensor."""
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # max(1) returns (values, indices) per row; the index of the
                # largest Q-value is the greedy action.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        """Run one gradient step on policy_net from a sampled replay batch.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = replay_memory.Transition(*zip(*transitions))

        # Mask of transitions whose next state is not terminal (terminal
        # transitions are stored with next_state=None).
        non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                      device=self.device, dtype=torch.bool)
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Q(s_t, a): evaluate all actions, then pick the column of the action
        # that was actually taken.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # V(s_{t+1}) from the target network; stays 0 for terminal states.
        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        if non_final_mask.any():
            # Guarded: torch.cat raises on an empty list when every sampled
            # transition is terminal.
            non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0]
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model, clamping each gradient element to [-1, 1]
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def episode(self, i_episode):
        """Play one episode, learning after every step.

        Args:
            i_episode: episode index; the target network is refreshed when it
                is a multiple of TARGET_UPDATE.

        Returns:
            Zero-based index of the last step taken (episode length - 1).
        """
        # Initialize the environment and state
        state = self._as_state_tensor(self.env.reset())

        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            observation, reward, done, _ = self.env.step(action.item())

            # Terminal transitions carry next_state=None so optimize_model
            # does not bootstrap from them.
            next_state = None if done else self._as_state_tensor(observation)

            reward = torch.tensor([reward], device=self.device, dtype=torch.float32)

            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            self.optimize_model()
            if done:
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        return t
            

# Environment

## Classifier Data Prep - MNIST

In [17]:
# Root directory for downloaded datasets (MNIST goes to DATA_ROOT / 'mnist-data').
DATA_ROOT = Path('./data')
# Fraction of the pooled samples that splitds() assigns to the training set.
DATA_SPLIT = 0.6

# Directory handed to the environment as `savedir`; created if missing.
ENV_PATH = Path('./autotrain-run')
ENV_PATH.mkdir(exist_ok=True)

In [54]:
# Label ids 0..9. NOTE(review): not referenced by any cell visible in this notebook.
CLASSES = range(10)

def splitds(train, test, no_signal=False, pct_cap=None, split=None):
    """Pool two datasets and re-split them in place.

    The ``data``/``targets`` of ``train`` and ``test`` are concatenated
    (train first), optionally truncated and label-shuffled, then the first
    ``split`` fraction is written back to ``train`` and the rest to ``test``.

    Args:
        train, test: dataset objects with ``data`` and ``targets`` attributes.
        no_signal: if true, shuffle the pooled labels (destroys the
            image/label relationship).
        pct_cap: optional fraction of the pooled samples to keep.
        split: training fraction; defaults to the module-level DATA_SPLIT.

    Returns:
        None; both datasets are modified in place.
    """
    if split is None:
        split = DATA_SPLIT

    # Keep tensor-backed datasets as tensors: torchvision's MNIST calls
    # `.numpy()` on each sample, which fails if `data` became an ndarray.
    if torch.is_tensor(train.data) and torch.is_tensor(test.data):
        X = torch.cat((train.data, test.data), dim=0)
    else:
        X = np.concatenate((train.data, test.data), axis=0)
    Y = list(train.targets) + list(test.targets)

    if pct_cap:
        cap = int(pct_cap * len(X))
        X, Y = X[:cap], Y[:cap]

    if no_signal:
        print('suffling labels')
        np.random.shuffle(Y)

    split_id = int(len(X) * split)
    train.data, train.targets = X[:split_id], Y[:split_id]
    test.data, test.targets = X[split_id:], Y[split_id:]

def get_dataset(tfms, no_signal=False, pct_cap=None):
    """Download (if needed) and return the MNIST train and holdout datasets.

    Args:
        tfms: torchvision transform applied to every image.
        no_signal: accepted for API compatibility; currently NOT applied.
        pct_cap: accepted for API compatibility; currently NOT applied.

    Returns:
        (train, holdout) torchvision MNIST datasets with the stock split.
    """
    import warnings

    if no_signal or pct_cap:
        # The re-splitting step below is disabled, so these options would
        # otherwise be dropped without any indication to the caller.
        warnings.warn(
            "get_dataset: `no_signal`/`pct_cap` are ignored because the "
            "splitds() step is disabled; returning the stock MNIST split.",
            stacklevel=2,
        )

    train = torchvision.datasets.MNIST(root=DATA_ROOT / 'mnist-data', train=True,
                                        download=True, transform=tfms)

    holdout = torchvision.datasets.MNIST(root=DATA_ROOT / 'mnist-data', train=False,
                                           download=True, transform=tfms)

    # splitds(train, holdout, no_signal, pct_cap)

    print(f'length of trainset: [{len(train)}]; len of holdout: [{len(holdout)}]')

    return train, holdout

In [55]:
# Image preprocessing: convert to a tensor, then normalise the single channel
# with the given mean/std (presumably the usual MNIST statistics).
TFMS = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

# NOTE: pct_cap has no effect here -- the splitds() call inside get_dataset is
# commented out, so the full 60000/10000 MNIST split is returned (see output).
train, holdout = get_dataset(TFMS, pct_cap=0.2)

length of trainset: [60000]; len of holdout: [10000]


In [56]:
def accuracy(data: torch.utils.data.DataLoader, model: nn.Module) -> float: # phi
    """Return the fraction of samples in ``data`` that ``model`` classifies correctly.

    Args:
        data: loader yielding batches whose first two items are (images, labels).
        model: classifier returning per-class scores of shape (batch, classes).

    Returns:
        Accuracy in [0, 1]; 0.0 when the loader yields no samples.

    NOTE(review): the model is switched to eval mode and left there; callers
    that resume training must call ``model.train()`` -- confirm the
    environment does so.
    """
    model.eval()

    # Run on the device the model actually lives on; fall back to the
    # notebook-wide DEVICE only for parameter-less models.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = DEVICE

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(data, total=len(data)):
            images, labels = batch[0].to(device), batch[1]
            # Scores go back to the CPU because labels are compared there.
            outputs = model(images).cpu()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of raising ZeroDivisionError.
    return correct / total if total else 0.0

## Classifier Network Definition - SimpleConvNet

In [57]:
class Classifier(nn.Module):
    """Small conv net: two 3x3 convolutions, max-pool, two fully connected layers.

    ``fc1`` expects 9216 = 64 * 12 * 12 features, i.e. single-channel 28x28
    inputs. ``forward`` returns log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout on the 4-D convolutional feature map.
        self.dropout1 = nn.Dropout2d(0.25)
        # Element-wise dropout: this layer sees the 2-D (batch, features)
        # output of fc1, for which Dropout2d is deprecated.
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Map images (batch, 1, 28, 28) to log-probabilities (batch, 10)."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [58]:
# Network handed to the environment as `backbone` below.
clf = Classifier()

## AutoTrain Environment

In [61]:
# Passed to env.init as H.
REWIND_DIM = 5
# Classifier batch size (env.init `bs`).
CLF_BS = 16
SAMPLE = 20 # sampling interval
# Passed to env.init as T: batches per pass over `train`, times three.
BATCH_UPDATES = len(train) // CLF_BS * 3 # three epochs
# Number of sampling intervals in T; matches the K=562 the environment reports.
LOSS_DIM = BATCH_UPDATES // SAMPLE
LOSS_DIM

562

In [62]:
# Create the AutoTrain gym environment (presumably registered when
# autotrain.gym_env is imported -- TODO confirm).
env = gym.make('AutoTrain-v0')

# Configure the environment with the classifier to train (`backbone`), the
# evaluation function (`phi`), the train/validation datasets and the
# constants defined above. The call returns the initial observation.
ob = env.init(backbone=clf,  phi=accuracy, savedir=ENV_PATH,
         trnds=train, valds=holdout, 
         T=BATCH_UPDATES, H=REWIND_DIM, S=SAMPLE, lr_init=3e-4, inter_reward=0.05,
         num_workers=4, bs=CLF_BS, v=True, device=DEVICE)

[time_step:0]  initialised backbone parameters & optimizer
[time_step:0]  initialised phi value: started ...


HBox(children=(FloatProgress(value=0.0, max=3750.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))


[time_step:0]  initialised phi value: done
[time_step:0]  added observation
[time_step:0]  environment initialised : AutoTrainEnvironment with the following parameters:
                        lr_init=0.0003, inter_reward=0.05, H=5, K=562, T=11250


In [64]:
# Observation size; AutoTrainAgent uses it as the input width of its networks.
env.observation_space_dim

564

In [65]:
# Number of discrete actions; AutoTrainAgent uses it as the output width of its networks.
env.action_space_dim

17

# Agent

## Agent Training

In [None]:
# Set up matplotlib

# True when the active backend name contains 'inline'; only then is
# IPython's `display` module imported (and used for live re-drawing).
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Interactive mode, so figures update without blocking.
plt.ion()

# Length of every finished episode; appended to by the training loop and
# read by plot_durations().
episode_durations = []

def plot_durations():
    """Redraw figure 2 with the episode lengths recorded so far.

    Once at least 100 episodes exist, a 100-episode moving average is drawn
    as a second line (left-padded with zeros so both lines share an x-axis).
    Under an inline backend the previous cell output is replaced.
    """
    window = 100

    plt.figure(2, figsize=(12,6))
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')

    lengths = torch.tensor(episode_durations, dtype=torch.float)
    plt.plot(lengths.numpy())

    # Take 100 episode averages and plot them too
    if len(lengths) >= window:
        rolling = lengths.unfold(0, window, 1).mean(1).view(-1)
        padded = torch.cat((torch.zeros(window - 1), rolling))
        plt.plot(padded.numpy())

    if not is_ipython:
        return
    display.clear_output(wait=True)
    display.display(plt.gcf())

In [None]:
# Train an agent

# The agent class defined above is AutoTrainAgent (there is no `Agent`).
agent = AutoTrainAgent(env, DEVICE)

num_episodes = 5000
for i_episode in range(num_episodes):
    # episode() returns the zero-based index of the last step, so the episode
    # length is t + 1.
    t = agent.episode(i_episode)
    episode_durations.append(t + 1)
    plot_durations()

# `display` is only imported when an inline backend is active.
if is_ipython:
    display.clear_output(wait=True)
print('Complete')
# Release the environment used for training. The former `cpenv` handle was
# never defined in this notebook, so the render call on it was dropped.
env.close()
plt.ioff()