In [None]:
import argparse
import os
import sys
from datetime import datetime

import gym
import matplotlib
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from tqdm import tqdm

matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
!pip install Box2D

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/My\ Drive/CMU/rl/hw3
%ls

In [None]:
class BanditEnv(gym.Env):
    '''
    Toy env to test your implementation
    The state is fixed (bandit setup)
    Note that the action takes integer values

    Every episode lasts a single step: the observation is always [0] and the
    reward is -(action - 7)**2, so action 7 is the best arm (reward 0).
    '''
    def __init__(self):
        self.action_space = gym.spaces.Discrete(10)
        self.observation_space = gym.spaces.Box(low=np.array([-1]), high=np.array([1]), dtype=np.float32)

    @staticmethod
    def _fixed_observation():
        # float32 so the observation matches the dtype declared by observation_space
        # (an int64 array is not contained in a float32 Box).
        return np.zeros(1, dtype=np.float32)

    def reset(self):
        '''Return the (constant) initial observation.'''
        return self._fixed_observation()

    def step(self, action):
        '''
        Pull arm `action` and end the episode.

        Returns (observation, reward, done, info) with done always True.
        '''
        # Validate and use the same integer value for the reward computation.
        action = int(action)
        assert action in self.action_space

        done = True
        s = self._fixed_observation()
        r = -(action - 7)**2
        info = {}
        return s, r, done, info

In [None]:
def generate_episode(env, model, max_length=3000, render=False, train=True):
    """Roll out one episode by sampling actions from the policy `model`.

    Args:
        env: environment exposing `reset()` -> observation and
            `step(action)` -> (observation, reward, done, info).
        model: callable mapping a float tensor of shape (1, state_size) to
            action probabilities of shape (1, num_actions).
        max_length: maximum number of steps before the rollout is cut off.
        render: when True, call `env.render()` after every step.
        train: when True, collect the log-probability of each sampled action.

    Returns:
        (states, None, rewards, log_probs) where
        - states is always an empty list (states are not recorded),
        - the second slot is a placeholder kept for callers that unpack four values,
        - rewards is the list of per-step rewards,
        - log_probs is a list of shape-(1,) tensors, empty when `train` is False.
    """
    # Build inputs on the device the policy lives on instead of relying on the
    # notebook-level `device`; fall back to it for callables without parameters.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = device

    rewards = []
    states = []  # intentionally never filled; kept so the return shape is unchanged
    log_probs = []
    state = env.reset()
    for _ in range(max_length):
        # shape (1, state_size)
        obs = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float, device=model_device)
        probs = model(obs)

        dist = Categorical(probs=probs)
        action = dist.sample()
        state, reward, done, _ = env.step(action.item())
        if render:
            env.render()

        rewards.append(reward)
        if train:
            # Only needed for the policy-gradient loss, so skip it during evaluation.
            log_probs.append(dist.log_prob(action))

        if done:
            break

    return states, None, rewards, log_probs

In [None]:
# Added to the standard deviation so normalisation never divides by zero.
epsilon = 1e-8

def get_g(rewards, gamma=0.99, normalize=False):
    """Compute the discounted return G_t for every step of one episode.

    G_t = sum_{k >= t} gamma**(k - t) * rewards[k]

    Args:
        rewards: sequence of per-step rewards.
        gamma: discount factor.
        normalize: when True, standardise the returns to zero mean and unit
            standard deviation (with `epsilon` added to the denominator).

    Returns:
        1-D float64 numpy array with one return per step (empty for an empty
        episode).
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    g = np.zeros_like(rewards)

    # Backward recursion G_t = r_t + gamma * G_{t+1}: linear in the episode
    # length instead of recomputing a discounted dot product for every t.
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        g[t] = running

    # Skip normalisation for an empty episode (mean/std of nothing is NaN).
    if normalize and g.size > 0:
        g = (g - np.mean(g)) / (np.std(g) + epsilon)

    return g

def update_running_means(running_means, counts, rewards):
    """Fold one episode's per-step values into the per-timestep running averages.

    Only the first len(rewards) entries are touched; both arrays are updated
    in place and also returned.
    """
    head = slice(0, len(rewards))
    previous_total = running_means[head] * counts[head]
    running_means[head] = (previous_total + np.array(rewards)) / (counts[head] + 1)
    # Only the timesteps this trajectory actually reached gain an observation.
    counts[head] += 1
    return running_means, counts

In [None]:
def train(model, env, optimizer, scheduler, num_episodes, run_tests=True, k=100, gamma=0.99, normalize=False, baseline=None):
    """Train `model` on `env` with REINFORCE, one gradient step per episode.

    Args:
        model: policy network; must expose `num_episodes`, `env_name`, and,
            for the time-dependent baseline, `running_means` and `counts`.
        env: environment passed to `generate_episode` / `test`.
        optimizer: optimizer over the model's parameters.
        scheduler: optional LR scheduler; when truthy it is stepped with the
            mean evaluation reward after every evaluation.
        num_episodes: number of training episodes to run in this call.
        run_tests: evaluate (and checkpoint) whenever `model.num_episodes`
            is a multiple of `k`.
        k: evaluation interval, counted on `model.num_episodes`.
        gamma: discount factor used for the returns.
        normalize: standardise the returns of each episode.
        baseline: "time-dependent" subtracts a per-timestep running mean
            from the returns; any other value disables the baseline.

    Note: `model.num_episodes` is incremented once per episode and is never
    reset here, so it keeps counting across repeated calls.
    """
    rewards = []
    lengths = []
    losses = []
    # `num_episodes // 100` is 0 for runs shorter than 100 episodes, which
    # would make `e % print_interval` raise ZeroDivisionError.
    print_interval = max(1, num_episodes // 100)

    model.train()

    for e in tqdm(range(1, num_episodes+1), position=0, leave=True):
        optimizer.zero_grad()
        _, _, reward, log_prob = generate_episode(env, model, max_length=3000, render=False)
        g = get_g(reward, gamma, normalize=normalize)

        if baseline == "time-dependent":
            # NOTE(review): the running mean is built from the per-step rewards,
            # not from the returns it is subtracted from, and with
            # normalize=True the returns are standardised before this
            # subtraction. Confirm this is the intended baseline.
            model.running_means, model.counts = update_running_means(model.running_means, model.counts, reward)
            g -= model.running_means[:len(g)]

        log_prob = torch.cat(log_prob)
        # Place the returns on the same device as the log-probabilities.
        g = torch.tensor(g, dtype=torch.float, device=log_prob.device)
        loss = - (g * log_prob).mean()
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        total_reward = np.sum(reward)
        rewards.append(total_reward)
        lengths.append(len(reward))

        # run test every k episodes (based on model's total count)
        ran_eval = run_tests and model.num_episodes % k == 0
        if ran_eval:
            mean_reward, stdev_reward = test(model, env, num_episodes=100)

            # for LL only, print messages for every k (100) iters
            if model.env_name == "LunarLander-v2":
                print(f"\nEpisode #{e}. Total episodes:{model.num_episodes}")
                print(f"current loss:{loss.item()}")
                print(f"mean_loss: {np.mean(losses)}, reward (train): {total_reward}, mean_reward (train): {np.mean(rewards)}")
                print(f"mean_reward (eval):{mean_reward}, stdev_reward (eval):{stdev_reward}")
                print(f"avg trj len:{np.mean(lengths)}")
                print("--------------------------")
                # Training statistics are windowed between evaluations
                # (trajectory lengths are averaged over the whole call).
                rewards = []
                losses = []

            if scheduler:
                scheduler.step(mean_reward)

            # Make sure the checkpoint directory exists before saving.
            os.makedirs("models", exist_ok=True)
            torch.save(model.state_dict(), f"models/2_{model.env_name}_{model.num_episodes}_{datetime.now().isoformat('_').replace(':', '_')}.model")

        # progress message every 1% of this call's episodes, unless an
        # evaluation already reported on this episode
        if e % print_interval == 0 and not ran_eval:
            print(f"\nEpisode #{e}. Total episodes:{model.num_episodes}")
            print(f"mean_loss: {np.mean(losses)}, reward (train): {total_reward}, mean_reward (train): {np.mean(rewards)}")
            print("--------------------------")

        model.num_episodes += 1

    print(f"Finished training. Total episodes trained: {model.num_episodes}")

def test(model, env, num_episodes=100):
    """Evaluate the policy for `num_episodes` episodes without gradients.

    The model is put in eval mode for the rollouts and restored to whatever
    mode it was in before the call (also if a rollout raises). The mean and
    standard deviation of the episode rewards are appended to
    `model.mean_rewards` / `model.stdev_rewards` and returned.

    Returns:
        (mean_reward, stdev_reward) over the evaluation episodes.
    """
    was_training = model.training
    model.eval()
    rewards = []
    try:
        with torch.no_grad():
            for _ in range(num_episodes):
                _, _, reward, _ = generate_episode(env, model, max_length=3000, train=False)
                rewards.append(np.sum(reward))
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)

    mean_reward, stdev_reward = np.mean(rewards), np.std(rewards)
    model.mean_rewards.append(mean_reward)
    model.stdev_rewards.append(stdev_reward)
    return mean_reward, stdev_reward

In [None]:
def plot_training(model, k=100):
    """Plot the evaluation reward curve recorded on `model` and save it as a PNG.

    Args:
        model: object with `mean_rewards`, `stdev_rewards`, `env_name` and
            `num_episodes`, as filled in by `train` / `test`.
        k: evaluation interval that was used during training; it scales the
            x-axis and is shown in the title.

    The mean is drawn as a line with a +/- one standard deviation band. The
    figure is written to `p2_<env_name>_<timestamp>.png` and then shown.
    """
    means = np.asarray(model.mean_rewards, dtype=float)
    stdevs = np.asarray(model.stdev_rewards, dtype=float)

    # train() runs its first evaluation once k episodes have been trained, so
    # the i-th recorded point belongs at (i + 1) * k, not i * k.
    episodes = (np.arange(len(means)) + 1) * k

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(episodes, means, label="mean_rewards")
    # band of one standard deviation around the mean
    ax.fill_between(episodes, means + stdevs, means - stdevs, alpha=0.5, color='y', label="stdev_reward")
    ax.set_title(f"Mean/Stdev Eval Reward for {model.env_name}, REINFORCE, k={k}, trained: {model.num_episodes} episodes")
    ax.set_ylabel("Reward")
    ax.set_xlabel(f"# Episodes of training (eval every {k})")
    ax.legend()
    fig.savefig(f"p2_{model.env_name}_{datetime.now().isoformat('_').replace(':', '_')}.png", dpi=300)
    plt.show()

## Lunar Lander

In [None]:
class ModelTwo(nn.Module):
    """Small MLP policy: three 16-unit hidden layers and a softmax output.

    Besides the network, the instance carries training bookkeeping that is
    read and updated by the training/evaluation helpers:
    `num_episodes` (starts at 1), `env_name`, the lists `mean_rewards` and
    `stdev_rewards`, and the length-3000 arrays `running_means` / `counts`
    used for the time-dependent baseline.
    """

    def __init__(self, input_size, output_size, env_name):
        super().__init__()

        # Bookkeeping (plain attributes, not part of the state_dict).
        self.input_size, self.output_size = input_size, output_size
        self.num_episodes = 1
        self.env_name = env_name
        self.mean_rewards, self.stdev_rewards = [], []

        # One slot per timestep; 3000 is also the episode length cap used by
        # the rollouts in train()/test().
        self.running_means = np.zeros(3000)
        self.counts = np.zeros(3000)

        hidden = 16
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden),   # layer 1
            nn.ReLU(),
            nn.Linear(hidden, hidden),       # layer 2
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden, hidden),       # layer 3
            nn.ReLU(),
            nn.Linear(hidden, output_size),  # out layer
        )
        self._init_weights()

    def forward(self, x):
        """Map a (batch, input_size) tensor to action probabilities per row."""
        logits = self.layers(x)
        return F.softmax(logits, dim=1)

    def _init_weights(self):
        """Uniform init in [-alpha, alpha] for every Linear weight, zero biases.

        alpha = sqrt(3 * scale / n), with n the mean of the weight's two
        dimensions.
        """
        scale = 1.0
        for linear in (m for m in self.layers if isinstance(m, nn.Linear)):
            mean_fan = np.mean(linear.weight.shape)
            bound = np.sqrt(3 * scale / mean_fan)
            nn.init.uniform_(linear.weight, a=-bound, b=bound)
            nn.init.zeros_(linear.bias)

In [None]:
# Build the Lunar Lander environment that the policy below is trained and evaluated on.
env = gym.make('LunarLander-v2')

In [None]:
# Size the policy from the environment: one input per observation dimension,
# one output per discrete action.
model = ModelTwo(
    input_size=env.observation_space.shape[0],
    output_size=env.action_space.n,
    env_name="LunarLander-v2",
)
# To resume from a saved checkpoint instead of starting fresh, load it here, e.g.
# model.load_state_dict(torch.load("models/LunarLander-v2_6000_2020-10-29_19_13_06.266696.model"))
model = model.to(device)

In [None]:
# Adam at lr=1e-3 is the active choice. Alternatives that were tried:
#   torch.optim.RMSprop(model.parameters(), lr=1e-5)
#   torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
#   adabound.AdaBound(model.parameters(), lr=1e-5, final_lr=0.1)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# No LR schedule by default. train() steps the scheduler with the mean
# evaluation reward, so a plateau scheduler such as
#   torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, threshold=10, patience=2, verbose=True)
# would need mode 'max' rather than 'min' to react to a stalled reward.
scheduler = None

In [None]:
# Evaluate (and checkpoint) every k training episodes.
k = 100
train(
    model,
    env,
    optimizer,
    scheduler,
    num_episodes=1000,
    run_tests=True,
    k=k,
    gamma=0.99,
    normalize=True,
    baseline="time-dependent",
)

In [None]:
# Use the same evaluation interval as the training run so the x-axis and the
# title match the episodes at which the rewards were actually recorded.
plot_training(model, k=k)

In [None]:
# Override the learning rate of every parameter group in place.
for group in optimizer.param_groups:
    group.update(lr=1e-3)

In [None]:
# Display the optimizer to inspect its current parameter-group settings.
optimizer

In [None]:
# Take the timestamp once so the weights file and its stats file share the
# same suffix and can be paired up later.
timestamp = datetime.now().isoformat('_').replace(':', '_')
run_tag = f"{model.env_name}_{model.num_episodes}_{timestamp}"

# Evaluation history and episode counter live on the model object but are not
# part of its state_dict, so they are stored separately.
stats = {
    "mean_rewards": model.mean_rewards,
    "stdev_rewards": model.stdev_rewards,
    "num_episodes": model.num_episodes,
}

# Make sure the checkpoint directory exists before saving.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), f"models/2_{run_tag}.model")
torch.save(stats, f"models/stats_2_{run_tag}")