In [None]:
import pybullet_envs
# Don't forget to install PyBullet!
from gym import make
import torch
from torch import nn
from torch.distributions import Normal
from torch.nn import functional as F
from torch.optim import Adam
import random
from tqdm import tqdm
import numpy as np
from itertools import product
from matplotlib import pyplot as plt
import pandas as pd
import joblib
import seaborn as sns
sns.set()

In [None]:
def plot(log, *, solved_reward=200, label="DQN"):
    """Plot the mean reward with a +/- one-std band against training steps.

    The figure title summarises the run's hyperparameters, taken from the
    first row of the log, and a red horizontal line marks the reward level
    at which the task counts as solved.

    Args:
        log: Column-indexable training log (e.g. a pandas ``DataFrame``).
            It must provide ``step``, ``reward_mean`` and ``reward_std``
            plus the hyperparameter columns ``alpha``, ``gamma``,
            ``eps_max``, ``eps_min``, ``buffer_size``, ``batch_size`` and
            ``tau``. ``reward_mean`` and ``reward_std`` must support
            element-wise ``+`` / ``-``.
        solved_reward: Height of the horizontal "Solved" reference line.
        label: Legend label of the reward curve.
    """
    def first(column):
        # Positional lookup: ``log[column][0]`` is label-based on a pandas
        # Series and raises when the index has no label 0 (e.g. a filtered
        # or concatenated log).
        return np.asarray(log[column])[0]

    cfg = (f'alpha: {first("alpha")} | '
           f'gamma: {first("gamma")} | '
           f'eps: {first("eps_max")}, {first("eps_min")} | '
           f'buffer: {first("buffer_size")} | '
           f'batch: {first("batch_size")} | '
           f'tau: {first("tau")}')

    steps = log["step"]
    reward_mean = log["reward_mean"]
    reward_std = log["reward_std"]

    # Draw everything through the explicit Axes instead of the implicit
    # pyplot "current axes", so the function never paints on another figure.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title(f"Mean reward over 10 episodes @ {cfg}")
    ax.set_xlabel("№ of transitions")
    ax.set_ylabel("Mean reward")

    ax.hlines(solved_reward, np.min(steps), np.max(steps),
              colors="r", label="Solved")
    ax.plot(steps, reward_mean, label=label)
    ax.fill_between(steps,
                    reward_mean - reward_std,
                    reward_mean + reward_std,
                    alpha=0.5)
    ax.legend()
    plt.show()

def collect_data(alpha, gamma, eps_max, eps_min,
                 start_training,
                 buffer_size,
                 batch_size,
                 hidden_size,
                 tau,
                 *,
                 env_name="LunarLander-v2",
                 transitions=1_000_000):
    """Train a fresh DQN agent and return it together with its training log.

    Args:
        alpha, gamma, hidden_size, tau: Forwarded to the ``DQN`` constructor.
        eps_max, eps_min, start_training, buffer_size, batch_size:
            Forwarded to ``agent.train`` as keyword arguments.
        env_name: Gym environment id passed to ``make``. The environment
            must expose ``observation_space.shape`` and ``action_space.n``
            (i.e. a discrete action space).
        transitions: First positional argument of ``agent.train``
            (presumably the number of environment transitions to train
            for -- confirm against ``DQN.train``).

    Returns:
        A ``(agent, log)`` tuple where ``log`` is a pandas ``DataFrame``
        built from whatever ``agent.train`` returns.
    """
    env = make(env_name)
    agent = DQN(env=env,
                state_dim=env.observation_space.shape[0],
                action_dim=env.action_space.n,
                alpha=alpha,
                gamma=gamma,
                hidden_size=hidden_size,
                tau=tau)

    # The environment is handed to the agent and the agent is returned, so
    # it is intentionally left open here.
    history = agent.train(transitions,
                          buffer_size=buffer_size,
                          batch_size=batch_size,
                          eps_max=eps_max,
                          eps_min=eps_min,
                          start_training=start_training)
    return agent, pd.DataFrame(history)

def print_info(log):
    """Print the highest mean reward in ``log`` and the step it occurred at.

    Args:
        log: Column-indexable training log (e.g. a pandas ``DataFrame`` or
            a dict of sequences) with ``reward_mean`` and ``step`` columns
            of equal length.

    Raises:
        ValueError: If ``reward_mean`` is empty (raised by ``np.argmax``).
    """
    # ``np.argmax`` yields a *position*, so index plain arrays rather than
    # the pandas columns, whose ``[]`` lookup is label-based and would raise
    # (or pick another row) on a non-default index.
    rewards = np.asarray(log["reward_mean"])
    steps = np.asarray(log["step"])
    r_max_idx = int(np.argmax(rewards))
    print(f"Max mean reward {rewards[r_max_idx]} @ {steps[r_max_idx]}")