In [1]:
# !pip install joblib six
# !pip install 'gym[all]'
# !pip install pyglet==1.2.4
# !pip install pyvirtualdisplay
# !pip install --no-cache-dir -I pillow

In [1]:
import matplotlib.pyplot as plt
import gym
import torch
import numpy as np
import scipy
from joblib import Parallel, delayed
import matplotlib.animation as animation 
from IPython.core.debugger import set_trace
from datetime import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline

In [130]:
def select_elites(states_batch,actions_batch,rewards_batch,percentile=50):
    """
    Select states and actions from games that have rewards >= percentile
    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of rewards, rewards_batch[session_i][t]
    
    :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions
    
    Please return elite states and actions in their original order 
    [i.e. sorted by session number and timestep within session]
    
    If you're confused, see examples below. Please don't assume that states are integers (they'll get different later).
    """
    
    reward_threshold = np.percentile(rewards_batch, percentile)
    print(reward_threshold)
    states_batch = np.array(states_batch)
    actions_batch = np.array(actions_batch)
    
    elite_mask = rewards_batch > reward_threshold
    elite_states  = states_batch[elite_mask]
    elite_actions = actions_batch[elite_mask]
#     set_trace()

#     if elite_states != []:
    elite_states = np.concatenate(elite_states)
    elite_actions = np.concatenate(elite_actions)
    
    return elite_states,elite_actions

In [3]:
from IPython.display import clear_output

def show_progress(rewards_batch,log):
    """
    A convenience function that displays training progress. 
    No cool math here, just charts.
    """
    
    mean_reward = np.mean(rewards_batch)
    threshold = np.percentile(rewards_batch,percentile)
    log.append([mean_reward,threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(mean_reward,threshold))
    plt.figure(figsize=[8,4])
    plt.subplot(1,2,1)
    plt.plot(list(zip(*log))[0],label='Mean rewards')
    plt.plot(list(zip(*log))[1],label='Reward thresholds')
    plt.legend()
    plt.grid()
    
    plt.subplot(1,2,2)
    plt.hist(rewards_batch,range=[min(rewards_batch), max(rewards_batch)]);
    plt.vlines([np.percentile(rewards_batch,percentile)],[0],[100],label="percentile",color='red')
    plt.legend()
    plt.grid()

    plt.show()


In [4]:
env = gym.make("MountainCar-v0").env
env.reset()
n_actions = env.action_space.n

# plt.imshow(env.render("rgb_array"));

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [5]:
n_actions

3

In [7]:
# from sklearn.neural_network import MLPClassifier

# agent = MLPClassifier(hidden_layer_sizes=(20,20),
#                       activation='tanh',
#                       warm_start=True, #keep progress between .fit(...) calls
#                       max_iter=1, #make only 1 iteration on each .fit(...)
#                      )

# #initialize agent to the dimension of state an amount of actions
# agent.fit([env.reset()]*n_actions, range(n_actions));



In [196]:
class SineNet(torch.nn.Module):
    def __init__(self, n_hidden_neurons):
        super(SineNet, self).__init__()
        self.fc1 = torch.nn.Linear(2, n_hidden_neurons)
        self.act1 = torch.nn.Tanh()
        self.fc2 = torch.nn.Linear(n_hidden_neurons, 3)
        self.sm = torch.nn.Softmax()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.sm(x)
        return x
    
def loss(pred, targ):
    res = (pred - targ) ** 2
    return res.mean()

agent = SineNet(20)
optimizer = torch.optim.Adam(agent.parameters(), lr=1.0e-1)

In [197]:
def generate_session(t_max=10000):
    states,actions = [],[]
    total_rewards = 0
    s = env.reset()
    
    for t in range(t_max):
        tensor_s = torch.autograd.Variable(torch.Tensor(s))
        probs = agent.forward(tensor_s).data.numpy()
        a = np.random.choice(n_actions, p=probs)
        new_s, r, done, _ = env.step(a)
        
        states.append(s)
        actions.append(a)
        total_rewards += r
        s = new_s[:]
        if done: break
    return states, actions, total_rewards
        

In [198]:
def fit(x,y, agent):
    batch_size = 100
    order = np.random.permutation(x.shape[0])

    for start_index in range(0, x.size, batch_size):
        optimizer.zero_grad()

        batch_items = order[start_index:start_index+batch_size]
        if batch_items.size == 0:
            break
            
        x_batch = torch.FloatTensor(x[batch_items])#.reshape([-1, 1]))
        y_batch = torch.FloatTensor(y[batch_items].reshape([-1, 1]))

        x_var = torch.autograd.Variable(x_batch)
        y_var = torch.autograd.Variable(y_batch)

        y_pred = agent.forward(x_var)
        loss_val = loss(y_pred, y_var)

        loss_val.backward()

        optimizer.step()

        print(x_batch.shape, y_batch.shape)

In [199]:
# x = torch.Tensor([env.reset()]*(n_actions+2))
# y = torch.Tensor(list(range(n_actions)) + [2,2])
x = torch.Tensor([env.reset()]*n_actions)
y = torch.Tensor(list(range(n_actions)))
for _ in range(2):
    for i in range(len(x)):
        optimizer.zero_grad()

        x_train = torch.autograd.Variable(x[i])
        y_pred = agent.forward(x_train)
        loss_val = loss(y_pred, y[i])
        loss_val.backward()
        optimizer.step()
    
agent.forward(torch.autograd.Variable(torch.Tensor([-0.53422085, 0.]))).data.numpy()

  del sys.path[0]


array([0.32615387, 0.3552597 , 0.31858644], dtype=float32)

In [None]:
%%time
n_sessions = 300
percentile = 90
log = []
epochs = 100
value_to_stop = -200

for i in range(epochs):
    t = dt.now()
    sessions = Parallel(n_jobs=4)(delayed(generate_session)() for _ in range(n_sessions))
#     sessions = [generate_session() for _ in range(n_sessions)]
    states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))
    elite_states, elite_actions = select_elites(states_batch,
                                                actions_batch,
                                                rewards_batch,
                                                percentile=percentile)

    fit(elite_states, elite_actions, agent)
    
    show_progress(rewards_batch, log)
    print("iteration time: {t} s".format(t=(dt.now() - t).seconds))
    if np.mean(rewards_batch) > value_to_stop:
        print("You win")
        break
    

  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
