In [0]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import gym
import os

In [0]:
class Actor(nn.Module):
    """Policy network: maps a state vector to one unnormalized score per action.

    The network is two ReLU hidden layers (2048 and 512 units) followed by a
    linear head. The head's output is returned as-is (raw logits); turning it
    into a probability distribution is left to the caller.

    Args:
        num_actions: Number of outputs produced by the final layer.
        state_dim: Size of the last dimension of the input. Defaults to 8,
            the value that used to be hard-coded, so existing callers are
            unaffected.
    """

    def __init__(self, num_actions, state_dim=8):
        super().__init__()
        # Attribute names (fc2, fc3, pi) are kept unchanged so that
        # state_dict checkpoints saved by the previous version still load.
        self.fc2 = nn.Linear(state_dim, 2048)
        self.fc3 = nn.Linear(2048, 512)
        self.pi = nn.Linear(512, num_actions)

    def forward(self, x):
        """Return the action logits for ``x`` (last dimension ``state_dim``)."""
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.pi(x)

In [0]:
class Critic(nn.Module):
    """State-value network: maps a state vector to a single scalar estimate.

    The network is two ReLU hidden layers (2048 and 512 units) followed by a
    linear head with one output.

    Args:
        state_dim: Size of the last dimension of the input. Defaults to 8,
            the value that used to be hard-coded, so ``Critic()`` keeps
            working unchanged.
    """

    def __init__(self, state_dim=8):
        super().__init__()
        # Attribute names (fc2, fc3, v) are kept unchanged so that
        # state_dict checkpoints saved by the previous version still load.
        self.fc2 = nn.Linear(state_dim, 2048)
        self.fc3 = nn.Linear(2048, 512)
        self.v = nn.Linear(512, 1)

    def forward(self, x):
        """Return the value estimate for ``x``; the last output dimension is 1."""
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.v(x)

In [0]:
class Agent():
    """One-step (online) actor-critic agent for the ``LunarLander-v2`` gym task.

    After every environment step the critic is regressed towards the one-step
    TD target and the actor is updated with the policy gradient weighted by
    the TD error (used as the advantage estimate).

    If both checkpoint files exist they are loaded on construction; this
    class itself never writes them.
    """

    def __init__(self):
        self.gamma = 0.99
        # Log-probability of the most recently sampled action. It is set by
        # choose_action() and consumed by the next learn() call.
        self.log_probs = None
        self.env = gym.make('LunarLander-v2')
        num_actions = self.env.action_space.n
        # Use the GPU when there is one, but do not crash on CPU-only hosts.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.actor = Actor(num_actions=num_actions).to(self.device)
        self.critic = Critic().to(self.device)
        self.MODEL_PATH_ACTOR = '/content/actor.pth'
        self.MODEL_PATH_CRITIC = '/content/critic.pth'
        # Both files are loaded below, so both must be present; checking only
        # the actor path would crash when the critic checkpoint is missing.
        if os.path.exists(self.MODEL_PATH_ACTOR) and os.path.exists(self.MODEL_PATH_CRITIC):
            print('Existing model found!')
            # NOTE: torch.load unpickles the file; only load trusted checkpoints.
            # map_location lets a checkpoint saved on GPU load on a CPU host.
            self.actor.load_state_dict(
                torch.load(self.MODEL_PATH_ACTOR, map_location=self.device))
            self.critic.load_state_dict(
                torch.load(self.MODEL_PATH_CRITIC, map_location=self.device))
        else:
            print('No existing model.')
        # The agent keeps learning after a checkpoint is loaded, so the
        # networks belong in training mode rather than eval mode.
        self.actor.train()
        self.critic.train()
        self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=1e-5)
        self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=1e-5)
        self.MAX_FRAMES = 10000
        self.NUM_EPISODES = 3000

    def _to_tensor(self, value):
        """Convert ``value`` to a float32 tensor on the agent's device.

        Forcing float32 keeps float64 observations/rewards from clashing with
        the float32 parameters of the networks.
        """
        return torch.as_tensor(value, dtype=torch.float32, device=self.device)

    def choose_action(self, state):
        """Sample an action for ``state`` from the current policy.

        The log-probability of the sampled action is stored in
        ``self.log_probs`` for the next ``learn`` call.

        Returns:
            The sampled action as a Python ``int``.
        """
        logits = self.actor(self._to_tensor(state))
        # Building the distribution from logits is equivalent to applying a
        # softmax over the last dimension, without the deprecated implicit
        # ``dim`` and without an explicit exp/normalize round trip.
        action_distribution = torch.distributions.Categorical(logits=logits)
        action = action_distribution.sample()
        self.log_probs = action_distribution.log_prob(action)
        return action.item()

    def learn(self, state, next_state, reward, done):
        """Run one actor and one critic update from a single transition.

        ``choose_action`` must have been called for ``state`` beforehand,
        because the actor loss uses the log-probability it stored.

        Args:
            state: Observation the action was taken in.
            next_state: Observation returned by the environment afterwards.
            reward: Scalar reward for the transition.
            done: Whether the episode ended; if so the target does not
                bootstrap from ``next_state``.

        Returns:
            ``actor_loss + critic_loss`` as a Python ``float``.
        """
        self.optimizer_actor.zero_grad()
        self.optimizer_critic.zero_grad()

        reward = self._to_tensor(reward)
        curr_value = self.critic(self._to_tensor(state)).squeeze(-1)

        # The TD target is treated as a constant (semi-gradient TD): no
        # gradient may flow through the bootstrapped next-state value.
        with torch.no_grad():
            if done:
                td_target = reward
            else:
                next_value = self.critic(self._to_tensor(next_state)).squeeze(-1)
                td_target = reward + self.gamma * next_value

        advantage = td_target - curr_value

        # The advantage is only a weight for the policy gradient. Detaching it
        # keeps the actor loss from pushing gradients into the critic.
        actor_loss = -self.log_probs * advantage.detach()
        critic_loss = advantage.pow(2)

        # After the detach the two terms share no parameters, so a single
        # backward pass gives each network exactly its own gradient.
        loss = actor_loss + critic_loss
        loss.backward()
        self.optimizer_actor.step()
        self.optimizer_critic.step()
        return loss.item()

    def play(self):
        """Train for ``NUM_EPISODES`` episodes, learning after every step.

        Each episode ends when the environment reports ``done`` or after
        ``MAX_FRAMES`` steps. One summary line is printed per episode and the
        environment is closed once all episodes have run.
        """
        for i_episode in range(self.NUM_EPISODES):
            curr_state = self.env.reset()
            done = False
            total_rewards = 0
            total_loss = 0
            num_frames = 0
            # MAX_FRAMES was previously defined but never enforced.
            while not done and num_frames < self.MAX_FRAMES:
                action = self.choose_action(curr_state)
                observation, reward, done, info = self.env.step(action)
                loss = self.learn(state=curr_state, next_state=observation, reward=reward, done=done)
                curr_state = observation
                total_rewards += reward
                total_loss += loss
                num_frames += 1

            # max(..., 1) only matters if MAX_FRAMES is set to 0.
            print(f'Episode # {i_episode} done, total reward: {total_rewards}, loss: {total_loss/max(num_frames, 1)}')

        self.env.close()

In [7]:
    # Build the agent: this creates the LunarLander-v2 environment and the
    # actor/critic networks, and prints whether a saved model was found.
    agent = Agent()

No existing model.


In [8]:
    # Run the training loop for agent.NUM_EPISODES episodes; one summary line
    # (total reward and mean per-step loss) is printed per episode.
    agent.play()



Episode # 0 done, total reward: -111.20331655240723, loss: 186.37916746541546
Episode # 1 done, total reward: -194.00904098007453, loss: 86.2409701214026
Episode # 2 done, total reward: -435.2643422012953, loss: 100.33772493803242
Episode # 3 done, total reward: -242.19643280898026, loss: 96.90364371609051
Episode # 4 done, total reward: -158.23566798459362, loss: 185.8809051828184
Episode # 5 done, total reward: -110.25423039762684, loss: 134.66858506627764
Episode # 6 done, total reward: -371.21246074688395, loss: 112.71937527131094
Episode # 7 done, total reward: -420.1517562992408, loss: 106.30550006447503
Episode # 8 done, total reward: -209.67430917664404, loss: 156.60961353841802
Episode # 9 done, total reward: -291.8720353826899, loss: 135.77130299157056
Episode # 10 done, total reward: -139.30096151800717, loss: 129.95376710240035
Episode # 11 done, total reward: -195.05877631239576, loss: 108.07602794090482
Episode # 12 done, total reward: -247.89799023092343, loss: 76.941614