# Cart Pole Balancer

## 1. Import Libraries

In [22]:
import time
import gym
import torch
import numpy as np
import random
import matplotlib
import torch.nn as nn
import torchvision as tv
import torchvision.transforms as T
import torch.nn.functional as F
import matplotlib.pyplot as plt
from itertools import count
from collections import OrderedDict,namedtuple
from PIL import Image
from collections import deque

## 2. Setup Display

In [23]:
# set up matplotlib
%matplotlib osx
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

## 3. Deep Q Network

In [24]:
class DQN_CONV(nn.Module):

    def __init__(self):
        super(DQN_CONV, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(3,1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(3,1)
        self.fc1 = nn.Linear(32 * 19 * 4, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 2)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)
        x = F.relu(self.fc1(x.view(x.size(0), -1)))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class DQN(nn.Module):
    def __init__(self, img_height, img_width):
        super().__init__()
            
        self.fc1 = nn.Linear(in_features=img_height*img_width*3, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=16)
        self.fc3 = nn.Linear(in_features=16, out_features=16)
        self.out = nn.Linear(in_features=16, out_features=2)
    

    def forward(self, t):
        t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = F.relu(self.fc3(t))
        t = self.out(t)
        return t

### Confirm that both nets are identical and returns the same resutls

In [25]:
policy_net = DQN_CONV()
target_net = DQN_CONV()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

t = torch.randn(64,3,40,100)
with torch.no_grad():
    print(sum(policy_net(t)==target_net(t)))

tensor([0, 0])


## 4. Experience Class

In [26]:
Experience = namedtuple('Experience',('state','action','next_state','reward'))

In [27]:
e = Experience(1,2,3,4)

## 5. Replay Memory Class

In [28]:
class ReplayMemory(object):
    def __init__(self,capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)
        
    def __len__(self):
        return len(self.memory)
    
    def store(self,experience):
        self.memory.append(experience)
    
    def sample(self,batch_size):
        batch_size = min(batch_size,len(self.memory))
        return random.sample(self.memory,batch_size)
    
    def can_provide_sample(self,batch_size):
        return batch_size <= len(self.memory)

## 6. Epsilon Greedy Strategy

In [29]:
class EpsilonGreedyStrategy(object):
    def __init__(self,start,end,decay):
        self.start = start
        self.end = end
        self.decay = decay
        
    def get_exploration_rate(self,current_step):
        return self.end + (self.start - self.end)*np.exp(-1 * self.decay * current_step)

## 7. Agent 

In [30]:
class Agent(object):
    def __init__(self,strategy,num_actions,device):
        self.strategy = strategy
        self.num_actions = num_actions
        self.current_step = 0
        self.explore = True
        self.device = device
        
    def select_action(self,state,policy_net):
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        
        if rate > random.random():
            self.explore = True
            action = random.randrange(self.num_actions)
            return torch.tensor([action]).to(self.device) # Explore
        with torch.no_grad():
            self.explore = False
            return policy_net(state).argmax(dim=1).to(self.device) # Exploite

## 8. Environment Manager

In [31]:

class CartPoleEnvManager(object):
    def __init__(self,device):
        self.device = device
        self.env = gym.make('CartPole-v0').unwrapped
        self.env.reset()
        self.current_screen = None
        self.done = False
        self.world_width = self.env.x_threshold * 2
    
    def reset(self):
        self.env.reset()
        self.current_screen = None
        self.done = False
    
    def close(self):
        self.env.close()
    
    def render(self,mode='human'):
        return self.env.render(mode)
    
    def num_available_actions(self):
        return self.env.action_space.n
    
    def take_action(self,action):
        _,reward,self.done,_ = self.env.step(action.item())
        return torch.tensor([reward],device=self.device)
    
    def just_starting(self):
        return self.current_screen is None
    
    def get_cart_location(self,screen):
        loc = self.env.state[0]
        scale = screen.shape[2] / self.world_width
        return int(loc * scale + screen.shape[2] / 2)
        
    def get_state(self):
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            s1 = self.current_screen
            s2 = self.get_processed_screen()
            self.current_screen = s2
            return s2-s1
    
    def get_screen_height(self):
        screen = self.get_processed_screen()
        return screen.shape[2]
    
    def get_screen_width(self):
        screen = self.get_processed_screen()
        return screen.shape[3]
    
    def get_processed_screen(self):
        screen = self.env.render('rgb_array').transpose(2,0,1)
        cart_loc = self.get_cart_location(screen)
        screen = self.crop_screen(screen,cart_loc)
        return self.transform_screen_data(screen)
    
    def crop_screen(self,screen,cart_loc):
        _,screen_height,screen_width = screen.shape
        delta = 400
        # Strip off the top and buttom of the screen
        top = int(0.4 * screen_height)
        buttom = int(0.8 * screen_height)
        if cart_loc > (screen_width - delta):
            screen = screen[:,top:buttom,screen_width-2*delta:]
        elif cart_loc < delta:
            screen = screen[:,top:buttom,:2*delta]
        else:
            screen = screen[:,top:buttom,cart_loc-delta:cart_loc+delta]
        return screen
    
    def transform_screen_data(self,screen):
        screen = np.ascontiguousarray(screen, dtype=np.float32)/255
        screen = torch.from_numpy(screen)
        
        self.transforms = T.Compose([T.ToPILImage(),
                                     #T.Grayscale(),
                                    T.Resize(40),
                                    T.ToTensor()])
        
        return self.transforms(screen).unsqueeze(0).to(self.device)


## 9. Plot it

In [32]:
def plot(values,moving_avg_period,episode):
    moving_avg = get_moving_avg(moving_avg_period,values)
    plt.figure(2)
    plt.clf()
    plt.title(f'Training episode:{episode} .... Max:{max(values)} ... MA:{int(moving_avg[-1])}')
    plt.xlabel('Episodes')
    plt.ylabel('Duration')
    plt.plot(values)
    plt.plot(moving_avg)
    plt.pause(0.001)
    if is_ipython: disp.clear_output(wait=True)

def get_moving_avg(period,values):
    values = torch.tensor(values,dtype=torch.float)
    if len(values) >= period:
        moving_avg = values.unfold(0,period,1).mean(dim=1).flatten(0)
        moving_avg = torch.cat([torch.zeros(period),moving_avg])
    else:
        moving_avg = torch.zeros(len(values))
    return moving_avg.numpy()

## 10. Hyperparameters:

In [33]:
batch_size = 10
gamma = 0.99
eps_start = 1
eps_end = 0.01
eps_decay = 0.001
target_update = 10
memory_size = 100_000
lr = 0.001
num_episodes = 10

## 11. Initiate Objects

In [35]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

em = CartPoleEnvManager(device)
strategy = EpsilonGreedyStrategy(eps_start,eps_end,eps_decay)
agent = Agent(strategy,em.num_available_actions(),device)
memory = ReplayMemory(memory_size)

policy_net = DQN(em.get_screen_height(),em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(),em.get_screen_width()).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = torch.optim.Adam(policy_net.parameters(),lr=lr)
em.close()

In [37]:
em.close()

## 12. Training Loop

#### DQN Algorithm
1. Initialize replay memory capacity.  
2. Initialize the policy network with random weights.
3. Clone the policy network, and call it the target network.
4. For each episode:
    * Initialize the starting state.
    * For each time step:
        * Select an action.
            * Via exploration or exploitation
        * Execute selected action in an emulator.
        * Observe reward and next state.
        * Store experience in replay memory.
        * Sample random batch from replay memory.
        * Preprocess states from batch.
        * Pass batch of preprocessed states to policy network.
        * Calculate loss between output Q-values and target Q-values.
        * Requires a pass to the target network for the next state
        * Gradient descent updates weights in the policy network to minimize loss.
            * After x time steps, weights in the target network are updated to the weights in the policy network.

In [38]:
def extract_tensors(experiences):
    # converts batch of experiences to an experiences of batchs
    batch = Experience(*zip(*experiences))
    
    t1 = torch.cat(batch.state)
    t2 = torch.cat(batch.action)
    t3 = torch.cat(batch.reward)
    t4 = torch.cat(batch.next_state)
    
    return (t1,t2,t3,t4)

In [39]:
class QValues():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    @staticmethod
    def get_current(policy_net, states, actions):
        return policy_net(states).gather(dim=1,index=actions.unsqueeze(-1))
    
    @staticmethod        
    def get_next(target_net, next_states):                
        final_state_locations = next_states.flatten(start_dim=1) \
            .max(dim=1)[0].eq(0).type(torch.bool)
        non_final_state_locations = (final_state_locations == False)
        non_final_states = next_states[non_final_state_locations]
        batch_size = next_states.shape[0]
        values = torch.zeros(batch_size).to(QValues.device)
        values[non_final_state_locations] = target_net(non_final_states).max(dim=1)[0].detach()
        return values

#### Training Loop .......

In [40]:
episode_durations = []

for episode in range(num_episodes):
    em.reset()
    state = em.get_state()
    for timestep in count():
        action = agent.select_action(state,policy_net)
        reward = em.take_action(action)
        next_state = em.get_state()
        memory.store(Experience(state,action,next_state,reward))
        state = next_state
        
        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            states,actions,rewards,next_states = extract_tensors(experiences)
            
            current_q_values = QValues.get_current(policy_net, states, actions)
            next_q_values = QValues.get_next(target_net, next_states)
            target_q_values = (next_q_values * gamma) + rewards
            
            loss = F.mse_loss(current_q_values,target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        if em.done:
            episode_durations.append(timestep)
            plot(episode_durations,100,episode)
            break
    if episode % target_update ==0:
        target_net.load_state_dict(policy_net.state_dict())
    em.close()

RuntimeError: cannot perform reduction function max on tensor with no elements because the operation does not have an identity

## End....

### Experementing 

.   
.   
.   
.   


In [21]:
target_net = torch.load('/Users/aymanjabri/Google Drive/CartPole/target_net_mixed',map_location='cpu')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/aymanjabri/Google Drive/CartPole/target_net_mixed'

In [75]:
target_net

DQN_CONV(
  (conv1): Conv2d(3, 16, kernel_size=(5, 5), stride=(2, 2))
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(2, 2))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=2432, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=2, bias=True)
)

In [76]:
em = CartPoleEnvManager(device)
state  = em.get_state()
screen = em.get_processed_screen()
em.close()

In [77]:
state.shape

torch.Size([1, 3, 40, 100])

In [78]:
plt.imshow(screen.squeeze(0).permute(1,2,0))

<matplotlib.image.AxesImage at 0x7fbbb9ebe730>

In [79]:
def play(env,target_net,episodes):
    rewards = []
    for episode in range(episodes):
        env.reset()
        state = env.get_state().to(device)
        for steps in count():
            action = agent.select_action(state,target_net)
            reward = env.take_action(action)
            state = env.get_state().to(device)
            if env.done:
                rewards.append(steps)
                plot(rewards,10,episode)
                break
    env.close()

In [80]:
play(em,target_net,40)

In [87]:
action

NameError: name 'action' is not defined

In [None]:
Experience = namedtuple('Experience',('state','action','reward','next_state'))

class ReplayMemory(object):
    def __init__(self,capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)
    
    def __len__(self):
        return len(self.memory)
    
    def store(self,*args):
        self.memory.append(*args)
    
    def sample(self,batch_size = 64):
        self.batch_size = batch_size
        
        if batch_size > len(self.memory):
            batch_size = len(self.memory)
        return random.sample(self.memory, batch_size)
    
    def train_batch(self,batch_size=64):
        loader = torch.utils.data.DataLoader(self.memory,batch_size=batch_size,shuffle=True,num_workers=2)
        return next(iter(loader))
    
    def add_episode(self,env):
        rewards = 0
        done = False
        env.reset()
        state = get_screen() ## I have to move this function to an agent
        while not done:
            action = env.action_space.sample()
            _,reward,done,_ = env.step(action)
            if done:
                reward = 0
            next_state = get_screen()
            self.store((state,action,reward,next_state))
            state = next_state
            rewards += reward

        print(f'Done with {rewards} rewards')

        time.sleep(1)
        env.close()


data = ReplayMemory(100_000)

In [None]:
fig,ax = plt.subplots(1,2)
fig.suptitle(f'Action: {action[terminal][0]}')
ax[0].imshow(state[terminal][0].squeeze(0),cmap='gray')
ax[0].set_title('State')
ax[1].imshow(next_state[terminal][0].squeeze(0),cmap='gray')
ax[1].set_title('Next State')
plt.show()