In [140]:
import gym
from collections import deque
import numpy as np
import random
import torch
import pdb
import torch.optim as optim 
from torch import nn

In [141]:
env = gym.make('CartPole-v0')

In [142]:
class Brain(nn.Module):
    def __init__(self,state_dim,action_space):
        super(Brain, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(state_dim, 12),
            nn.ReLU(),
            nn.Linear(12, 24),
            nn.ReLU(),
            nn.Linear(24, 48),
            nn.ReLU(),
            nn.Linear(48, action_space),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [146]:
class Agent:
    def __init__(self,env):
        self.env=env
        self.gamma=0.96
        self.epsilon = 0.15
        self.learingRate=0.001
        self.replayBuffer=deque(maxlen=1000)
        self.trainNetwork  = Brain(4,2)
        self.targetNetwork = Brain(4,2)
        self.episodeNum=200
        self.iterationNum=200 
        self.numPickFromBuffer=32
        self.rewardWindow=[]
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.trainNetwork.parameters(), lr=0.001)
        
    def bestAction(self,state):
        if np.random.rand() < self.epsilon:
            action = np.random.randint(0, 2)
        else:
            state = torch.tensor(state,dtype=torch.float32)
            action=torch.argmax(self.trainNetwork(state))
        return int(action)
        
    def trainBrain(self):
        if len(self.replayBuffer) < self.numPickFromBuffer:
            return 
        
        currents = []
        news = []
        rewards = []
        actions = []
        
        samples = random.sample(self.replayBuffer,self.numPickFromBuffer)
        
        for curr in samples:
            currents.append(curr[0])
        currents = np.array(currents)
        
        for new in samples:
            news.append(new[3])
        news = np.array(news)
        
        for reward in samples:
            rewards.append(reward[2])
        rewards = np.array(rewards)
        
        for action in samples:
            actions.append(action[1])
            
        
        
        next_Q = self.targetNetwork(torch.tensor(news,dtype=torch.float32))
        current_q = self.trainNetwork(torch.tensor(currents,dtype=torch.float32))
        
        new_q = rewards + self.gamma*np.max(next_Q.detach().numpy(),axis=1)
        
        
        for i in range(len(actions)):
            current_q[i,actions[i]] = new_q[i]
            
#         pdb.set_trace()
        x = torch.tensor(currents,dtype=torch.float32)
        y = torch.tensor(current_q,dtype=torch.float32)
        
        out = self.trainNetwork(x)
        loss = self.criterion(out, y)
#         print(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        
    def play(self):
        current_state = self.env.reset()
        while True:
            current_state = torch.tensor(current_state,dtype=torch.float32)
            action=torch.argmax(self.trainNetwork(current_state))
            new_state, reward, done, _ = env.step(int(action))
            current_state = new_state
            self.env.render()
            if done:
                current_state = self.env.reset()
        
        
    def main(self):
        for it in range(self.iterationNum):
            current_state = self.env.reset()
            for eps in range(self.episodeNum):
                action = self.bestAction(current_state)
#                 if it >50:
#                     print(action)
                new_state, reward, done, _ = self.env.step(action)
#                 if reward == 1:
#                     print(f'success at iteration {it} and episode {eps}')
                self.replayBuffer.append([current_state, action, reward, new_state, done])
                self.trainBrain()
                current_state = new_state  
#                 print(f'current state is {current_state[0]}')
#                 if done:
#                     break
                    
                    
            self.targetNetwork.load_state_dict(self.trainNetwork.state_dict())


                
        
        

In [147]:
agent = Agent(env)

In [154]:
agent.main()



In [155]:
agent.play()

KeyboardInterrupt: 

In [75]:
current_state = env.reset()
print(current_state)
print('-------------------------')
while True:
#     current_state = torch.tensor(current_state,dtype=torch.float32)
#     action=torch.argmax(agent.targetNetwork(current_state))  
    action = int(2)
    new_state, reward, done, _ = env.step(action)
    rep.append([current_state, action, reward, new_state, done])
    current_state = new_state
    print(current_state)
    env.render()
    if done:
        break
env.close()

[-0.41497604  0.        ]
-------------------------
[-4.14776364e-01  1.99671413e-04]
[-4.14378440e-01  3.97924245e-04]
[-0.41378509  0.00059335]
[-0.41300052  0.00078457]
[-0.41203031  0.00097021]
[-0.41088132  0.00114899]
[-0.4095617   0.00131963]
[-0.40808076  0.00148093]
[-0.40644898  0.00163179]
[-0.40467784  0.00177114]
[-0.40277981  0.00189803]
[-0.4007682  0.0020116]
[-0.39865713  0.00211108]
[-0.39646132  0.0021958 ]
[-0.39419609  0.00226523]
[-0.39187719  0.0023189 ]
[-0.3895207  0.0023565]
[-0.38714289  0.00237781]
[-0.38476015  0.00238274]
[-0.38238885  0.0023713 ]
[-0.38004524  0.00234362]
[-0.3777453   0.00229994]
[-0.37550469  0.00224061]
[-0.3733386   0.00216609]
[-0.37126169  0.00207691]
[-0.36928794  0.00197374]
[-0.36743063  0.00185731]
[-0.3657022   0.00172843]
[-0.36411421  0.001588  ]
[-0.36267723  0.00143698]
[-0.36140082  0.00127641]
[-0.36029346  0.00110736]
[-0.35936248  0.00093098]
[-0.35861403  0.00074845]
[-0.35805307  0.00056097]
[-0.35768328  0.00036979]


In [37]:
inp = torch.tensor(np.random.rand(10,2),dtype=torch.float32)
tar = torch.tensor(np.random.rand(10,3),dtype=torch.float32)
criterion = nn.MSELoss()


In [38]:
out = dqn(inp)

In [39]:
out.shape

torch.Size([10, 3])

In [46]:
np.max(out.detach().numpy(),axis=1)

array([0.27805674, 0.33564025, 0.37856692, 0.31040233, 0.38727546,
       0.25115293, 0.42456454, 0.35666302, 0.25552255, 0.42150941],
      dtype=float32)

In [40]:
loss = criterion(out, tar)

In [61]:
d = deque(maxlen=50)

In [62]:
current_state = env.reset()
for eps in range(150):
    action = np.random.randint(0,3)
    new_state, reward, done, _ = env.step(action)
    d.append([current_state, action, reward, new_state, done])
    current_state = new_state

In [63]:
currents = []
news = []
rewards = []
actions = []

samples = random.sample(d,3)

for curr in samples:
    currents.append(curr[0])
currents = np.array(currents)

for new in samples:
    news.append(new[3])
news = np.array(news)

for reward in samples:
    rewards.append(reward[2])
rewards = np.array(rewards)

for action in samples:
    actions.append(action[1])

In [64]:
currents

array([[-0.35872627,  0.0041703 ],
       [-0.39517037, -0.00817855],
       [-0.72402762, -0.0013956 ]])

In [65]:
news

array([[-3.56742706e-01,  1.98356053e-03],
       [-4.05288482e-01, -1.01181093e-02],
       [-7.24008959e-01,  1.86583322e-05]])

In [66]:
actions

[0, 0, 1]

In [67]:
rewards

array([-1., -1., -1.])

In [68]:
next_Q = agent.targetNetwork(torch.tensor(news,dtype=torch.float32))
current_q = agent.trainNetwork(torch.tensor(currents,dtype=torch.float32))

new_q = rewards + 1*np.max(next_Q.detach().numpy(),axis=1)
print(current_q)
print('-----------------------------')

for i in range(len(actions)):
    current_q[i,actions[i]] = new_q[i]
print(current_q)

tensor([[-9.9953, -9.9969, -9.9971],
        [-9.9950, -9.9969, -9.9970],
        [-9.9953, -9.9971, -9.9973]], grad_fn=<AddmmBackward>)
-----------------------------
tensor([[-10.9952,  -9.9969,  -9.9971],
        [-10.9949,  -9.9969,  -9.9970],
        [ -9.9953, -10.9954,  -9.9973]], grad_fn=<CopySlices>)


In [101]:
env = gym.make('CartPole-v0')

In [151]:
current_state = env.reset()
while True:
    action = 1
    new_state, reward, done, _ = env.step(action)
    print(reward)
    d.append([current_state, action, reward, new_state, done])
    current_state = new_state
#     if done:
#         break
    env.render()
env.close()

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


KeyboardInterrupt: 

In [103]:
env.observation_space

Box(4,)

In [124]:
env.close()