In [1]:
from tqdm import tqdm_notebook as tqdm
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Q_RNN(nn.Module):
    """Recurrent Q-network: an ``nn.RNN`` (tanh) followed by a two-layer head.

    The head maps every RNN output vector to 3 values; the rest of this
    notebook treats them as the Q-values of actions 1, 2 and 3.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
    """
    def __init__(self,input_size,hidden_size,num_layers):
        super(Q_RNN, self).__init__()
        self.r1=nn.RNN(input_size, hidden_size,num_layers, nonlinearity='tanh')
        self.fc1 = nn.Linear(hidden_size,8)
        self.fc2 = nn.Linear(8,3)

    def forward(self,X0,num_layers=None,hidden_size=None):
        """Run the network over a whole sequence starting from a zero state.

        Args:
            X0: Input of shape ``(seq_len, batch, input_size)`` (the RNN is
                not ``batch_first``).
            num_layers: Kept for backward compatibility; defaults to the
                number of layers the RNN was built with.
            hidden_size: Kept for backward compatibility; defaults to the
                hidden size the RNN was built with.

        Returns:
            Tuple ``(Q, hn, q)``: ``Q`` has shape ``(seq_len, batch, 3)``,
            ``hn`` is the final hidden state returned by the RNN, and ``q``
            is ``Q`` at the last time step, shape ``(batch, 3)``.
        """
        if num_layers is None:
            num_layers = self.r1.num_layers
        if hidden_size is None:
            hidden_size = self.r1.hidden_size
        # Build the initial state on the input's device/dtype so the module
        # keeps working after .to(device) / .double().
        h0 = torch.zeros(
            num_layers, X0.shape[1], hidden_size,
            dtype=X0.dtype, device=X0.device,
        )
        output, hn = self.r1(X0, h0)
        Q = self.fc1(output)
        Q = F.leaky_relu(Q, 0.1, True)
        Q = self.fc2(Q)
        q = Q[-1,:,:]
        return Q,hn,q

In [3]:
def setup(lr=2e-2,input_size=2, hidden_size=4,num_layers=1):
    model = Q_RNN(input_size,hidden_size,num_layers)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

In [4]:
def e_greedy(S,e=1e-2,num_layers=1,hidden_size=4):
    """Pick an action in {1, 2, 3} epsilon-greedily from the global ``model``.

    The Q-values are taken from the last time step of the last batch entry.
    With probability ``1 - e`` the arg-max action is chosen (action 1 wins
    whenever index 0 attains the maximum); otherwise an action is drawn
    uniformly from {1, 2, 3}.

    Args:
        S: Observation sequence, array-like of shape
            ``(seq_len, batch, input_size)``.
        e: Exploration probability.
        num_layers: Forwarded to the model.
        hidden_size: Forwarded to the model.

    Returns:
        Tuple ``(a, V)``: ``a`` is the chosen action as a float32 scalar
        tensor, ``V`` the three Q-values the choice was based on.
    """
    S = torch.tensor(S, dtype=torch.float32)
    # Action selection needs no gradients; skip building an autograd graph.
    with torch.no_grad():
        _, _, V = model(S, num_layers, hidden_size)
    V = V[-1, :]
    v, ind = V.max(0)
    if np.random.rand() <= 1 - e:
        # Equivalent to the original tie-break: if index 0 attains the
        # maximum (alone or tied) choose action 1, else the arg-max.
        if V[0] == v:
            a = 1
        else:
            a = ind.item() + 1
    else:
        a = np.random.randint(1, 4)
    a = torch.tensor(a, dtype=torch.float32)
    # The action is always a finite integer, so check the Q-values instead:
    # that is where a diverged network actually shows up.
    if torch.isnan(V).any():
        print('NNNNNNNAn')
    return a, V

In [5]:
def Environment(f=0.1,R_plus=1,R_minus=20,e=1e-2,max_toss=8000,hidden_size=8,num_layers=2):
    """Play one episode of the coin-identification game with the global ``model``.

    A hidden coin type (2 or 3) is drawn uniformly. At every step the agent
    (``e_greedy`` on the one-hot observation history) either picks action 1,
    which produces one more toss, or guesses a coin type, which ends the
    episode. A toss shows the coin's "likely" side with probability ``1 - f``
    (index 1 for coin 3, index 0 for coin 2) and the other side otherwise.

    Args:
        f: Probability that a toss shows the unlikely side.
        R_plus: Reward for a correct guess.
        R_minus: Magnitude of the penalty for a wrong guess.
        e: Exploration probability passed to ``e_greedy``.
        max_toss: Maximum number of agent decisions in the episode.
        hidden_size: Forwarded to ``e_greedy``.
        num_layers: Forwarded to ``e_greedy``.

    Returns:
        Tuple ``(replay, S_list, Rplay, o_list)``:
        ``replay`` is a list of ``[state, action, reward, next_state]`` built
        from rows of ``S_list`` (rewards here are NOT reshaped);
        ``S_list`` holds the cumulative side counts, shape ``(tosses+1, 1, 2)``;
        ``Rplay`` is a list of ``[step, action, reward, next_step]`` in which
        the reward of every entry but the last is replaced by
        ``-Reward / tosses**2 * (tosses - 1)``, ``Reward`` being the reward
        of the last step;
        ``o_list`` holds the one-hot observations (first row all zeros).
    """
    model.eval()
    replay = []
    Rplay = []
    coin = np.random.randint(2, 4)
    # Column that lights up with probability 1-f for this coin.
    likely = 1 if coin == 3 else 0

    S_list = np.zeros([1, 1, 2])
    o_list = np.zeros([1, 1, 2])
    Reward = 0  # stays defined even if max_toss <= 0 and the loop never runs
    i = 0
    while i < max_toss:
        a, _ = e_greedy(o_list, e=e, num_layers=num_layers, hidden_size=hidden_size)
        i = i + 1
        if a != 1:
            # A guess terminates the episode.
            Reward = R_plus if a == coin else -R_minus
            Rplay.append([i-1, a, Reward, i-1])
            replay.append([S_list[-1, :], a, Reward, S_list[-1, :]])
            break

        # Action 1: toss once more and record what was seen.
        side = likely if np.random.rand() < 1 - f else 1 - likely
        obs = np.zeros([1, 1, 2])
        obs[0, 0, side] = 1
        o_list = np.concatenate((o_list, obs), axis=0)
        # S_list accumulates counts: new row = previous row + observation.
        S_list = np.concatenate((S_list, S_list[-1:] + obs), axis=0)
        Reward = 0
        Rplay.append([i-1, a, Reward, i])
        replay.append([S_list[-2, :], a, Reward, S_list[-1, :]])

    # Spread a shaped reward over every step before the last one. With more
    # than one entry in Rplay at least one toss happened, so tosses >= 1.
    if len(Rplay) > 1:
        tosses = len(S_list) - 1
        shaped = -Reward / tosses**2 * (tosses - 1)
        for x in Rplay[:-1]:
            x[2] = shaped
    return replay, S_list, Rplay, o_list

In [6]:
def get_target(Rplay,Q):
    """Build the regression target for one episode.

    The target equals the current Q-values everywhere except at the action
    actually taken at each step, where it is set to that step's reward.

    Args:
        Rplay: Sequence of ``[step, action, reward, next_step]`` entries;
            ``action`` is 1-based and may be a scalar tensor or a plain number.
        Q: Q-values indexed as ``[step, batch, action]``.

    Returns:
        A new tensor shaped like ``Q``. It is a detached copy: ``Q`` itself is
        left untouched and no gradient flows back through the target.
    """
    # Copy instead of aliasing: writing into Q would modify the network
    # output in place and keep the target attached to the autograd graph.
    Target = Q.detach().clone()
    for i, R in enumerate(Rplay):
        Target[i, :, int(R[1]) - 1] = R[2]
    return Target

In [7]:
def train_epoch(model, Rplay,A_list,optimizer,hidden_size=4,num_layers=1):
    """Do one gradient step on the Q-network for a single episode.

    The target is the network's own current output with the entries of the
    taken actions replaced by the rewards in ``Rplay`` (see ``get_target``);
    the loss is the MSE between a fresh forward pass and that target.

    Args:
        model: Network called as ``model(x, num_layers, hidden_size)`` and
            returning a tuple whose first item is the per-step Q-values.
        Rplay: Per-step ``[step, action, reward, next_step]`` entries.
        A_list: Observation sequence, array-like of shape
            ``(seq_len, batch, input_size)``.
        optimizer: Optimizer over ``model``'s parameters.
        hidden_size: Forwarded to the model.
        num_layers: Forwarded to the model.

    Returns:
        List containing the single loss value of this step.
    """
    A_list = torch.tensor(A_list, dtype=torch.float32)

    # The target is a fixed label, so compute it without tracking gradients.
    # This also removes the need for retain_graph=True in backward().
    with torch.no_grad():
        Q, _, _ = model(A_list, num_layers, hidden_size)
    Target = get_target(Rplay, Q)

    model.train()
    optimizer.zero_grad()
    Q, _, _ = model(A_list, num_layers, hidden_size)
    loss = F.mse_loss(Q, Target)
    loss.backward()
    optimizer.step()

    return [loss.item()]

In [8]:

# Seed both RNGs used by this notebook (NumPy drives the environment and the
# exploration, PyTorch the weight initialisation) so that
# Restart Kernel -> Run All reproduces the same run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# The network size chosen here must match the hidden_size / num_layers
# passed to Environment and train_epoch in the cells below.
model, optimizer = setup(hidden_size=8, num_layers=2)

In [None]:
# Training: each iteration plays one episode with the current network and
# takes one gradient step on it. Progress is reported every 100 iterations.
N = 10000
for epoch in tqdm(range(N)):
    Rp, al = Environment(hidden_size=8, num_layers=2)[2:]
    epoch_losses = train_epoch(
        model, Rp, al, optimizer,
        hidden_size=8, num_layers=2,
    )
    if epoch % 100 != 0:
        continue
    print(f"Average loss in epoch {epoch}: {np.mean(epoch_losses):.5f}")

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

Average loss in epoch 0: 136.90639
Average loss in epoch 100: 1.41397
Average loss in epoch 200: 0.19934
Average loss in epoch 300: 0.10408
Average loss in epoch 400: 0.50911
Average loss in epoch 500: 0.95753
Average loss in epoch 600: 0.22325
Average loss in epoch 700: 0.25765
Average loss in epoch 800: 1.17792
Average loss in epoch 900: 3.91364
Average loss in epoch 1000: 1.01740
Average loss in epoch 1100: 0.10024
Average loss in epoch 1200: 0.22798
Average loss in epoch 1300: 0.48575
Average loss in epoch 1400: 0.11087
Average loss in epoch 1500: 0.15894
Average loss in epoch 1600: 0.05780
Average loss in epoch 1700: 0.03770


In [None]:
# Evaluation: play 1000 episodes and average the reward of the final step.
# NOTE(review): the message below says "Greedy", but episodes are played with
# e=0.1, i.e. 10% random actions - confirm which of the two is intended.
R = []
for i in tqdm(range(1000)):
    replay = Environment(e=0.1)[0]
    r = replay[-1]  # last transition: [state, action, reward, next_state]
    R.append(r[2])
print("Evaluated Average Reward from Greedy policy of the Q net ",np.mean(R))

In [None]:
# Persist the whole trained module (pickled object, not just a state_dict).
# NOTE(review): this is an absolute, machine-specific path; the directory must
# already exist on the machine running the notebook.
model_path = "C:/Users/YYC/Desktop/DQN_RLcoin/models/rnn.pt"
torch.save(model, model_path)