In [1]:
from tqdm import tqdm_notebook as tqdm
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Q_net(nn.Module):
    def __init__(self):
        super(Q_net, self).__init__()
        self.fc1 = nn.Linear(2,20)
        self.fc2 = nn.Linear(3,2)
        self.fc5 = nn.Linear(20,3)
    def forward(self, x):
        x = F.leaky_relu(self.fc1(x),0.1, True)
        x = self.fc5(x)
        return x

In [3]:
def train_epoch(model, device, replay, optimizer):
    gamma=0.9
    losses=list()
    
    for i, (S_t,A_t,R_tp,S_tp) in enumerate(replay):
        # processing replay    
        #if j==0:
            #print('S_t,A_t,R_tp,S_tp:',S_t,A_t,R_tp,S_tp)
        _,V=e_greedy(S_tp,e=0)
        _,target_new=e_greedy(S_t,e=0)
        
        act_ind=np.int32(A_t.cpu().numpy()-1)
        
        if torch.all(torch.eq(target_new,V)):
            r=R_tp
        else:
            r=R_tp+gamma*V.max()
        target_new[act_ind]=R_tp
        target_new=target_new.to(device)
        if i==0:
            target=target_new
            s=torch.tensor(S_t,dtype=torch.float32).cuda()
            S=Variable(s,requires_grad=True)
            data=S
        else:
            s=torch.tensor(S_t,dtype=torch.float32).cuda()
            S=Variable(s,requires_grad=True)
            data=torch.cat((data,S))
            target=torch.cat((target,target_new))
    target=target.view(i+1,3)
    data=data.view(i+1,2)
    if i+1>30:
        #print('Batch Size',30)
        target=target[-31:-1,:]
        data=data[-31:-1,:]
    #else:
        #print('Batch Size',i+1)
    model.train()
    for j in range(1):
        # train        
        optimizer.zero_grad()
        output = model.forward(data)
        loss = F.mse_loss(output, target)
        #print(loss)
        losses.append(loss.item())
        loss.backward(retain_graph=True)
        optimizer.step()
    
    return losses


In [4]:
def setup(lr=1e-2):
    device = torch.cuda.current_device()
    model = Q_net().to(device)
    optimizer = optim.RMSprop(model.parameters(), lr=lr)
    return device, model, optimizer

In [5]:
def evaluate(d=10):
    d=d+1
    V_1=np.zeros([d,d])
    V_2=np.zeros([d,d])
    V_3=np.zeros([d,d])
    for i in range(d):
        for j in range(d):
            a,V=e_greedy([i,j],e=0)
            V_1[i,j]=V[0].item()
            V_2[i,j]=V[1].item()
            V_3[i,j]=V[2].item()
    maX=np.max([np.max(V_1),np.max(V_2),np.max(V_3)])
    Min=np.min([np.min(V_1),np.min(V_2),np.min(V_3)])
    
    #V_1=(V_1-Min)/(maX-Min)
    #V_2=(V_2-Min)/(maX-Min)
    #V_3=(V_3-Min)/(maX-Min)
    
    ax1=plt.subplot(2,2,1)    
    plt.imshow(V_1,cmap=plt.get_cmap('viridis'),
               vmin=Min, vmax=maX)   
    ax2=plt.subplot(2,2,2)    
    plt.imshow(V_2,cmap=plt.get_cmap('viridis'),
               vmin=Min, vmax=maX)
    ax3=plt.subplot(2,2,3)    
    plt.imshow(V_3,cmap=plt.get_cmap('viridis'), 
               vmin=Min, vmax=maX)

In [6]:
def e_greedy(S,e=1e-2):
    s=torch.tensor(S,dtype=torch.float32)
    S=Variable(s,requires_grad=True)
    S=S.to(device)
    V=model.forward(S)
    v,ind = V.max(0)
    aa=[i for i, j in enumerate(V) if j == v]
    a=0
    if np.random.rand(1)<=1-e:
        if len(aa)==1:
            a=ind+1
        elif 0 in aa:
            a=1
        else:
            a=ind+1
    else:
        a=np.random.randint(1,4)
    a=torch.tensor(a,dtype=torch.float32)
    if np.isnan(a):
        print('NNNNNNNAn')
    return a,V

In [7]:
def Environment(f=1/3,R_plus=1,R_minus=20,e=1e-2,max_toss=8000):
    model.eval()
    S_0=np.array([0,0])
    replay=[]
    coin=np.random.randint(2,4)
    #print('coin',coin)
    if coin==3:
        S_list=S_0
        i=0
        while i<max_toss:
            if i==0:
                a,_=e_greedy(S_list,e=e)
            else:
                a,_=e_greedy(S_list[-1,:],e=e)
            #print(a)
            i=i+1
            if a!=1:
                if a==coin:
                    Reward=R_plus/max((len(S_list)-1),1)
                else:
                    Reward=-R_minus/max((len(S_list)-1),1)
                #print(S_list)
                if S_list.ndim==1:
                    replay.append([S_list,a,Reward,S_list])
                else:
                    replay.append([S_list[-1,:],a,Reward,S_list[-1,:]])
                break
            else:
                #rint(S_list)
                if np.random.rand(1)<1-f:
                    S_list=np.vstack((S_list,np.array([0,1])))
                    S_list[-1,:]=S_list[-1,:]+S_list[-2,:]
                else:
                    S_list=np.vstack((S_list,np.array([1,0])))
                    S_list[-1,:]=S_list[-1,:]+S_list[-2,:]
                Reward=0                
                replay.append([S_list[-2,:],a,Reward,S_list[-1,:]])
    else:
        S_list=S_0
        i=0
        while i<max_toss:
            if i==0:
                a,_=e_greedy(S_list,e=e)
            else:
                a,_=e_greedy(S_list[-1,:],e=e)
            #print(a)
            i=i+1
            if a!=1:
                if a==coin:
                    Reward=R_plus/max((len(S_list)-1),1)
                else:
                    Reward=-R_minus/max((len(S_list)-1),1)
                if S_list.ndim==1:
                    replay.append([S_list,a,Reward,S_list])
                else:
                    replay.append([S_list[-1,:],a,Reward,S_list[-1,:]])  
                break
            else:
                #print(S_list)
                if np.random.rand(1)<1-f:
                    S_list=np.vstack((S_list,np.array([1,0])))
                    S_list[-1,:]=S_list[-1,:]+S_list[-2,:]
                else:
                    S_list=np.vstack((S_list,np.array([0,1])))
                    S_list[-1,:]=S_list[-1,:]+S_list[-2,:]
                Reward=0
                replay.append([S_list[-2,:],a,Reward,S_list[-1,:]])
    return replay,S_list
                

In [8]:
device, model, optimizer=setup()
model

AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

In [None]:
N=20000
for epoch in tqdm(range(N)):
    replay,_=Environment(e=0.3-0.1/(N-epoch))
    epoch_losses = train_epoch(model, device, replay, optimizer)
    if epoch%100==0:
        print(f"Average loss in epoch {epoch}: {np.mean(epoch_losses):.5f}")
        a_2,_=e_greedy([10,0],e=0)
        a_3,_=e_greedy([0,10],e=0)
        print(a_2.item(),a_3.item())
        if np.mean(epoch_losses)<=0.001 and (a_2.item()==2.0 and a_3.item()==3.0):
            print('Yeah!')
            torch.save(model, "/home/yucheng/Desktop/coins/Torch_DQN_for_RLcoin/models/3_layer_MLP_"+str(epoch)+".pt")

In [None]:
ep=0.33333
n=2
model=torch.load("/home/yucheng/Desktop/coins/Torch_DQN_for_RLcoin/models/3_layer_MLP_"+str(ep)+".pt")
evaluate(d=12)
#3 is [0,1], 2 is [1,0] 
print("No.",ep+N*n)

In [None]:
R=list()
for i in tqdm(range(1000)):
    replay,S_list=Environment(e=0)
    r=replay[-1]
    R.append(r[2])
print("Evaluated Average Reward from Greedy policy of the Q net ",np.mean(R))