In [1]:
import numpy as np
from Policy_Agent import ActorCritic
import matplotlib.pyplot as plt
import torch
from torchvision.models.feature_extraction import create_feature_extractor
import math
from torch.autograd import Variable

In [2]:
def mkDataSet(data_size, data_length=50, freq=60.):
    train_x = []
    train_t = []

    for offset in range(data_size):
        train_x.append([[math.sin(2*math.pi*(offset+i)/freq)+np.random.normal(loc=0.0, scale=0.015)] for i in range(data_length)])
        train_t.append([math.sin(2*math.pi*(offset+data_length)/freq)])

    return train_x, train_t #train_x=(data_size, data_length, 1), train_t=(data_size, 1)

training_size = 500
data_length=50
epoch_num = 100
hidden_size = 5

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

True


In [4]:
agent = ActorCritic(data_length, device)
reward_history = []
loss_pi_history = []
loss_v_history = []

In [5]:
for epoch in range(epoch_num):
    total_reward = 0
    total_loss_pi = 0
    total_loss_v = 0
    train_x, train_t = mkDataSet(training_size)
    for ep in range(training_size-1):
        state = torch.tensor([train_x[ep]]).to(device)
        done = False

        action, probs = agent.get_action(state)
        next_state = torch.tensor([train_x[ep+1]]).to(device)

        if action == 0:
            reward = next_state[0][-1] - state[0][-1]
        else:
            reward = torch.tensor([0.]).to(device)

        total_reward += reward.to('cpu')

        loss_v, loss_pi = agent.update(state, torch.clip(probs, 1e-8, 1-1e-8), reward.clone(), next_state)
        total_loss_v += loss_v
        total_loss_pi += loss_pi

        state = next_state
        
    loss_v_history.append(total_loss_v)
    loss_pi_history.append(total_loss_pi)
    reward_history.append(total_reward)

#    if epoch % 10 ==0:
    print(epoch, total_loss_v, total_loss_pi, total_reward)

0 1.164439817129412 0.8738823785406566 tensor([0.5137])
1 0.8716311198060396 0.2316014367970638 tensor([1.4707])
2 0.8952502487953033 0.3747714642668143 tensor([1.2409])
3 0.9795194690981397 0.9910590088256868 tensor([1.3737])
4 0.8893827543793984 -1.2356137996248435 tensor([-0.6891])
5 1.0569491609995616 0.7355725899687968 tensor([0.9184])
6 0.9095665840411309 -0.6788822570679258 tensor([0.5156])
7 0.9288165920233098 1.2398696361051407 tensor([1.0107])
8 0.8503467613261293 0.02837352576898411 tensor([1.8958])
9 0.8995096144229067 0.17019108691965812 tensor([1.5941])
10 1.0237795188751733 -1.224956468315213 tensor([-0.3448])
11 0.8898369985256844 1.1255795018005301 tensor([1.3127])
12 0.9096530876513294 -0.47070199668542045 tensor([1.2482])
13 0.8693797200375943 -0.05434943099680822 tensor([0.3736])
14 0.9049937016523728 0.23871739843161777 tensor([1.1052])
15 0.8996834759305123 0.8016598404501565 tensor([1.9596])
16 0.9284801492929873 -0.44345974349198514 tensor([1.0649])
17 0.9111922

KeyboardInterrupt: 

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(np.array(reward_history), color = "r")

fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(np.array(loss_pi_history), color = "r")

fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(np.array(loss_v_history), color = "r")