## Imports

In [146]:
%load_ext autoreload
%autoreload 2

from utils.prepare_data import prepare_short_data, prepare_full_data
from environment_TD import Environment
from datetime import timedelta
from models.base_model import Base_model
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
from typing import List, Optional, Iterator
from tqdm.notebook import tqdm

import numpy as np

import torch
import torch.nn as nn
from torch.nn.parameter import Parameter

## Constants

In [3]:
# Load the price table from the Excel workbook and build the trading
# environment for a single ticker ('AFKS') with a starting balance of 1000.
# NOTE(review): the currency/unit of `initial_money` is not visible here — confirm
# against the Environment implementation.
df = prepare_full_data('data/Data_RU.xlsx')
env = Environment(data=df, stock_name='AFKS', initial_money=1000)

In [4]:
# Whatever `Environment.first_date()` reports — presumably the earliest date in the data; confirm.
start_date = env.first_date()

In [5]:
# Number of rows in the environment's data table; used below as the exclusive
# upper bound of the rollout loop and of the plot ranges.
day_count = env.data.shape[0]

# TD model

In [139]:
class ActorModel(nn.Module):
    """Feed-forward actor network: state features in, one score per action out"""

    def __init__(self,
                 actor_layer_size: int = 100,
                 actor_in_size: int = 10,
                 actor_out_size: int = 3,
                ) -> None:
        """
        Build a perceptron with two hidden LeakyReLU layers
        :param actor_layer_size: width of both hidden layers
        :param actor_in_size: number of input features per state
        :param actor_out_size: number of output scores per state
        """
        super().__init__()
        # Layers are created in input-to-output order so that the random
        # initialisation consumes the RNG in a fixed, reproducible sequence.
        layers = [
            nn.Linear(actor_in_size, actor_layer_size),
            nn.LeakyReLU(),
            nn.Linear(actor_layer_size, actor_layer_size),
            nn.LeakyReLU(),
            nn.Linear(actor_layer_size, actor_out_size),
        ]
        self.actor = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Score every action for each input state
        :param x: (..., actor_in_size) state tensor
        :return: (..., actor_out_size) raw (unnormalised) action scores
        """
        return self.actor(x)


class CriticModel(nn.Module):
    """Feed-forward critic network producing one bounded, scaled value per state"""

    def __init__(self, critic_layer_size: int = 70,
                       critic_in_size: int = 10, 
                       critic_out_size: int = 1, 
                       scale_factor: int = -5000) -> None:
        """
        Build a perceptron with two hidden LeakyReLU layers plus a Tanh squashing head
        :param critic_layer_size: width of both hidden layers
        :param critic_in_size: number of input features per state
        :param critic_out_size: number of outputs per state
        :param scale_factor: multiplier applied to the squashed output
        """
        super().__init__()
        hidden = critic_layer_size
        self.critic = nn.Sequential(
            nn.Linear(critic_in_size, hidden),
            nn.LeakyReLU(),
            nn.Linear(hidden, hidden),
            nn.LeakyReLU(),
            nn.Linear(hidden, critic_out_size),
        )
        self.act = nn.Tanh()
        self.scale_factor = scale_factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Evaluate the critic
        :param x: (..., critic_in_size) state tensor
        :return: (..., critic_out_size) tensor equal to tanh(-y^2) * scale_factor,
                 where y is the raw network output
        """
        raw = self.critic(x)
        # -y^2 is never positive, so tanh maps it into (-1, 0]; with the default
        # negative scale factor the final value therefore lies in [0, 5000).
        squashed = self.act(-(raw ** 2))
        return squashed * self.scale_factor

In [140]:
class CriticTD(nn.Module):
    """
    Critic Temporal Difference model: computes the one-step TD loss used to train the critic
    """

    def __init__(self, actor ,
                       critic, #: CriticModel, 
                       transition, #: callable(state, action, indexes, env) -> (next_state, reward)
                       env,
                       satellite_discount: float = .98) -> None:
        """
        Init method
        :param actor: Actor nn model (used only to pick actions, never trained here)
        :param critic: Critic nn model (the only part optimised through this module)
        :param transition: callable invoked as transition(state, action, indexes, env)
                           and returning (next_state, reward)
        :param env: environment object forwarded untouched to `transition`
        :param satellite_discount: discount factor applied to the next state's value
        """
        super().__init__()
        self.actor = actor
        self.critic = critic
        self.transition = transition
        self.env = env
        self.satellite_discount = satellite_discount
        self.loss = nn.MSELoss()

    def forward(self, state: torch.Tensor, indexes) -> torch.Tensor:
        """
        Main forward method
        :param state: batch of states accepted by both the actor and the critic
        :param indexes: per-state indexes forwarded untouched to `transition`
        :return: scalar MSE between critic(state) and the TD target
        """
        # The target is treated as a constant: no gradient flows through the
        # actor, the transition or the bootstrap critic evaluation.
        with torch.no_grad():
            action = self.actor(state)
            next_state, reward = self.transition(state, action, indexes, self.env)
            td_target = reward + self.satellite_discount * self.critic(next_state)
        value = self.critic(state)
        # NOTE(review): `reward` must broadcast against critic(next_state) so that
        # td_target has the same shape as `value`; a (batch,) reward next to a
        # (batch, 1) value would silently broadcast to (batch, batch) — confirm
        # the shape returned by `transition`.
        return self.loss(value, td_target)

    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
        """
        Expose only the critic's parameters so an optimiser built from this
        module never updates the actor
        :param recurse: forwarded to the critic, mirroring nn.Module.parameters
        """
        return self.critic.parameters(recurse=recurse)


class ActorImprovedValue(nn.Module):
    """
    Actor objective: negated mean of the one-step improved value
    reward + discount * critic(next_state) under the actor's own action
    """

    def __init__(self, actor, #: ActorModel,
                       critic, #: CriticModel, 
                       transition, #: callable(state, action, indexes, env) -> (next_state, reward)
                       env,
                       satellite_discount: float = .98) -> None:
        """
        Init method
        :param actor: Actor nn model (the only part optimised through this module)
        :param critic: Critic nn model used to value the next state
        :param transition: callable invoked as transition(state, action, indexes, env)
                           and returning (next_state, reward)
        :param env: environment object forwarded untouched to `transition`
        :param satellite_discount: discount factor applied to the next state's value
        """
        super().__init__()
        self.critic = critic
        self.actor = actor
        # Use the injected transition function; `env.transition` is called
        # elsewhere as (action, index), which is incompatible with the
        # four-argument call made in forward().
        self.transition = transition
        self.env = env
        self.satellite_discount = satellite_discount

    def forward(self, state, index_):
        """
        Main forward method
        :param state: batch of states accepted by both the actor and the critic
        :param index_: per-state indexes forwarded untouched to `transition`
        :return: scalar tensor, the negated mean improved value (a loss to minimise)
        """
        action = self.actor(state)
        next_state, reward = self.transition(state, action, index_, self.env)
        improved_value = reward + self.satellite_discount * self.critic(next_state)
        # NOTE(review): the actor only receives gradients if `transition` is
        # differentiable with respect to `action`; an argmax-based or
        # NumPy-based transition cuts the graph and leaves the actor untrained.
        return -improved_value.mean()

    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
        """
        Expose only the actor's parameters so an optimiser built from this
        module never updates the critic
        :param recurse: forwarded to the actor, mirroring nn.Module.parameters
        """
        return self.actor.parameters(recurse=recurse)

In [141]:
def td_transition(state, action, indexes, env):
    """
    Turn actor scores into discrete actions and step the environment in batch
    :param state: unused; kept so the function matches the
                  transition(state, action, indexes, env) call signature
    :param action: (..., n_actions) tensor of actor scores
    :param indexes: per-state indexes forwarded untouched to env.transition_batch
    :param env: object providing transition_batch(action, indexes)
    :return: (new_state, reward) exactly as returned by env.transition_batch
    """
    # argmax over the last axis yields one class index per state (without `dim`
    # it would collapse the whole batch into a single flat index). Subtracting
    # 1 shifts class indexes {0, 1, 2} to {-1, 0, 1}.
    # NOTE(review): presumably -1/0/1 mean sell/hold/buy — confirm in Environment.
    # detach().cpu() makes the NumPy conversion work for CUDA tensors as well.
    action = (torch.argmax(action, dim=-1) - 1).detach().cpu().numpy()
    new_state, reward = env.transition_batch(action, indexes)
    return new_state, reward


In [142]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Both networks take 102 input features per state.
actor = ActorModel(actor_in_size=102).to(device)
critic = CriticModel(critic_in_size=102).to(device)

# Two views over the same actor/critic pair: one yields the critic's TD loss,
# the other the actor's objective.
critic_temporal_difference = CriticTD(
    actor=actor, critic=critic, transition=td_transition, env=env,
).to(device)
actor_improved_value = ActorImprovedValue(
    actor=actor, critic=critic, transition=td_transition, env=env,
).to(device)


# Baseline model used by the rollout loop further down.
model = Base_model()
# the model still has to be trained at this point

In [143]:
# Critic optimiser: Adam with a very small step size and light L2 regularisation.
optimizer_critic_kind = torch.optim.Adam
optimizer_critic_parameters = dict(lr=5e-6, weight_decay=1e-5)

# Actor optimiser: Adam with a larger step size and the same regularisation.
optimizer_actor_kind = torch.optim.Adam
optimizer_actor_parameters = dict(lr=1e-4, weight_decay=1e-5)

# Gradient steps per epoch and states sampled per step, for each network.
critic_iterations, critic_batch_size = 2000, 4000
actor_iterations, actor_batch_size = 2000, 2000

# Number of alternating critic/actor rounds.
epochs = 5

In [144]:
def get_random_state(batch_size: int,
                     env,
                     index_min: int = 10,
                     index_max: int = 11, 
                     ):
    """
    Sample random day indexes and build the matching flat state vectors
    :param batch_size: number of states to sample
    :param env: object whose observation(index) returns a mapping with a
                'prices' DataFrame (containing a 'Date' column) plus 'money'
                and 'stocks_num' entries
    :param index_min: lowest index that can be drawn (inclusive)
    :param index_max: upper bound of the draw (exclusive); with the defaults
                      every sample is index 10
    :return: (states, indexes) where states is a float32 tensor with one row
             per sample and indexes is the tensor of sampled indexes
    """
    rand_indexes = np.random.randint(index_min, index_max, batch_size)

    rows = []
    for index in rand_indexes:
        state_dict = env.observation(index)
        prices = state_dict['prices'].drop(['Date'], axis=1).values.flatten()
        # np.ravel accepts both scalars and sequences, whereas a 0-d array
        # (np.array(scalar)) cannot be passed to np.concatenate.
        rows.append(np.concatenate([prices,
                                    np.ravel(state_dict['money']),
                                    np.ravel(state_dict['stocks_num'])]))

    # Stack once in NumPy and convert in a single step: building a tensor
    # straight from a list of ndarrays goes through a slow per-element path.
    if rows:
        states = torch.from_numpy(np.stack(rows).astype(np.float32))
    else:
        states = torch.empty(0, dtype=torch.float32)

    return states, torch.tensor(rand_indexes)


def critic_epoch(optimizer: torch.optim.Optimizer,
                 model: CriticTD, 
                 iterations: int,
                 env,
                 batch_size: int) -> List[float]:
    """
    Run one epoch of critic updates on freshly sampled random states
    :param optimizer: optimiser stepping the critic's parameters
    :param model: module returning the TD loss for (states, indexes)
    :param iterations: number of gradient steps to perform
    :param env: environment used to sample states
    :param batch_size: number of states sampled per step
    :return: per-step losses (each entry is the detached loss as a NumPy value)
    """
    history = []
    for _ in tqdm(range(iterations), desc="Critic epoch"):
        optimizer.zero_grad()

        # Draw from every index up to (but excluding) the last data row.
        states, day_indexes = get_random_state(batch_size, env,
                                               index_max=env.data.shape[0]-1)
        states = states.to(device)
        day_indexes = day_indexes.to(device)

        td_loss = model(states, day_indexes)
        td_loss.backward()
        optimizer.step()

        history.append(td_loss.detach().cpu().numpy())

    print(f"Critic mean loss: {np.mean(history)}")
    return history

def actor_epoch(optimizer: torch.optim.Optimizer,
                 model: "ActorImprovedValue", 
                 iterations: int, 
                 env,
                 batch_size: int) -> List[float]:
    """
    Run one epoch of actor updates on freshly sampled random states
    :param optimizer: optimiser stepping the actor's parameters
    :param model: module returning the actor objective for (states, indexes);
                  its output is minimised
    :param iterations: number of gradient steps to perform
    :param env: environment used to sample states
    :param batch_size: number of states sampled per step
    :return: per-step objective values (each entry is the detached value as a
             NumPy value)
    """
    values = []
    for iteration in tqdm(range(iterations), "Actor epoch"):
        # Draw from every index up to (but excluding) the last data row.
        X, indexes = get_random_state(batch_size, env, index_max=env.data.shape[0]-1)
        X, indexes = X.to(device), indexes.to(device)

        optimizer.zero_grad()
        improved_value = model(X, indexes)
        improved_value.backward()
        optimizer.step()
        values.append(improved_value.detach().cpu().numpy())
    print(f"Actor mean value: {np.mean(values)}")
    
    return values

In [145]:
# The actor optimiser is created once, so its Adam state persists across epochs.
optimizer_actor = optimizer_actor_kind(actor_improved_value.parameters(), **optimizer_actor_parameters)

for _ in tqdm(range(epochs), "Actor-Critic learning", leave=False):
    # NOTE(review): the critic optimiser is rebuilt every epoch, which discards
    # Adam's running moment estimates — confirm this reset is intentional.
    optimizer_critic = optimizer_critic_kind(critic_temporal_difference.parameters(), **optimizer_critic_parameters)
    # The critic trains with its own batch size (previously the actor's batch
    # size was passed here, leaving critic_batch_size unused).
    losses = np.array(critic_epoch(optimizer_critic,
                                   critic_temporal_difference,
                                   critic_iterations,
                                   env,
                                   critic_batch_size))
    actor_epoch(optimizer_actor,
                actor_improved_value,
                actor_iterations,
                env,
                actor_batch_size)
    # Show this epoch's critic loss curve.
    plt.plot(losses)
    plt.show()

Actor-Critic learning:   0%|          | 0/5 [00:00<?, ?it/s]

Critic epoch:   0%|          | 0/2000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Replay the data from index 10 up to day_count - 1, letting `model` choose
# an action at every step.
env.reset()
for i in range(10, day_count):
    obs = env.observation(i)
    action = model.predict(obs)
    env.transition(action, i)
    # Lightweight progress indicator: report every 100th index.
    if not i % 100:
        print(i)

In [None]:
# Echo env.money as the cell output.
env.money

In [None]:
# Plot env.money against indexes 9 .. day_count - 1.
# NOTE(review): the x-range starts at 9 while the rollout loop starts at 10 —
# presumably env.money holds one initial entry before the first transition; confirm lengths match.
plt.plot(range(9, day_count), env.money)

In [None]:
# Plot env.stocks against indexes 9 .. day_count - 1.
# NOTE(review): the x-range starts at 9 while the rollout loop starts at 10 —
# presumably env.stocks holds one initial entry before the first transition; confirm lengths match.
plt.plot(range(9, day_count), env.stocks)