## LSTM state encoder
P.S. This snippet uses the library version of the learning function; you can see the visualization in TensorBoard.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

In [9]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pickle
import gc
import json
import h5py

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline


# == recnn ==
import sys
sys.path.append("../../")
import recnn

# Device handle; the encoder, the networks and every batch below are moved to it.
cuda = torch.device('cuda')

# --- run configuration ---
# batch_size is handed to recnn.env.SeqEnv; frame_size, n_epochs and
# plot_every are not read by the cells visible in this notebook.
frame_size, batch_size = 10, 25
n_epochs, plot_every = 100, 30
# Global update counter, passed to ddpg_update as `step`.
step = 0
# --- end of run configuration ---

# Hook tqdm into pandas (adds the progress_* variants of apply).
tqdm.pandas()

In [10]:
# Recurrent state encoder: 129 input features per step, 256 hidden units,
# inputs laid out batch-first.
state_encoder = nn.LSTM(input_size=129, hidden_size=256, batch_first=True)
state_encoder = state_encoder.to(cuda)

# Sequential environment built from the PCA-128 embeddings file and the
# MovieLens-20M ratings file; it also receives the batch size, the encoder
# and the device.
env = recnn.env.SeqEnv(
    '../../data/embeddings/ml20_pca128.pkl',
    '../../data/ml-20m/ratings.csv',
    batch_size,
    state_encoder,
    cuda,
)

HBox(children=(IntProgress(value=0, max=20000263), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20000263), HTML(value='')))

HBox(children=(IntProgress(value=0, max=138493), HTML(value='')))

In [11]:
def run_tests():
    """Evaluate the current networks on one test batch without training.

    Takes the first batch produced by a fresh ``env.test_batch()`` iterator
    and runs it through ``ddpg_update`` with ``learn=False``, so no optimizer
    step is taken and the diagnostics are sent to the global ``writer`` at
    the current global ``step``.

    Returns:
        The loss dictionary returned by ``ddpg_update``.
    """
    batch = next(env.test_batch())
    # The result used to be bound to `loss` while `losses` was returned,
    # which raised NameError on every call.
    losses = ddpg_update(batch, params, nets, optimizer,
                         cuda, writer, step=step, learn=False)
    return losses

In [12]:
# === ddpg settings ===

params = dict(
    # Discount factor applied to the target critic's value.
    gamma=0.99,
    # Clamp range for the bootstrapped value target.
    min_value=-10,
    max_value=10,
    # The actor (and the target nets) are updated when step % policy_step == 0.
    policy_step=10,
    # Interpolation factor passed to recnn.learning.soft_update.
    soft_tau=0.001,

    # Learning rates of the two RAdam optimizers.
    policy_lr=1e-5,
    value_lr=1e-5,
    # Weight-init arguments handed to the Actor / Critic constructors.
    actor_weight_init=54e-2,
    critic_weight_init=6e-1,
)

# === end ===

In [13]:

# Online critic and actor, each built with the sizes (256, 128, 256) plus an
# explicit weight-init value from `params`.
value_net = recnn.models.Critic(256, 128, 256, params['critic_weight_init'])
value_net = value_net.to(cuda)
policy_net = recnn.models.Actor(256, 128, 256, params['actor_weight_init'])
policy_net = policy_net.to(cuda)

# Target copies use the constructors' default weight init; they are only
# evaluated, never stepped by an optimizer in this notebook.
target_value_net = recnn.models.Critic(256, 128, 256)
target_value_net = target_value_net.to(cuda)
target_policy_net = recnn.models.Actor(256, 128, 256)
target_policy_net = target_policy_net.to(cuda)

target_value_net.eval()
target_policy_net.eval()

# Synchronise the targets with the online nets before training starts.
# NOTE(review): soft_tau=1.0 presumably makes this a full copy -- confirm in
# recnn.learning.soft_update.
recnn.learning.soft_update(value_net, target_value_net, soft_tau=1.0)
recnn.learning.soft_update(policy_net, target_policy_net, soft_tau=1.0)

# RAdam from recnn drives both optimizers; per the original note it is
# interchangeable with optim.Adam. The policy optimizer also owns the LSTM
# state encoder's parameters.
pm = [*policy_net.parameters(), *state_encoder.parameters()]
value_optimizer = recnn.optim.RAdam(
    value_net.parameters(),
    lr=params['value_lr'],
    weight_decay=1e-2,
)
policy_optimizer = recnn.optim.RAdam(
    pm,
    lr=params['policy_lr'],
    weight_decay=1e-2,
)

# Lookup tables in the layout ddpg_update expects.
nets = dict(
    value_net=value_net,
    target_value_net=target_value_net,
    policy_net=policy_net,
    target_policy_net=target_policy_net,
)

optimizer = dict(
    policy_optimizer=policy_optimizer,
    value_optimizer=value_optimizer,
)

# TensorBoard event files are written under ../../runs.
writer = SummaryWriter(log_dir='../../runs')

In [14]:
def ddpg_update(batch, params, nets, optimizer, device, writer=False, learn=True, step=-1):
    """Run one DDPG update (or a no-learning evaluation pass) on a batch.

    Args:
        batch: iterable of four tensors ``(state, action, reward, next_state)``;
            each one is moved to ``device``.
        params: dict read for ``'gamma'``, ``'min_value'``, ``'max_value'``,
            ``'policy_step'`` and ``'soft_tau'``.
        nets: dict of callables under ``'value_net'``, ``'target_value_net'``,
            ``'policy_net'`` and ``'target_policy_net'``.
        optimizer: dict with ``'value_optimizer'`` and ``'policy_optimizer'``;
            only touched when ``learn`` is true.
        device: target device for the batch tensors.
        writer: SummaryWriter-like object used for diagnostics when
            ``learn`` is false; ``False``/``None`` disables logging.
        learn: when true, step the optimizers; when false, only compute the
            losses and (if a writer is given) log diagnostics.
        step: global step, used as the logging step and to decide whether the
            policy is updated on this call.

    Returns:
        dict with the scalar ``'value'`` and ``'policy'`` losses and ``'step'``.
    """
    state, action, reward, next_state = [t.to(device) for t in batch]
    # NOTE(review): reward must broadcast against the critic output; the
    # original carried a disabled `reward.unsqueeze(1)` here -- confirm shapes.

    # Diagnostics are only emitted on evaluation passes, and only if a real
    # writer was supplied (the default writer=False used to crash here).
    log = (not learn) and writer is not None and writer is not False

    # --------------------------------------------------------#
    # Value Learning

    # The bootstrapped target is a constant w.r.t. the online critic, so it
    # is built entirely without autograd tracking.
    with torch.no_grad():
        next_action = nets['target_policy_net'](next_state)
        target_value = nets['target_value_net'](next_state, next_action)
        expected_value = reward + params['gamma'] * target_value
        expected_value = torch.clamp(expected_value,
                                     params['min_value'], params['max_value'])

    value = nets['value_net'](state, action)
    value_loss = torch.pow(value - expected_value, 2).mean()

    if learn:
        optimizer['value_optimizer'].zero_grad()
        # retain_graph is kept from the original: `state` may carry a graph
        # that the policy backward pass below walks again.
        value_loss.backward(retain_graph=True)
        optimizer['value_optimizer'].step()
    elif log:
        # NOTE(review): `plot` is not imported in the visible cells;
        # presumably recnn's plotting module -- confirm before enabling.
        writer.add_figure('next_action',
                          plot.pairwise_distances_fig(next_action[:50]), step)
        writer.add_histogram('value', value, step)
        writer.add_histogram('target_value', target_value, step)
        writer.add_histogram('expected_value', expected_value, step)

    # --------------------------------------------------------#
    # Policy learning

    gen_action = nets['policy_net'](state)
    policy_loss = -nets['value_net'](state, gen_action)

    if log:
        # The original did `writer['gen_action'] = gen_action`, which raises
        # TypeError on a SummaryWriter, and then reused the 'next_action' tag
        # for this figure; the figure now carries its own tag.
        writer.add_histogram('policy_loss', policy_loss, step)
        writer.add_figure('gen_action',
                          plot.pairwise_distances_fig(gen_action[:50]), step)
    policy_loss = policy_loss.mean()

    if learn and step % params['policy_step'] == 0:
        optimizer['policy_optimizer'].zero_grad()
        policy_loss.backward(retain_graph=True)
        # Was clip_grad_norm_(..., -1, 1): max_norm=-1 yields a negative clip
        # coefficient, which flips the sign of every policy gradient. Clamp
        # each gradient element to [-1, 1] instead.
        torch.nn.utils.clip_grad_value_(nets['policy_net'].parameters(), 1.0)
        optimizer['policy_optimizer'].step()

        # Let the target networks trail the freshly updated online networks.
        recnn.learning.soft_update(nets['value_net'], nets['target_value_net'],
                                   soft_tau=params['soft_tau'])
        recnn.learning.soft_update(nets['policy_net'], nets['target_policy_net'],
                                   soft_tau=params['soft_tau'])

    losses = {'value': value_loss.item(), 'policy': policy_loss.item(), 'step': step}
    return losses


In [15]:
# Main training loop: one ddpg_update call per batch produced by the env.
step = 0
for batch in tqdm(env.train_batch()):
    loss = ddpg_update(batch, params, nets, optimizer,
                       cuda, writer, step=step)
    # if step % 10 == 0:
    #     run_tests()
    # Advance the global counter. It was never incremented before, so
    # `step % params['policy_step'] == 0` held on every batch and the
    # actor/target update ran each iteration instead of every policy_step.
    step += 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5263), HTML(value='')))

AttributeError: 'tuple' object has no attribute 'size'

In [13]:
def generator():
    """Yield replay-buffer contents built from encoded rating sequences.

    For every batch of ``env.train_dataloader`` the (item, rating) pairs are
    fed to ``env.state_encoder`` one time step at a time. About 5% of the
    steps (chosen at random, never the first one of a batch) are stored in
    ``env.train_buffer`` as ``[state, action, reward, next_state]``. Whenever
    the buffer holds at least ``env.max_buf_size`` entries, its contents are
    fetched, the buffer is flushed and the fetched contents are yielded.

    Progress is printed on every step; this is a debugging aid.
    """
    for raw_batch in env.train_dataloader:
        # NOTE(review): assumes items is (batch, time, emb) and ratings is
        # (batch, time), as implied by the indexing below -- confirm.
        items, ratings, sizes = [t.to(env.device) for t in raw_batch]
        hidden = None
        state = None
        # Stop one step short of the smallest value in `sizes`.
        for t in range(int(sizes.min().item()) - 1):
            action = items[:, t]
            reward = ratings[:, t].unsqueeze(-1)
            # The encoder in this notebook is an LSTM with batch_first=True,
            # so the single time step must sit on dim 1: (batch, 1, feat).
            # The previous unsqueeze(0) produced (1, batch, feat), which such
            # an LSTM reads as one sample whose sequence is the batch.
            s = torch.cat([action, reward], 1).unsqueeze(1)
            if hidden is None:
                next_state, hidden = env.state_encoder(s)
            else:
                next_state, hidden = env.state_encoder(s, hidden)
            # Drop only the time axis; a bare squeeze() would also drop the
            # batch axis when the batch holds a single sequence.
            next_state = next_state.squeeze(1)
            print(t, reward.size(), env.train_buffer.len(), )

            if np.random.random() > 0.95 and state is not None:
                transition = [state, action, reward, next_state]
                print('append')
                env.train_buffer.append(transition)

            if env.train_buffer.len() >= env.max_buf_size:
                g = env.train_buffer.get()
                env.train_buffer.flush()
                yield g

            state = next_state