In [1]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch.distributions import Categorical

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline


# == recnn ==
import sys
sys.path.append("../../")
import recnn

cuda = torch.device('cuda')

# ---
frame_size = 10
batch_size = 10
n_epochs   = 100
plot_every = 30
step       = 0
num_items    = 5000 # n items to recommend. Can be adjusted for your vram 
# --- 
tqdm.pandas()


from jupyterthemes import jtplot
jtplot.style(theme='grade3')

## I will drop low freq items because it doesnt fit into my videocard vram

In [2]:

def prepare_dataset(df, key_to_id, frame_size, env, sort_users=False):
    
    global num_items
    
    value_counts = df['movieId'].value_counts() 
    print('counted!')
    
    num_items = 5000
    to_remove = df['movieId'].value_counts().sort_values()[:-num_items].index
    to_keep = df['movieId'].value_counts().sort_values()[-num_items:].index
    to_remove_indices = df[df['movieId'].isin(to_remove)].index
    num_removed = len(to_remove)
    
    df.drop(to_remove_indices, inplace=True)
    print('dropped!')
    
    print('before', env.embeddings.size(), len(env.movie_embeddings_key_dict))
    for i in list(env.movie_embeddings_key_dict.keys()):
        if i not in to_keep:
            del env.movie_embeddings_key_dict[i]
        
    env.embeddings, env.key_to_id, env.id_to_key = recnn.data.utils.make_items_tensor(env.movie_embeddings_key_dict)
    
    print('after', env.embeddings.size(), len(env.movie_embeddings_key_dict))
    print('embeddings automatically updated')
    print('action space is reduced to {} - {} = {}'.format(num_items + num_removed, num_removed,
                                                           num_items))
    
    return recnn.data.prepare_dataset(df, env.key_to_id, frame_size, sort_users=sort_users)


In [3]:
def batch_contstate_discaction(batch, item_embeddings_tensor, frame_size, num_items, *args, **kwargs):
    
    """
    Embed Batch: continuous state discrete action
    """
    
    from recnn.data.utils import get_irsu
    
    items_t, ratings_t, sizes_t, users_t = get_irsu(batch)
    items_emb = item_embeddings_tensor[items_t.long()]
    b_size = ratings_t.size(0)

    items = items_emb[:, :-1, :].view(b_size, -1)
    next_items = items_emb[:, 1:, :].view(b_size, -1)
    ratings = ratings_t[:, :-1]
    next_ratings = ratings_t[:, 1:]

    state = torch.cat([items, ratings], 1)
    next_state = torch.cat([next_items, next_ratings], 1)
    action = items_t[:, -1]
    reward = ratings_t[:, -1]

    done = torch.zeros(b_size)
    done[torch.cumsum(sizes_t - frame_size, dim=0) - 1] = 1
    
    one_hot_action = torch.zeros(action.size(0), num_items)
    one_hot_action.scatter_(1, action.view(-1,1), 1)

    batch = {'state': state, 'action': one_hot_action, 'reward': reward, 'next_state': next_state, 'done': done,
             'meta': {'users': users_t, 'sizes': sizes_t}}
    return batch

def embed_batch(batch, item_embeddings_tensor, *args, **kwargs):
    return batch_contstate_discaction(batch, item_embeddings_tensor, frame_size=frame_size, num_items=num_items)

In [4]:
# embeddgings: https://drive.google.com/open?id=1EQ_zXBR3DKpmJR3jBgLvt-xoOvArGMsL
env = recnn.data.env.FrameEnv('../../data/embeddings/ml20_pca128.pkl',
                              '../../data/ml-20m/ratings.csv', frame_size, batch_size,
                              embed_batch=embed_batch, prepare_dataset=prepare_dataset,
                              num_workers = 0)

counted!
dropped!
before torch.Size([27278, 128]) 27278
after torch.Size([5000, 128]) 5000
embeddings automatically updated
action space is reduced to 26744 - 21744 = 5000


HBox(children=(IntProgress(value=0, max=18946308), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18946308), HTML(value='')))




HBox(children=(IntProgress(value=0, max=138493), HTML(value='')))




In [5]:
class DiscretePolicy(nn.Module):
    def __init__(self, hidden_size, num_inputs, num_actions):
        super(DiscretePolicy, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        action_scores = self.linear2(x)
        return F.softmax(action_scores)
    
    
def select_action(policy, state):
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action, probs

### Because I do not have a dynamic environment, I also will include a critic. If you have a real non static environment, you can do w/o citic.

In [6]:
value_net = recnn.nn.Critic(1290, num_items, 2048, 54e-2).to(cuda)
target_value_net = recnn.nn.Critic(1290, num_items, 2048, 54e-2).to(cuda)

policy_net = DiscretePolicy(2048, 1290, num_items).to(cuda)
target_policy_net = DiscretePolicy(2048, 1290, num_items).to(cuda)

policy_optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)
value_optimizer = torch.optim.Adam(value_net.parameters(), lr=1e-3)

target_policy_net.eval()
target_value_net.eval()

Critic(
  (drop_layer): Dropout(p=0.5, inplace=False)
  (linear1): Linear(in_features=6290, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=2048, bias=True)
  (linear3): Linear(in_features=2048, out_features=1, bias=True)
)

In [7]:
def td_update(batch, learn=True):
    
    state, action, reward, next_state, done = recnn.data.get_base_batch(batch)
    
    # Value Learning
    
    with torch.no_grad():
        next_action = target_policy_net(next_state)
        target_value   = target_value_net(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * 0.99 * target_value
        expected_value = torch.clamp(expected_value, -10, 10)

    value = value_net(state, action)
    value_loss = torch.pow(value - expected_value.detach(), 2).mean()
    
    if learn:
        value_optimizer.zero_grad()
        value_loss.backward()
        value_optimizer.step()
        
    return value_loss

In [14]:
def REINFORCE(policy, optimizer):
    R = torch.tensor([0]).to(cuda)
    policy_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        R = r + 0.99 * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 0.0001)
    for log_prob, R in zip(policy.saved_log_probs, returns):

        policy_loss.append(-log_prob * R)

    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    
    return policy_loss

In [15]:
def learn(batch, step):
    state, action, reward, next_state, done = recnn.data.get_base_batch(batch)
    
    predicted_action, predicted_probs = select_action(policy_net, state)
    reward = value_net(state, predicted_probs).detach()
    policy_net.rewards.append(reward.mean())
    
    value_loss = td_update(batch)
    
    if step % 10 == 0 and step > 0:
        policy_loss = REINFORCE(policy_net, policy_optimizer)
        del policy_net.rewards[:]
        del policy_net.saved_log_probs[:]
        print('step: ', step, '| value:', value_loss.item(), '| policy', policy_loss.item())
    
    recnn.utils.soft_update(value_net, target_value_net, soft_tau=0.001)
    recnn.utils.soft_update(policy_net, target_policy_net, soft_tau=0.001)

In [None]:
step = 0
for epoch in range(n_epochs):
    for batch in tqdm(env.train_dataloader):
        learn(batch, step)
        step += 1

HBox(children=(IntProgress(value=0, max=13155), HTML(value='')))

  from ipykernel import kernelapp as app


step:  10 | value: 3.483903408050537 | policy -661.602294921875
step:  20 | value: 5.725956916809082 | policy -5057.4697265625
step:  30 | value: 3.3426642417907715 | policy 1540.8583984375
step:  40 | value: 4.142497539520264 | policy -324.4228515625
step:  50 | value: 4.274340629577637 | policy -2936.5673828125
step:  60 | value: 3.9849038124084473 | policy -1814.1396484375
step:  70 | value: 4.143095970153809 | policy -2763.79541015625
step:  80 | value: 4.20749568939209 | policy 1278.69873046875
step:  90 | value: 3.7009503841400146 | policy -2046.633544921875
step:  100 | value: 3.5018255710601807 | policy 2278.51318359375
step:  110 | value: 3.5440173149108887 | policy -1018.57861328125
step:  120 | value: 3.3337490558624268 | policy 1508.5556640625
step:  130 | value: 3.275139093399048 | policy -5600.7783203125
step:  140 | value: 4.0005574226379395 | policy 717.43359375
step:  150 | value: 3.958998680114746 | policy 2416.165771484375
step:  160 | value: 3.454111099243164 | poli