## Deep Deterministic Policy Gradients
Work in progress, but it will be finished soon enough

Special thanks to KnightofK9

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import pickle

# Device that every tensor and network in this notebook is moved to. The name
# stays `cuda` because later cells reference it, but it falls back to the CPU
# when no GPU is visible so the notebook still runs (slowly) without one.
cuda = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of consecutive rated movies that make up one state window.
frame_size = 10

In [3]:
ratings = pd.read_csv('../data/ml-20m/ratings.csv')
# The pickle is indexed by movie id further down (`movies[movie_id]`).
# NOTE: unpickling can execute arbitrary code - only load this file if it comes
# from a trusted source.
# The context manager closes the file handle, which the bare
# `pickle.load(open(...))` form never did.
with open('../data/infos_pca128.pytorch', 'rb') as movies_file:
    movies = pickle.load(movies_file)

In [4]:
# credits: KnightofK9
# Rescale every rating r to 2 * (r - 2.5), so 2.5 maps to 0 and 5 maps to 5.
# Written as one vectorised expression instead of a per-row Python lambda; the
# resulting values are the same.
# NOTE: this cell overwrites `ratings`, so re-running it without reloading the
# CSV rescales the ratings a second time.
ratings["rating"] = 2 * (ratings["rating"] - 2.5)
# Number of ratings per user; keep only users with at least frame_size + 1
# ratings, i.e. enough for one full window plus the movie that follows it.
users = ratings[["userId", "movieId"]].groupby(["userId"]).size()
users = users[users >= frame_size + 1]
ratings = ratings[ratings["userId"].isin(users.index)]
# Order each user's ratings chronologically, then drop the timestamp and index
# the frame by user so it can be grouped with `groupby(level=0)`.
ratings = ratings.sort_values(by=["userId", "timestamp"]).drop(columns=["timestamp"]).set_index("userId")

In [5]:
# Move every movie embedding onto the training device, replacing each
# dictionary value in place (keys are untouched, so iterating is safe).
for movie_id, embedding in movies.items():
    movies[movie_id] = embedding.to(cuda)

# Model

In [6]:
class StateRepresentation(nn.Module):
    """Encode a window of watched movies and their ratings into a state vector.

    The movie embeddings of one window are flattened, concatenated with the
    ratings given to those movies, and passed through one Linear + Tanh layer.

    Args:
        n_frames: number of movies per window. Defaults to the notebook-level
            ``frame_size`` when omitted, so ``StateRepresentation()`` behaves
            as before.
        embed_size: length of a single movie embedding.
        state_size: length of the produced state vector.
    """

    def __init__(self, n_frames=None, embed_size=128, state_size=256):
        super(StateRepresentation, self).__init__()
        # Fall back to the notebook-wide window length when none is given.
        self.n_frames = frame_size if n_frames is None else n_frames
        self.embed_size = embed_size
        self.lin = nn.Sequential(
            # per movie: embed_size embedding values + 1 rating
            nn.Linear(self.n_frames * (embed_size + 1), state_size),
            nn.Tanh(),
        )

    def forward(self, info, ratings):
        """Build the state for a batch of windows.

        Args:
            info: movie embeddings, batch first, holding
                n_frames * embed_size values per sample
                (e.g. batch x n_frames x 1 x embed_size).
            ratings: ratings of the same movies, batch x n_frames.

        Returns:
            Tensor of shape batch x state_size with values in [-1, 1] (Tanh).
        """
        # step 1: flatten info to batch x (n_frames * embed_size). The batch
        # size is read from the input itself rather than from the global
        # `batch_size`, which is defined in a later cell and is wrong for any
        # batch of a different size.
        info = info.reshape(info.size(0), self.n_frames * self.embed_size)
        # step 2: append the ratings -> batch x (n_frames * (embed_size + 1))
        stacked = torch.cat([info, ratings], 1)
        # step 3: apply the state representation layer
        state = self.lin(stacked)
        return state

In [None]:
class Actor(nn.Module):
    """Policy network: encodes the watched window and proposes an action.

    The raw (info, rewards) input first goes through a StateRepresentation,
    then through a three-layer MLP whose tanh output lies in (-1, 1).

    Args:
        num_inputs: size of the encoded state; must match the output size of
            StateRepresentation (256 in this notebook).
        num_actions: size of the produced action vector.
        hidden_size: width of the two hidden layers.
        init_w: the last layer's weights and bias are drawn uniformly from
            [-init_w, init_w].
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(Actor, self).__init__()

        self.state_rep = StateRepresentation()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, num_actions)

        # Small uniform init of the output layer.
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, info, rewards):
        """Return ``(state, action)`` for a batch of windows.

        The encoded state is returned alongside the action so callers can
        feed both to the critic.
        """
        state = self.state_rep(info, rewards)
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        # torch.tanh replaces the deprecated F.tanh; the result is the same.
        x = torch.tanh(self.linear3(x))
        return state, x

    def get_action(self, info, rewards):
        """Alias of the forward pass; returns ``(state, action)``."""
        # Call the module itself rather than .forward() so registered hooks run.
        return self(info, rewards)

In [None]:
class Critic(nn.Module):
    """Q-network: scores a (state, action) pair with a single value.

    Args:
        num_inputs: size of the encoded state vector.
        num_actions: size of the (flattened) action vector.
        hidden_size: width of the two hidden layers.
        init_w: the last layer's weights and bias are drawn uniformly from
            [-init_w, init_w].
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(Critic, self).__init__()

        self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # Small uniform init of the output layer.
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return the value estimate, shape batch x 1.

        Args:
            state: batch x num_inputs.
            action: batch x num_actions, optionally with extra singleton
                dimensions after the batch dimension (e.g. batch x 1 x num_actions).
        """
        # Flatten everything after the batch dimension. A bare
        # torch.squeeze(action) would also remove the batch dimension when the
        # batch holds a single sample, which breaks the concatenation below.
        action = action.reshape(action.size(0), -1)
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x

## DDPG Code
[Credits](https://github.com/higgsfield/RL-Adventure-2/blob/master/5.ddpg.ipynb)

In [None]:
def _soft_update(target_net, source_net, soft_tau):
    """Polyak-average source_net into target_net: t <- (1 - tau) * t + tau * s."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )


def ddpg_update(batch, 
           gamma = 0.99,
           min_value=-5,
           max_value=5,
           soft_tau=1e-2):
    """Run one DDPG optimisation step on a prepared batch.

    Uses the notebook-level ``policy_net``, ``value_net``, their target copies,
    ``policy_optimizer``, ``value_optimizer`` and ``value_criterion``.

    Args:
        batch: tuple ``(state, action, reward, next_state, done)`` where
            ``state`` and ``next_state`` are the argument tuples passed to the
            policy network and ``reward`` / ``done`` are 1-D tensors.
        gamma: discount factor.
        min_value: lower clamp for the bootstrapped target value.
        max_value: upper clamp for the bootstrapped target value.
        soft_tau: interpolation factor of the target-network soft update.

    Returns:
        ``(value_loss, policy_loss)`` as Python floats.
    """
    state, action, reward, next_state, done = batch

    # Column vectors, to line up with the critic's batch x 1 output.
    reward = reward.unsqueeze(1)
    done = done.unsqueeze(1)

    enc_state, current_action = policy_net(*state)

    # Actor objective: maximise the critic's score of the proposed action.
    policy_loss = value_net(enc_state, current_action)
    policy_loss = -policy_loss.mean()

    # The bootstrapped target is a constant for the critic loss, so build it
    # without recording an autograd graph.
    with torch.no_grad():
        enc_next_state, next_action = target_policy_net(*next_state)
        target_value = target_value_net(enc_next_state, next_action)
        expected_value = reward + (1.0 - done) * gamma * target_value
        expected_value = torch.clamp(expected_value, min_value, max_value)

    # Detach the encoded state for the critic loss. Gradients it used to send
    # into the actor's state encoder were zeroed by the next
    # policy_optimizer.zero_grad() before any step could use them, so the
    # parameter updates are unchanged. Detaching skips that wasted backward
    # pass and makes the two losses' graphs independent, so retain_graph is
    # no longer needed.
    value = value_net(enc_state.detach(), action)

    value_loss = value_criterion(value, expected_value)

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    # zero_grad here also discards what policy_loss.backward() left on the
    # critic's parameters.
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    _soft_update(target_value_net, value_net, soft_tau)
    _soft_update(target_policy_net, policy_net, soft_tau)

    return value_loss.item(), policy_loss.item()

In [None]:
# Online networks. Arguments: (state size, action size, hidden width).
value_net  = Critic(256, 128, 320).to(cuda)
policy_net = Actor(256, 128, 192).to(cuda)

# Target networks, built with the same architecture as their online counterparts.
target_value_net  = Critic(256, 128, 320).to(cuda)
target_policy_net = Actor(256,128, 192).to(cuda)

# Start each target network as an exact parameter copy of its online network.
for target_net, online_net in ((target_value_net, value_net),
                               (target_policy_net, policy_net)):
    for target_param, param in zip(target_net.parameters(), online_net.parameters()):
        target_param.data.copy_(param.data)


# NOTE(review): 10e-3 == 1e-2 and 10e-4 == 1e-3 - confirm these are the intended rates.
value_lr  = 10e-3
policy_lr = 10e-4

value_criterion = nn.MSELoss()
value_optimizer  = optim.Adam(value_net.parameters(),  lr=value_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

In [None]:
from tensorboardX import SummaryWriter
# Summary writer pointed at ../runs/; the training cell logs the value and
# policy losses to it with add_scalar.
writer = SummaryWriter('../runs/')

# Training

In [None]:
# Global step counter, used as the x-axis value of the logged scalars.
n_iter = 1

def prepare_batch(batch, device=None):
    """Collate a list of transitions into batched tensors on one device.

    Args:
        batch: list of transitions
            ``[(watched_infos, watched_rating), chosen_movie, chosen_rating,
            (next_infos, next_rating), done]`` where the ``*_infos`` and
            ``chosen_movie`` entries are tensors, the ``*_rating`` entries of
            the state tuples are numpy arrays, ``chosen_rating`` is a number
            and ``done`` is a bool.
        device: device the tensors are moved to. Defaults to the
            notebook-level ``cuda`` device.

    Returns:
        ``(watched_infos, watched_rating), chosen_movie, chosen_rating,
        (next_infos, next_rating), done`` with a leading batch dimension;
        all rating, reward and done tensors are float32.
    """
    if device is None:
        device = cuda

    watched_infos = torch.stack([b[0][0] for b in batch]).to(device)
    watched_rating = torch.stack(
        [torch.from_numpy(b[0][1]) for b in batch]).float().to(device)
    chosen_movie = torch.stack([b[1] for b in batch]).to(device)
    # Explicit float32, like every other rating tensor here, instead of
    # leaving the reward dtype to inference from the list elements.
    chosen_rating = torch.tensor(
        [b[2] for b in batch], dtype=torch.float32).to(device)
    next_infos = torch.stack([b[3][0] for b in batch]).to(device)
    next_rating = torch.stack(
        [torch.from_numpy(b[3][1]) for b in batch]).float().to(device)
    done = torch.tensor([b[4] for b in batch]).float().to(device)

    return (watched_infos, watched_rating), chosen_movie, chosen_rating, \
           (next_infos, next_rating), done


batch_bar = tqdm(total=len(users))
batch = []
batch_size = 100


# One pass over the data: slide a window of frame_size + 1 ratings over every
# user's chronologically ordered history and train whenever batch_size
# transitions have been collected. Transitions left over at the end (fewer
# than batch_size) are not trained on.
for user, df in ratings.groupby(level=0):
    batch_bar.update(1)
    # Convert the user's history to numpy once; the windows below are plain
    # array slices instead of a DataFrame slice + conversion per window.
    user_history = df[["movieId", "rating"]].values
    size = max(len(df) - frame_size, 0)
    for idx in range(0, size):

        # Randomly skip about 80% of the windows.
        if np.random.rand() < 0.8:  # intake percents
            continue

        user_ratings = user_history[idx:frame_size + idx + 1]

        # The last row of the window is the transition's action / reward.
        chosen_movie = user_ratings[:, 0][-1]
        chosen_movie = movies[chosen_movie] # action
        chosen_rating = user_ratings[:, 1][-1] # reward
        # The first frame_size rows are the current state.
        films_watched = user_ratings[:, 0][:-1]
        watched_rating = user_ratings[:, 1][:-1] # state
        watched_infos = [movies[i] for i in films_watched] # state
        watched_infos = torch.stack(watched_infos)
        # Next state: drop the oldest movie and append the chosen one.
        next_infos = torch.cat((watched_infos[1:], chosen_movie.unsqueeze(0)), 0)
        next_rating = watched_rating[1:].tolist()
        next_rating.append(chosen_rating)
        next_rating = np.array(next_rating)

        # state action reward next_state done
        # `done` marks the last window of this user's history.
        batch.append([(watched_infos, watched_rating), chosen_movie, chosen_rating,
                       (next_infos, next_rating), idx + 1 == size])

        if len(batch) >= batch_size:
            # train here
            # Keep the collated tensors under their own name so `batch`
            # always stays the list of raw transitions.
            prepared = prepare_batch(batch)
            value_loss, policy_loss = ddpg_update(prepared)
            writer.add_scalar('loss/value', value_loss, n_iter)
            writer.add_scalar('loss/policy', policy_loss, n_iter)
            n_iter += 1
            batch = []

# Close the progress bar once every user has been processed.
batch_bar.close()

HBox(children=(IntProgress(value=0, max=138493), HTML(value='')))



37241

In [38]:
# Elementwise mask of the components of movie 1's embedding whose magnitude exceeds 1.
# NOTE(review): presumably a check against the Actor's tanh output range (-1, 1) - confirm.
movies[1].abs() > 1

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0', dtype=torch.uint8)