# **REINFORCE with Local Differential Privacy on CartPole**
## In this notebook, we propose a locally differentially private variant of REINFORCE that perturbs the observations, i.e. the states seen by the model: Gaussian noise is added to each state before it is fed to the REINFORCE policy.

## Acknowledgement

Code adapted from [this notebook](https://github.com/bentrevett/pytorch-rl/blob/master/dqn_working.ipynb).


## Import

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions

import matplotlib.pyplot as plt
import numpy as np
import gym
import os

## Initialize environments

In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Two separate CartPole-v1 instances, created in this order:
# the first is bound to train_env, the second to test_env.
train_env, test_env = (gym.make('CartPole-v1') for _ in range(2))

In [None]:
# Parameters of the Gaussian noise, one entry per state component.
# STD is the base scale; MEAN is a zero vector of the same length.
# NOTE(review): the STD values look like per-component spreads of the
# CartPole state -- confirm where they were measured.
STD = torch.tensor([0.55, 0.39, 0.05, 0.32])
MEAN = torch.zeros(len(STD))

## Seed

In [None]:
SEED = 42

# Seed both environments (the test one is offset by 1 so it does not replay
# the training seed), then NumPy's and PyTorch's global generators.
train_env.reset(seed=SEED)
test_env.reset(seed=SEED + 1)
np.random.seed(SEED)
# The trailing ';' stops the notebook from echoing the call's return value.
torch.manual_seed(SEED);

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: linear -> dropout -> ReLU -> linear.

    The second linear layer's output is returned as-is (no final activation).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.5):
        super().__init__()

        # Attribute names are also the state_dict keys, and the layers are
        # created in this order, so both are left unchanged.
        self.fc_1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc_2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the output-layer scores for input ``x``."""
        # Dropout is applied to the hidden pre-activations, before the ReLU.
        hidden = F.relu(self.dropout(self.fc_1(x)))
        return self.fc_2(hidden)

In [None]:
# Network sizes: the input width is the first (index 0) entry of the
# observation shape, the output width is the number of discrete actions
# reported by the environment, and the hidden width is fixed at 128.
INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = (
    train_env.observation_space.shape[0],
    128,
    train_env.action_space.n,
)

In [None]:
def init_weights(m):
    """Initialise a linear layer in place; suitable for ``Module.apply``.

    ``nn.Linear`` modules (subclasses included) receive Xavier-normal weights
    and, when they have a bias, a zero bias. Every other module is left
    untouched.

    Args:
        m: The module to (possibly) initialise.
    """
    if not isinstance(m, nn.Linear):
        return

    torch.nn.init.xavier_normal_(m.weight)

    # A layer built with ``bias=False`` has ``m.bias is None``; skip it
    # rather than failing on the attribute access.
    if m.bias is not None:
        m.bias.data.fill_(0)

## Training

In [None]:
def train(env, policy, optimizer, discount_factor, sigma=None):
    """Run one training episode, then apply a single REINFORCE update.

    Actions are sampled from the softmax of the policy output. When ``sigma``
    is truthy, Gaussian noise with mean ``MEAN`` and standard deviation
    ``sigma * STD`` (module-level tensors) is added to every state before it
    is given to the policy.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(state, reward, done, truncated, info)``.
        policy: Module mapping a state tensor to one score per action.
        optimizer: Optimizer passed on to ``update_policy``.
        discount_factor: Discount passed on to ``calculate_returns``.
        sigma: Noise multiplier; ``None`` or ``0`` disables the noise.

    Returns:
        A tuple ``(loss, episode_reward, n_samples)``: the value returned by
        ``update_policy``, the sum of the episode's rewards, and the number
        of environment steps taken.
    """
    policy.train()

    log_prob_actions = []
    rewards = []
    done = False
    episode_reward = 0
    n_samples = 0
    state, _ = env.reset()

    # The noise scale does not change during the episode, so build it once.
    noise_std = sigma * STD if sigma else None

    while not done:
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

        if noise_std is not None:
            # Out-of-place add: a tensor built from a NumPy array can share
            # its memory, so `+=` could modify the array handed out by the
            # environment.
            state = state + torch.normal(mean=MEAN, std=noise_std)

        action_pred = policy(state)
        action_prob = F.softmax(action_pred, dim=-1)

        dist = distributions.Categorical(action_prob)
        action = dist.sample()
        log_prob_action = dist.log_prob(action)

        state, reward, done, truncated, _ = env.step(action.item())
        # The episode ends on termination or on truncation.
        done = done or truncated

        log_prob_actions.append(log_prob_action)
        rewards.append(reward)
        n_samples += 1
        episode_reward += reward

    log_prob_actions = torch.cat(log_prob_actions)

    returns = calculate_returns(rewards, discount_factor)

    loss = update_policy(returns, log_prob_actions, optimizer)

    return loss, episode_reward, n_samples

In [None]:
def calculate_returns(rewards, discount_factor, normalize = True):
    """Compute the discounted return of every step of one episode.

    ``returns[t] = rewards[t] + discount_factor * returns[t + 1]``, with the
    return after the last step taken as 0.

    Args:
        rewards: Sequence of per-step rewards, in chronological order.
        discount_factor: Multiplicative discount applied per step.
        normalize: When true, subtract the mean and divide by the sample
            standard deviation of the returns.

    Returns:
        A 1-D floating-point tensor with one entry per reward. With
        ``normalize`` set, degenerate cases stay finite: a single-step
        episode or constant returns (standard deviation undefined or zero)
        are only mean-centred, and an empty episode yields an empty tensor.
    """
    # Accumulate from the last step backwards, then restore chronological
    # order with one reverse instead of inserting at the front each time.
    discounted = []
    running = 0
    for r in reversed(rewards):
        running = r + running * discount_factor
        discounted.append(running)
    discounted.reverse()

    # Explicit float dtype so integer rewards do not yield an integer tensor,
    # for which mean()/std() are not defined.
    returns = torch.tensor(discounted, dtype=torch.get_default_dtype())

    if not normalize or returns.numel() == 0:
        return returns

    mean = returns.mean()
    # The sample standard deviation is NaN for a single element.
    std = returns.std() if returns.numel() > 1 else None

    centred = returns - mean
    if std is not None and std > 0:
        return centred / std
    return centred

In [None]:
def update_policy(returns, log_prob_actions, optimizer):
    """Take one optimizer step on the REINFORCE loss.

    The loss is ``-sum(returns * log_prob_actions)``; ``returns`` is detached
    so no gradient flows through it.

    Args:
        returns: Tensor of per-step returns.
        log_prob_actions: Tensor of per-step log-probabilities.
        optimizer: Optimizer holding the parameters to update.

    Returns:
        The loss value as a Python number.
    """
    # Clear any gradients left from a previous step before backpropagating.
    optimizer.zero_grad()

    weighted = log_prob_actions * returns.detach()
    loss = -weighted.sum()

    loss.backward()
    optimizer.step()

    return loss.item()

In [None]:
def evaluate(env, policy, sigma=None):
    """Play one greedy episode and return its total reward.

    The policy is switched to eval mode, and at every step the action with
    the highest softmax probability is taken, with gradients disabled.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(state, reward, done, truncated, info)``.
        policy: Module mapping a state tensor to one score per action.
        sigma: Accepted but not used here; states reach the policy exactly as
            the environment returned them.
            NOTE(review): confirm that noise-free evaluation is intended.

    Returns:
        The sum of the rewards collected during the episode.
    """
    policy.eval()

    total_reward = 0
    observation, _ = env.reset()
    finished = False

    while not finished:
        obs_tensor = torch.FloatTensor(observation).unsqueeze(0)

        with torch.no_grad():
            probs = F.softmax(policy(obs_tensor), dim = -1)

        greedy_action = torch.argmax(probs, dim = -1)

        observation, reward, terminated, truncated, _ = env.step(greedy_action.item())
        # The episode ends on termination or on truncation.
        finished = terminated or truncated

        total_reward += reward

    return total_reward

In [None]:
# Experiment: for every run and every noise level, train a fresh policy with
# REINFORCE and record the per-episode train and test rewards. The test
# rewards of each run are saved to disk.

total_nb_episodes = 1000   # episodes per (run, noise level)
gamma = 0.99               # discount factor passed to train()

lr = 0.01                  # Adam learning rate
n_trials = 25              # look-back of the convergence window (see below)

noise_levels = [0, 0.01, 0.1, 0.2, 0.3, 0.5, 2, 5]

print_every = 25           # progress line every this many episodes

run_0 = 100                # offset added to the run index in the file name
nb_runs = 100

# Mean test reward over the window at which training is considered converged.
convergence_threshold = 475

os.makedirs('experiments/reinforce_LDP', exist_ok=True)

for run in range(nb_runs):
    print('Starting run {}: '.format(run))
    train_rewards = np.zeros((len(noise_levels), total_nb_episodes))
    test_rewards = np.zeros((len(noise_levels), total_nb_episodes))

    for i, sigma in enumerate(noise_levels):
        # Re-seed both environments for every noise level.
        train_env.reset(seed=SEED)
        test_env.reset(seed=SEED+1)

        print("-------------- Noise: sigma = {} ---------------\n".format(sigma))
        policy = MLP(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)
        policy.apply(init_weights)

        optimizer = optim.Adam(policy.parameters(), lr=lr)
        convergence = False

        for episode in range(total_nb_episodes):
            if not convergence:
                loss, train_reward, _ = train(train_env, policy,
                                              optimizer=optimizer,
                                              discount_factor=gamma,
                                              sigma=sigma)
                test_reward = evaluate(test_env, policy, sigma=sigma)

                train_rewards[i][episode] = train_reward
                test_rewards[i][episode] = test_reward

                # Only test for convergence once a full window is available.
                # With a smaller `episode` the slice start would be negative,
                # which selects an empty range and makes the mean NaN (plus a
                # NumPy warning); convergence could never fire there anyway.
                # The window spans the current episode and the n_trials
                # before it, i.e. n_trials + 1 entries.
                if episode >= n_trials:
                    window = slice(episode - n_trials, episode + 1)
                    mean_test_rewards = np.mean(test_rewards[i][window])

                    convergence = mean_test_rewards >= convergence_threshold
                    if convergence: print("Achieved convergence")
            else:
                # After convergence no more training happens; the last
                # recorded rewards are carried forward to fill the row.
                train_rewards[i][episode] = train_rewards[i][episode-1]
                test_rewards[i][episode] = test_rewards[i][episode-1]

            if episode % print_every == 0:
                print(f'| Episode: {episode:3} | Train Reward: {train_reward:5.1f} | Test Reward: {test_reward:5.1f} |')

    # One file per run, holding the test rewards for all noise levels.
    np.save('experiments/reinforce_LDP/rewards_run_{}.npy'.format(run_0 + run), test_rewards)


In [None]:
# Plot the test-reward curve of every noise level on one figure, using
# whatever `test_rewards` currently holds.
fig = plt.figure(figsize=(12,8))

for i, sigma in enumerate(noise_levels):
    plt.plot(test_rewards[i], label='sigma = {}'.format(sigma))

plt.xlabel('Number of episodes trained')
plt.ylabel('Reward')

# A single legend call: an earlier bare plt.legend() would just be replaced
# by this one.
plt.legend(loc='lower right')
plt.grid()
plt.show()