# REINFORCE for different environments

## 1. Import the libraries 

In [None]:
from __future__ import print_function  # __future__ imports must be the first statement of the cell

import numpy as np
import matplotlib.pyplot as plt #plotting library
from matplotlib import animation #animated visualizations library
from collections import namedtuple, deque 
#namedtuple creates tuple subclasses with named fields, access elements by name instead of index
#deque (double-ended queue) for adding and removing elements from both ends
from tqdm import tqdm
#add progress bars to Python code for easy monitoring progress of loops and tasks
# %matplotlib inline 
import gym #environments for agents
from datetime import datetime #manipulating dates and times
import pandas as pd #work with structured data
import torch #Pytorch supports tensor computations and neural networks
import torch.nn as nn #Pytorch supports building neural networks
import torch.nn.functional as F
#common functions in neural network operations 
    # Activation functions (ReLU, sigmoid, tanh)
    # Loss functions (cross_entropy, mse_loss)
    #Utility functions for tensor manipulation (softmax, dropout, batch_norm, etc.)
import torch.optim as optim #optimization algorithms for training neural networks
import os #filesystem helpers (creating the output directories)
import random #generate random numbers/selections
from collections import namedtuple, deque 
import itertools 
# provides various functions for creating iterators and combining them into complex iterators
# includes cycle, chain, zip, etc.

from torch.autograd import Variable
import torch.autograd as autograd

from torch.distributions import Normal

In [None]:
!pip install --upgrade tqdm
!pip install --upgrade torch
!pip install --upgrade gym
!pip install --upgrade numpy

# Current Gym version: 0.26.2

## 2. Algorithm implementation

In [None]:
""" Monte-Carlo Policy Gradient """



class reinforce(nn.Module):
    """Monte-Carlo policy gradient (REINFORCE) agent with a Gaussian policy.

    A two-hidden-layer tanh network maps a state to the mean of a diagonal
    Gaussian over continuous actions. The log standard deviation is a
    state-independent learnable parameter (``logstd``).
    """

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Build the policy network.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of continuous action dimensions.
            seed: Seed passed to ``torch.manual_seed`` before the layers are
                created, so the initial weights are reproducible.
            fc1_units: Width of the first hidden layer.
            fc2_units: Width of the second hidden layer.
        """
        super(reinforce, self).__init__()
        self.seed = torch.manual_seed(seed)
        # Use the sizes supplied by the caller rather than reading a global
        # `env`, so the class works for any environment and in isolation.
        self.state_size = state_size
        self.action_size = action_size
        # Attribute names and creation order are kept so that existing
        # checkpoints and seeded initialisations stay compatible.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.relu = nn.ReLU(inplace=True)
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.softmax = nn.Softmax(dim=-1)
        self.logstd = nn.Parameter(torch.zeros(1, action_size))

    def forward(self, state):
        """Return ``[action_mean, action_logstd]`` concatenated on the last axis.

        For a 1-D state the result has shape ``(2 * action_size,)``; a batch of
        states of shape ``(B, state_size)`` yields ``(B, 2 * action_size)``.
        """
        x = self.tanh(self.fc1(state))
        x = self.tanh(self.fc2(x))
        action_mean = self.fc3(x)
        # Broadcast the shared log-std to the shape of the mean so that the
        # concatenation works for single states and for batches alike.
        action_logstd = self.logstd.reshape(-1).expand_as(action_mean)
        return torch.cat((action_mean, action_logstd), dim=-1)

    def _distribution(self, state):
        """Return the Gaussian action distribution for ``state``."""
        # as_tensor avoids the copy-construct warning when `state` is already
        # a tensor and also accepts lists / NumPy arrays.
        state = torch.as_tensor(state, dtype=torch.float32)
        output = self.forward(state)
        action_mean = output[..., :self.action_size]
        action_logstd = output[..., self.action_size:]
        return Normal(action_mean, torch.exp(action_logstd))

    def get_action(self, state, action_size, action_low=-1.0, action_high=1.0):
        """Sample an action for ``state`` and clip it to the valid range.

        Args:
            state: State vector (tensor, NumPy array or sequence of floats).
            action_size: Unused; kept for backward compatibility (the size
                given at construction time is used).
            action_low: Lower clipping bound for every action dimension.
            action_high: Upper clipping bound for every action dimension.

        Returns:
            The sampled, clipped action as a NumPy array.
        """
        with torch.no_grad():
            action = self._distribution(state).sample()
        action = torch.clamp(action, action_low, action_high)
        return action.detach().numpy()

    def log_prob(self, state, action):
        """Return ``log pi(action | state)`` summed over action dimensions."""
        action = torch.as_tensor(action, dtype=torch.float32)
        return self._distribution(state).log_prob(action).sum(dim=-1)

    def pi(self, state, action):
        """Return the policy density ``pi(action | state)`` plus a small epsilon.

        The action is continuous, so the density of the Gaussian policy is
        evaluated at ``action``; the action must not be used as an index into
        the network output.
        """
        return torch.exp(self.log_prob(state, action)) + 0.000000001

    def update_weight(self, states, actions, rewards, optimizer, gamma=None, verbose=True):
        """Run one REINFORCE pass over an episode, updating after every step.

        The trajectory is walked backwards (t = T-1, ..., 0) while the return
        ``G`` is accumulated. The sequences are aligned from their ends, so
        ``rewards`` may carry a leading dummy entry for t = 0.

        Args:
            states: States visited during the episode.
            actions: Actions taken in those states.
            rewards: Rewards received; ``rewards[-1]`` follows ``actions[-1]``.
            optimizer: Optimizer over this module's parameters.
            gamma: Discount factor. Defaults to the module-level ``GAMMA``.
            verbose: When True (default) print the per-step diagnostics.

        Returns:
            The loss tensor of the last processed step (t = 0), or ``None``
            when the trajectory is empty.
        """
        if gamma is None:
            gamma = GAMMA
        G = 0
        loss = None
        # r_tt represents r_{t+1}
        for s_t, a_t, r_tt in zip(reversed(states), reversed(actions), reversed(rewards)):
            if verbose:
                print("State input: ", s_t)
                print("-------------------------------------------")
                print("Action: ", a_t)
                print("-------------------------------------------")
                print("Immediate reward: ", r_tt)
                print("-------------------------------------------")
            # float() keeps G a plain Python number even when the environment
            # returns NumPy scalars.
            G = float(r_tt) + gamma * G
            if verbose:
                print("Return: ", G)
                print("-------------------------------------------")
            # Use the log-probability directly instead of log(pi + eps).
            loss = (-1.0) * G * self.log_prob(s_t, a_t)
            # update policy parameter \theta
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        return loss
        

## 3. Training implementation 

In [None]:
# Train one REINFORCE agent per (environment, seed) pair, save each policy and
# write the smoothed learning curves to a CSV file and a PNG plot.
env_names = ['MountainCarContinuous-v0'] #,'LunarLanderContinuous-v2'
seeds = [1,37,42] #
MAX_EPISODES = 2000


ALPHA = 0.01
GAMMA = 1

# torch.save / DataFrame.to_csv / savefig do not create missing directories.
os.makedirs('model', exist_ok=True)
os.makedirs('plots', exist_ok=True)

for i in env_names:
    print("ENVIRONMENT:-----------", i)
    env = gym.make(i)
    MAX_TIMESTEPS = 200 #env.spec.max_episode_steps
    res = []
    for seed in seeds:
        print("Seed: ---------------------", seed)
        rewards_nonaver = []
        aver_reward = []
        aver = deque(maxlen = 100)  # sliding window for the 100-episode average
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        print("Action size", action_size)
        agent = reinforce(state_size, action_size, seed)
        optimizer = optim.Adam(agent.parameters(), lr=ALPHA)

        for i_episode in tqdm(range(1,MAX_EPISODES+1)):
            # Gym >= 0.26: reset() returns (observation, info).
            state, _ = env.reset()
            states = []
            actions = []
            rewards = [0]   # no reward at t = 0
            score = 0
            timestep = 0
            while True:
                state = torch.tensor(state)

                action = agent.get_action(state, action_size)
                states.append(state)
                actions.append(action)
                step_result = env.step(action)
                if len(step_result) == 5:
                    # Gym >= 0.26: (obs, reward, terminated, truncated, info)
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result
                state = next_state
                rewards.append(reward)

                score = score + reward

                timestep = timestep + 1
                if timestep == MAX_TIMESTEPS or done:
                    break
            print(f"After finish episode {i_episode}, we obtain trajectory: ")
            print("-------------------------------------------")
            print("States: ", states)
            print("---------------------------------------------")
            print("Actions: ", actions)
            print("---------------------------------------------")
            print("Rewards ", rewards)
            print("---------------------------------------------")
            # Update exactly once per episode; calling update_weight a second
            # time just to print the loss would apply a second gradient pass.
            loss = agent.update_weight(states, actions, rewards, optimizer)
            if i_episode < 10 or i_episode > MAX_EPISODES - 20:
                print(f"For episode {i_episode}")
                print("Loss: ", loss)

            aver.append(score)
            aver_reward.append(np.mean(aver))
            rewards_nonaver.append(score)

        policy_filename = f"model/Seed{seed}_{i}_REINFORCE_{datetime.now().strftime('%Y%m%d%H%M%S')}.pt"
        torch.save(agent.state_dict(), policy_filename)
        res.append(aver_reward)
#         res.append(rewards_nonaver) #in case want to retrieve non smooth result
        print("SAVED MODEL SUCCESSFULLY")


    fig = plt.figure()

    reward_name = 'plots/' + i + '_REINFORCE_result' + str(datetime.now().strftime("%Y%m%d%H%M%S"))
    # One column per seed, whatever the number of seeds.
    df = pd.DataFrame({str(seed): curve for seed, curve in zip(seeds, res)})
    df.to_csv(reward_name + '.csv')
    print("Saved rewards for plot successfully")

    env.close()
    print("------------------------End Environment-------------------")

    plt.xlabel("Episode")
    plt.ylabel("Reward")

    # Plot rewards for each seed
    for seed in seeds:
        plt.plot(df[str(seed)], label='Seed ' + str(seed))

    plt.title('Learning Curve ' + i)

    # Insert the legends in the plot
    fig.legend(loc='lower right')
    fig.savefig(reward_name + '.png', dpi=100)
    
    

## Simulation 

In [None]:
# Replay a trained REINFORCE policy in a rendered environment.
import torch
import gym

# Step 1: Load the saved policy weights (a state_dict written by torch.save)
model = torch.load('model/Seed42_MountainCarContinuous-v0_REINFORCE_20240502160736.pt')

# Step 2: Set up Environment
seed = 1
env_name = 'MountainCarContinuous-v0' #'LunarLanderContinuous-v2'
env = gym.make(env_name, render_mode = 'human')
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
agent = reinforce(state_size, action_size, seed)
# Without this the agent would act with randomly initialised weights and the
# loaded checkpoint would be ignored.
agent.load_state_dict(model)
agent.eval()

# Step 3: Run Simulation
num_episodes = 10  # You can adjust the number of episodes you want to run
try:
    for episode in range(num_episodes):
        # Gym >= 0.26: reset() returns (observation, info).
        state, _ = env.reset()
        env.render()
        done = False
        total_reward = 0
        timesteps = 0  # number of environment steps taken so far
        while not done:
            # Sample an action from the policy for the current observation
            action = agent.get_action(torch.tensor(state), action_size)

            step_result = env.step(action)
            if len(step_result) == 5:
                # Gym >= 0.26: (obs, reward, terminated, truncated, info).
                # Honour truncation too, otherwise an agent that never reaches
                # the goal keeps this loop running past the time limit.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            state = next_state

            # Update total reward
            total_reward += reward

            timesteps = timesteps + 1

        print(f"Episode {episode+1} ends in {timesteps} timesteps, Total Reward: {total_reward}")
finally:
    env.close()  # Close the environment after running the simulation

## Shaded plot 

In [None]:
# Plot the mean learning curve across seeds with a +/- one standard deviation band.

# Step 1: Read Data from CSV
df = pd.read_csv('plots/MountainCarContinuous-v0_REINFORCE_result20240428024956.csv')

# Step 2: Convert the DataFrame to numeric values
df = df.apply(pd.to_numeric)

# Step 3: Drop the first column (the episode index written by to_csv) and keep
# every per-seed reward column, however many seeds were trained.
later_columns = df.iloc[:, 1:]

# Step 4: Calculate Mean and Standard Deviation across seeds for each episode
mean_values = later_columns.mean(axis=1)
std_values = later_columns.std(axis=1)
print("Standard deviation: ", std_values)
print("Mean: ", mean_values)

# Step 5: Plot the Data
plt.figure(figsize=(12, 6))
plt.plot(mean_values, label='Mean Sum of Rewards', color = "red")
plt.fill_between(mean_values.index, mean_values - std_values, mean_values + std_values, alpha=0.5, label='Standard Deviation', color = "orange")
plt.xlabel('Episode')
plt.ylabel('Sum of Rewards')
plt.title('Shaded Plot for REINFORCE Algorithm, MountainCarContinuous-v0')
# Add the legend before saving so it is part of the written image.
plt.legend()
plt.savefig('plots/shaded_plot_REINFORCE_MountainCarContinuous.png')
plt.show()