In [None]:
# all necessary imports
import numpy as np 
import matplotlib.pyplot as plt
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from gym import spaces
import math

ENVIRONMENT

In [2]:
class CustomPongEnv(gym.Env):
    """Minimal Pong-like environment with a pluggable reward/termination rule.

    The environment only moves the paddle and the ball; everything else
    (bounces, scoring, episode termination and re-initialisation of the ball)
    is delegated to ``reward_fn``, which receives the environment instance and
    returns ``(reward, done)``.

    The difficulty attributes (``ball_x``, ``ball_y``, ``ball_vx``, ``ball_vy``,
    ``paddle_size``, ``paddle_speed``) are NOT created by ``__init__``:
    ``setDifficulty`` must be called before ``reset``/``step``.
    """
    metadata = {"render_modes": ["human"]}

    def __init__(self, reward_fn=None):
        """Create the environment.

        Args:
            reward_fn: callable ``reward_fn(env) -> (reward, done)`` invoked
                once per step. It is passed in because it changes from stage
                to stage. ``step`` fails if it is left as ``None``.
        """
        super().__init__()

        # pass in reward function as it changes stage to stage
        self.reward_fn = reward_fn

        # initial game state
        self.width, self.height = 1.0, 1.0
        self.paddle_y = 0.5

        # action space: 0 = stay, 1 = up, 2 = down
        self.action_space = spaces.Discrete(3)

        # observation space: [ball_x, ball_y, ball_vx, ball_vy, paddle_y]
        # The bounds are left open: velocities are signed (e.g. -0.2) and the
        # ball position overshoots [0, 1] by up to one step before a bounce is
        # applied, so a [0, 1] box would not contain the emitted observations.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32
        )

        # number of times reset() has been called
        self.episode_count = 0

    def reset(self):
        """Count a new episode and return the current observation.

        No state is re-initialised here; in this file the reward functions
        call ``setDifficulty`` themselves when an episode ends.
        """
        self.episode_count += 1
        return self.get_obs()

    def step(self, action):
        """Advance the simulation by one step.

        Args:
            action: 0 = stay, 1 = move paddle up, 2 = move paddle down.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is always an
            empty dict and ``reward``/``done`` come from ``reward_fn``.
        """
        if action == 1:
            self.paddle_y += self.paddle_speed # move up
        elif action == 2:
            self.paddle_y -= self.paddle_speed # move down

        # make sure paddle doesnt go off screen
        self.paddle_y = np.clip(self.paddle_y, self.paddle_size /2, 1 - self.paddle_size /2)

        ## move the ball
        self.ball_x += self.ball_vx
        self.ball_y += self.ball_vy

        # reward_fn may mutate the environment (bounces, resets)
        reward, done = self.reward_fn(self)

        return self.get_obs(), reward, done, {}

    def get_obs(self):
        """Return ``[ball_x, ball_y, ball_vx, ball_vy, paddle_y]`` as float32."""
        return np.array([
            self.ball_x,
            self.ball_y,
            self.ball_vx,
            self.ball_vy,
            self.paddle_y
        ], dtype = np.float32)

    # initialize environment difficulty parameters
    def setDifficulty(self, ball_vx, ball_vy, paddle_size, ball_x, ball_y, paddle_speed):
        """Set the ball state, paddle size and paddle speed in one call.

        The paddle position ``paddle_y`` is not modified.
        """
        self.paddle_size = paddle_size
        self.ball_x = ball_x
        self.ball_y = ball_y
        self.ball_vx = ball_vx
        self.ball_vy = ball_vy
        self.paddle_speed = paddle_speed



PPO + Curriculum Learning

In [4]:
from stable_baselines3.common.callbacks import BaseCallback

# reward callback to track rewards during episodes
class RewardCallback(BaseCallback):
    """Training callback that records reward statistics.

    Only the first environment of the vectorised env is tracked (index 0 of
    ``locals["rewards"]`` / ``locals["dones"]``).

    Per-step histories: ``timesteps`` and ``rewards`` (cumulative reward).
    Per-episode histories: ``episode_number``, ``episode_rewards`` and
    ``episode_lengths``.
    """

    def __init__(self):
        super().__init__()

        # running counters
        self.total_timesteps = 0
        self.cumulative_reward = 0
        self.current_episode = 0
        self.current_episode_reward = 0
        self.current_episode_length = 0

        # recorded histories
        self.timesteps = []
        self.rewards = []
        self.episode_number = []
        self.episode_rewards = []
        self.episode_lengths = []

    def _on_step(self) -> bool:
        """Update the statistics after an environment step; never stops training."""
        step_reward = self.locals["rewards"][0]
        episode_over = self.locals["dones"][0]

        # advance the running counters
        self.total_timesteps += 1
        self.cumulative_reward += step_reward
        self.current_episode_reward += step_reward
        self.current_episode_length += 1

        # per-timestep history
        self.rewards.append(self.cumulative_reward)
        self.timesteps.append(self.total_timesteps)

        if not episode_over:
            return True

        # episode finished: store its totals ...
        self.episode_rewards.append(self.current_episode_reward)
        self.episode_number.append(self.current_episode)
        self.episode_lengths.append(self.current_episode_length)

        # ... and start accumulating the next one
        self.current_episode += 1
        self.current_episode_reward = 0
        self.current_episode_length = 0

        return True


In [5]:
## Stage 1
def stage1_reward(env):
    """Stage 1 reward: vertical tracking of a ball that stays on the right edge.

    Gives a small time-alive bonus plus a bonus for the paddle being vertically
    close to the ball, +1 when the paddle covers the ball at the right edge and
    -1 (ending the episode and re-initialising the environment) when it does not.

    Returns:
        ``(reward, done)``.
    """
    vertical_gap = abs(env.paddle_y - env.ball_y)

    # time alive bonus + closeness bonus
    reward = 0.01
    reward += 0.1 * (1.0 - vertical_gap)

    # bounce off the top / bottom wall
    if env.ball_y >= 1.0 or env.ball_y <= 0.0:
        env.ball_vy *= -1

    # nothing else happens until the ball is at the right edge
    if not env.ball_x >= 1.0:
        return reward, False

    # paddle covers the ball: reward the hit and keep playing
    if vertical_gap < env.paddle_size / 2:
        reward += 1.0
        return reward, False

    # paddle missed: penalise, end the episode and restore the stage 1 setup
    reward -= 1
    env.setDifficulty(ball_vx=0, ball_vy=0.2, paddle_size=0.5, ball_x=1.0, ball_y=0.5, paddle_speed=0.3)
    return reward, True

def stage1(cycle_number):
    """Train a fresh PPO agent on the stage 1 task and save its learning curves.

    Args:
        cycle_number: identifier of the current run; only used in the file
            name of the saved figure (``"Stage 1- {cycle_number}.png"``).

    Returns:
        The PPO model after 50000 training timesteps.
    """
    # initialize new environment and set stage 1 difficulty
    env1 = CustomPongEnv(reward_fn=stage1_reward)
    callback_1 = RewardCallback()
    env1.setDifficulty(ball_vx=0, ball_vy=0.2, paddle_size=0.5, ball_x=1.0, ball_y=0.5, paddle_speed=0.3)

    # initialize new PPO model and start learning
    model = PPO("MlpPolicy", make_vec_env(lambda: env1, n_envs=1), verbose=1)
    model.learn(total_timesteps=50000, callback=callback_1)

    plt.figure(figsize=(20, 5))

    # Plot 1: Cumulative Reward vs Timesteps
    plt.subplot(1, 3, 1)
    plt.plot(callback_1.timesteps, callback_1.rewards)
    plt.xlabel("Timesteps")
    plt.ylabel("Cumulative Reward")
    plt.title("Cumulative Reward vs Timesteps")
    plt.grid(True)

    # Plot 2: Episode Reward vs Episode Number
    def moving_avg(data, window=500):
        """Moving average used to smooth the curves.

        The window is clamped to the number of samples: with mode='valid'
        np.convolve swaps its operands when the kernel is longer than the
        data, which would return MORE points than episodes (and mismatched
        x/y lengths in plot 3). Empty input, which np.convolve rejects,
        yields an empty result.
        """
        data = np.asarray(data, dtype=float)
        if data.size == 0:
            return data
        window = min(window, data.size)
        return np.convolve(data, np.ones(window) / window, mode='valid')

    smoothed_rewards = moving_avg(callback_1.episode_rewards)
    smoothed_episodes = range(len(smoothed_rewards))  # optionally shift to align

    plt.subplot(1, 3, 2)
    step = 100  # plot every 100th smoothed point
    plt.plot(smoothed_episodes[::step], smoothed_rewards[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Total Reward per Episode")
    plt.grid(True)

    # Plot 3: Normalized Episode Reward vs Episode Number
    # (episode reward divided by episode length)
    normalized_rewards = [
        reward / length if length > 0 else 0
        for reward, length in zip(callback_1.episode_rewards, callback_1.episode_lengths)
    ]

    smoothed_normalized = moving_avg(normalized_rewards)
    # align each smoothed value with the last episode of its window
    smoothed_episode_numbers = callback_1.episode_number[len(callback_1.episode_number) - len(smoothed_normalized):]

    plt.subplot(1, 3, 3)
    plt.plot(smoothed_episode_numbers[::step], smoothed_normalized[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward Averaged over Timesteps")
    plt.title("Normalized Reward per Episode")
    plt.grid(True)
    save_path = f"Stage 1- {cycle_number}.png"
    plt.savefig(save_path)


    return model

In [6]:
# Stage 2
def stage2_reward(env):
    """Stage 2 reward: the ball travels horizontally and bounces off all walls.

    Shaping is a time-alive bonus plus a bonus for the paddle (at x = 1.0)
    being close to the ball. A hit at the right edge gives +1 and sends the
    ball back; a miss gives -1, ends the episode and re-initialises the
    environment with a random vertical ball velocity.

    Returns:
        ``(reward, done)``.
    """
    def jitter():
        # small random perturbation added to a velocity after a bounce
        return np.random.uniform(-0.002, 0.002)

    # closeness shaping: Euclidean paddle-ball distance, normalised by the
    # diagonal of the unit square and clipped to [0, 1]
    gap = math.sqrt((env.paddle_y - env.ball_y) ** 2 + (1.0 - env.ball_x) ** 2)
    closeness = 1.0 - np.clip(gap / math.sqrt(2.0), 0.0, 1.0)

    reward = 0.01  # time alive bonus
    reward += 0.1 * closeness

    episode_over = False

    # top / bottom wall
    if env.ball_y >= 1.0 or env.ball_y <= 0.0:
        env.ball_vy *= -1
        env.ball_vx += jitter()

    # right edge: either the paddle returns the ball or the episode ends
    if env.ball_x >= 1.0:
        paddle_covers_ball = abs(env.ball_y - env.paddle_y) < env.paddle_size / 2

        if not paddle_covers_ball:
            reward -= 1.0
            episode_over = True

            # restore the initial stage 2 difficulty levels
            env.setDifficulty(ball_vx=0.2, ball_vy=np.random.choice([-0.2, -0.1, 0.1, 0.2]), paddle_size=0.5, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)
        else:
            reward += 1.0
            env.ball_vx *= -1

            # perturb both velocity components to introduce variability
            env.ball_vx += jitter()
            env.ball_vy += jitter()

    # left wall (evaluated after a possible reset, as before)
    if env.ball_x <= 0.0:
        env.ball_vx *= -1
        env.ball_vy += jitter()

    return reward, episode_over

def stage2(cycle_number, model):
    """Continue training ``model`` on the stage 2 task and save its learning curves.

    Args:
        cycle_number: identifier of the current run; only used in the file
            name of the saved figure (``"Stage 2- {cycle_number}.png"``).
        model: the PPO model returned by the previous stage.

    Returns:
        The same model after 100000 further training timesteps.
    """
    # initialize new environment and set stage 2 difficulty
    callback_2 = RewardCallback()
    env2 = CustomPongEnv(reward_fn=stage2_reward)
    env2.setDifficulty(ball_vx=0.2, ball_vy=0.2, paddle_size=0.5, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)

    # use stage 1 ppo model to continue learning on environment 2
    model.set_env(make_vec_env(lambda: env2, n_envs=1))
    model.learn(total_timesteps=100000, callback=callback_2)

    plt.figure(figsize=(20, 5))

    # Plot 1: Cumulative Reward vs Timesteps
    plt.subplot(1, 3, 1)
    plt.plot(callback_2.timesteps, callback_2.rewards)
    plt.xlabel("Timesteps")
    plt.ylabel("Cumulative Reward")
    plt.title("Cumulative Reward vs Timesteps")
    plt.grid(True)

    # Plot 2: Episode Reward vs Episode Number
    def moving_avg(data, window=500):
        """Moving average used to smooth the curves.

        The window is clamped to the number of samples: with mode='valid'
        np.convolve swaps its operands when the kernel is longer than the
        data, which would return MORE points than episodes (and mismatched
        x/y lengths in plot 3). Empty input, which np.convolve rejects,
        yields an empty result.
        """
        data = np.asarray(data, dtype=float)
        if data.size == 0:
            return data
        window = min(window, data.size)
        return np.convolve(data, np.ones(window) / window, mode='valid')

    smoothed_rewards = moving_avg(callback_2.episode_rewards)
    smoothed_episodes = range(len(smoothed_rewards))

    plt.subplot(1, 3, 2)
    step = 100 # plot every 100 steps
    plt.plot(smoothed_episodes[::step], smoothed_rewards[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Total Reward per Episode")
    plt.grid(True)

    # Plot 3: Normalized Episode Reward vs Episode Number
    # (episode reward divided by episode length)
    normalized_rewards = [
        reward / length if length > 0 else 0
        for reward, length in zip(callback_2.episode_rewards, callback_2.episode_lengths)
    ]

    smoothed_normalized = moving_avg(normalized_rewards)
    # align each smoothed value with the last episode of its window
    smoothed_episode_numbers = callback_2.episode_number[len(callback_2.episode_number) - len(smoothed_normalized):]

    plt.subplot(1, 3, 3)
    plt.plot(smoothed_episode_numbers[::step], smoothed_normalized[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward Averaged over Timesteps")
    plt.title("Normalized Reward per Episode")
    plt.grid(True)

    save_path = f"Stage 2- {cycle_number}.png"
    plt.savefig(save_path)

    return model

In [7]:
# Stage 3
def stage3_reward(env):
    """Stage 3 reward: like stage 2 but with smaller hit/miss rewards (+/-0.2)
    and a random horizontal AND vertical ball velocity after every miss.

    Returns:
        ``(reward, done)``.
    """
    velocity_choices = [-0.2, -0.1, 0.1, 0.2]

    # shaping term: time alive bonus + closeness of the paddle (x = 1.0) to the ball
    dist = math.sqrt((env.paddle_y - env.ball_y) ** 2 + (1.0 - env.ball_x) ** 2)
    normalized_dist = np.clip(dist / math.sqrt(2.0), 0.0, 1.0)
    reward = 0.01 + 0.1 * (1.0 - normalized_dist)

    done = False

    # bounce on the top or bottom wall, with a little horizontal noise
    hit_horizontal_wall = env.ball_y <= 0.0 or env.ball_y >= 1.0
    if hit_horizontal_wall:
        env.ball_vy *= -1
        env.ball_vx += np.random.uniform(-0.002, 0.002)

    # ball reached the right edge
    if env.ball_x >= 1.0:
        half_paddle = env.paddle_size / 2

        if abs(env.ball_y - env.paddle_y) < half_paddle:
            # paddle returns the ball; noise on both components adds variability
            reward += 0.2
            env.ball_vx *= -1
            env.ball_vx += np.random.uniform(-0.002, 0.002)
            env.ball_vy += np.random.uniform(-0.002, 0.002)
        else:
            # paddle missed: the episode ends and stage 3 difficulty is restored
            reward -= 0.2
            done = True
            new_vx = np.random.choice(velocity_choices)
            new_vy = np.random.choice(velocity_choices)
            env.setDifficulty(ball_vx=new_vx, ball_vy=new_vy, paddle_size=0.5, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)

    # bounce on the left wall, with a little vertical noise
    if env.ball_x <= 0.0:
        env.ball_vx *= -1
        env.ball_vy += np.random.uniform(-0.002, 0.002)

    return reward, done

def stage3(cycle_number, model):
    """Continue training ``model`` on the stage 3 task and save its learning curves.

    Args:
        cycle_number: identifier of the current run; only used in the file
            name of the saved figure (``"Stage 3- {cycle_number}.png"``).
        model: the PPO model returned by the previous stage.

    Returns:
        The same model after 150000 further training timesteps.
    """
    # initialize new environment and set stage 3 difficulty
    callback_3 = RewardCallback()
    env3 = CustomPongEnv(reward_fn=stage3_reward)
    env3.setDifficulty(ball_vx=0.2, ball_vy=0.2, paddle_size=0.5, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)

    # use stage 2 ppo model to continue learning on environment 3
    model.set_env(make_vec_env(lambda: env3, n_envs=1))
    model.learn(total_timesteps=150000, callback=callback_3)

    plt.figure(figsize=(20, 5))

    # Plot 1: Cumulative Reward vs Timesteps
    plt.subplot(1, 3, 1)
    plt.plot(callback_3.timesteps, callback_3.rewards)
    plt.xlabel("Timesteps")
    plt.ylabel("Cumulative Reward")
    plt.title("Cumulative Reward vs Timesteps")
    plt.grid(True)

    # Plot 2: Episode Reward vs Episode Number
    def moving_avg(data, window=50):
        """Moving average used to smooth the curves.

        The window is clamped to the number of samples: with mode='valid'
        np.convolve swaps its operands when the kernel is longer than the
        data, which would return MORE points than episodes (and mismatched
        x/y lengths in plot 3). Empty input, which np.convolve rejects,
        yields an empty result.
        """
        data = np.asarray(data, dtype=float)
        if data.size == 0:
            return data
        window = min(window, data.size)
        return np.convolve(data, np.ones(window) / window, mode='valid')

    smoothed_rewards = moving_avg(callback_3.episode_rewards)
    smoothed_episodes = range(len(smoothed_rewards))

    plt.subplot(1, 3, 2)
    step = 25  # plot every 25 steps
    plt.plot(smoothed_episodes[::step], smoothed_rewards[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Total Reward per Episode")
    plt.grid(True)

    # Plot 3: Normalized Episode Reward vs Episode Number
    # (episode reward divided by episode length)
    normalized_rewards = [
        reward / length if length > 0 else 0
        for reward, length in zip(callback_3.episode_rewards, callback_3.episode_lengths)
    ]

    smoothed_normalized = moving_avg(normalized_rewards)
    # align each smoothed value with the last episode of its window
    smoothed_episode_numbers = callback_3.episode_number[len(callback_3.episode_number) - len(smoothed_normalized):]

    plt.subplot(1, 3, 3)
    plt.plot(smoothed_episode_numbers[::step], smoothed_normalized[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward Averaged over Timesteps")
    plt.title("Normalized Reward per Episode")
    plt.grid(True)
    save_path = f"Stage 3- {cycle_number}.png"
    plt.savefig(save_path)

    return model

In [8]:
# Stage 4
def stage4_reward(env):
    """Stage 4 reward: like stage 3, but after a miss the ball also restarts
    from a random position on a 0.1-spaced grid inside the field.

    Returns:
        ``(reward, done)``.
    """
    def noise():
        # small random velocity perturbation
        return np.random.uniform(-0.002, 0.002)

    # time alive bonus
    reward = 0.01

    # closeness bonus: distance from the paddle (at x = 1.0) to the ball,
    # normalised by the diagonal of the unit square
    delta_y = env.paddle_y - env.ball_y
    delta_x = 1.0 - env.ball_x
    scaled = math.sqrt(delta_y ** 2 + delta_x ** 2) / math.sqrt(2.0)
    reward += 0.1 * (1.0 - np.clip(scaled, 0.0, 1.0))

    finished = False

    # top / bottom wall bounce
    if env.ball_y <= 0.0 or env.ball_y >= 1.0:
        env.ball_vy *= -1
        env.ball_vx += noise()

    # right edge
    if env.ball_x >= 1.0:
        missed = not (abs(env.ball_y - env.paddle_y) < env.paddle_size / 2)

        if missed:
            # episode ends; restart with the stage 4 randomisation
            reward -= 0.2
            finished = True
            env.setDifficulty(
                ball_vx=np.random.choice([-0.2, -0.1, 0.1, 0.2]),
                ball_vy=np.random.choice([-0.2, -0.1, 0.1, 0.2]),
                paddle_size=0.5,
                ball_x=np.random.choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                ball_y=np.random.choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                paddle_speed=0.3,
            )
        else:
            # ball is returned, with noise on both velocity components
            reward += 0.2
            env.ball_vx *= -1
            env.ball_vx += noise()
            env.ball_vy += noise()

    # left wall bounce
    if env.ball_x <= 0.0:
        env.ball_vx *= -1
        env.ball_vy += noise()

    return reward, finished

def stage4(cycle_number, model):
    """Continue training ``model`` on the stage 4 task and save its learning curves.

    Args:
        cycle_number: identifier of the current run; only used in the file
            name of the saved figure (``"Stage 4- {cycle_number}.png"``).
        model: the PPO model returned by the previous stage.

    Returns:
        The same model after 150000 further training timesteps.
    """
    # initialize new environment and set stage 4 difficulty
    callback_4 = RewardCallback()
    env4 = CustomPongEnv(reward_fn=stage4_reward)
    env4.setDifficulty(ball_vx=0.2, ball_vy=0.2, paddle_size=0.5, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)

    # use stage 3 ppo model to continue learning on environment 4
    model.set_env(make_vec_env(lambda: env4, n_envs=1))
    model.learn(total_timesteps=150000, callback=callback_4)

    plt.figure(figsize=(20, 5))

    # Plot 1: Cumulative Reward vs Timesteps
    plt.subplot(1, 3, 1)
    plt.plot(callback_4.timesteps, callback_4.rewards)
    plt.xlabel("Timesteps")
    plt.ylabel("Cumulative Reward")
    plt.title("Cumulative Reward vs Timesteps")
    plt.grid(True)

    # Plot 2: Episode Reward vs Episode Number
    def moving_avg(data, window=50):
        """Moving average used to smooth the curves.

        The window is clamped to the number of samples: with mode='valid'
        np.convolve swaps its operands when the kernel is longer than the
        data, which would return MORE points than episodes (and mismatched
        x/y lengths in plot 3). Empty input, which np.convolve rejects,
        yields an empty result.
        """
        data = np.asarray(data, dtype=float)
        if data.size == 0:
            return data
        window = min(window, data.size)
        return np.convolve(data, np.ones(window) / window, mode='valid')

    smoothed_rewards = moving_avg(callback_4.episode_rewards)
    smoothed_episodes = range(len(smoothed_rewards))
    plt.subplot(1, 3, 2)
    step = 25  # plot every 25 steps
    plt.plot(smoothed_episodes[::step], smoothed_rewards[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Total Reward per Episode")
    plt.grid(True)

    # Plot 3: Normalized Episode Reward vs Episode Number
    # (episode reward divided by episode length)
    normalized_rewards = [
        reward / length if length > 0 else 0
        for reward, length in zip(callback_4.episode_rewards, callback_4.episode_lengths)
    ]

    smoothed_normalized = moving_avg(normalized_rewards)
    # align each smoothed value with the last episode of its window
    smoothed_episode_numbers = callback_4.episode_number[len(callback_4.episode_number) - len(smoothed_normalized):]

    plt.subplot(1, 3, 3)
    plt.plot(smoothed_episode_numbers[::step], smoothed_normalized[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward Averaged over Timesteps")
    plt.title("Normalized Reward per Episode")
    plt.grid(True)
    save_path = f"Stage 4- {cycle_number}.png"
    plt.savefig(save_path)

    return model

In [9]:
# Stage 5
def stage5_reward(env, min_paddle_size=0.05):
    """Stage 5 reward: stage 4 dynamics with a paddle that shrinks over time.

    Shaping is a time-alive bonus plus a bonus for the paddle (at x = 1.0)
    being close to the ball. A hit at the right edge gives +0.2 and returns
    the ball; a miss gives -0.2, ends the episode and re-initialises the
    environment with random ball position and velocity. On a miss in every
    100th episode the paddle shrinks by 0.05.

    Args:
        env: environment exposing the ball/paddle attributes,
            ``episode_count`` and ``setDifficulty``.
        min_paddle_size: lower bound for the shrinking paddle. Without a
            bound the size reaches 0 after ten reductions and then turns
            negative, at which point the hit test can never succeed.

    Returns:
        ``(reward, done)``.
    """
    reward = 0.01  # time alive bonus

    # reward distance to the ball
    dist = math.sqrt((env.paddle_y - env.ball_y) ** 2 + (1.0 - env.ball_x) ** 2)
    max_dist = math.sqrt((1.0 - 0.0)**2 + (1.0 - 0.0)**2)  # ~1.414
    normalized_dist = dist / max_dist
    normalized_dist = np.clip(normalized_dist, 0.0, 1.0)
    reward += 0.1 * (1.0 - normalized_dist)

    done = False

    # ball hits top or bottom
    if env.ball_y <= 0.0 or env.ball_y >= 1.0:
        env.ball_vy *= -1
        env.ball_vx += np.random.uniform(-0.002, 0.002)

    # ball goes out of bounds on the right
    if env.ball_x >= 1.0:

        # paddle succesfully hits ball back
        if abs(env.ball_y - env.paddle_y) < env.paddle_size / 2:
            reward += 0.2
            env.ball_vx *= -1

            # add random noise to the ball's velocity to introduce variability
            env.ball_vx += np.random.uniform(-0.002, 0.002)
            env.ball_vy += np.random.uniform(-0.002, 0.002)

        # paddle misses ball, episode ends
        else:
            reward -= 0.2
            done = True

            # every 100 episodes, decrease paddle size by 0.05, but never
            # below min_paddle_size so the paddle keeps a positive extent
            if (env.episode_count % 100 == 0):
                env.paddle_size = max(env.paddle_size - 0.05, min_paddle_size)

            # reset environment to initial stage 5 difficulty levels
            env.setDifficulty(ball_vx=np.random.choice([-0.2, -0.1, 0.1, 0.2]), ball_vy=np.random.choice([-0.2, -0.1, 0.1, 0.2]), paddle_size=env.paddle_size, ball_x=np.random.choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), ball_y=np.random.choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), paddle_speed=0.3)

    # ball hits left wall
    if env.ball_x <= 0.0:
        env.ball_vx *= -1
        env.ball_vy += np.random.uniform(-0.002, 0.002)

    return reward, done

def stage5(cycle_number, model):
    """Continue training ``model`` on the stage 5 task and save its learning curves.

    Args:
        cycle_number: identifier of the current run; only used in the file
            name of the saved figure (``"Stage 5- {cycle_number}.png"``).
        model: the PPO model returned by the previous stage.

    Returns:
        The same model after 150000 further training timesteps.
    """
    # initialize new environment and set stage 5 difficulty
    callback_5 = RewardCallback()
    env5 = CustomPongEnv(reward_fn=stage5_reward)
    env5.setDifficulty(ball_vx=0.2, ball_vy=0.2, paddle_size=0.5, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)

    # use stage 4 ppo model to continue learning on environment 5
    model.set_env(make_vec_env(lambda: env5, n_envs=1))
    model.learn(total_timesteps=150000, callback=callback_5)

    plt.figure(figsize=(20, 5))

    # Plot 1: Cumulative Reward vs Timesteps
    plt.subplot(1, 3, 1)
    plt.plot(callback_5.timesteps, callback_5.rewards)
    plt.xlabel("Timesteps")
    plt.ylabel("Cumulative Reward")
    plt.title("Cumulative Reward vs Timesteps")
    plt.grid(True)

    # Plot 2: Episode Reward vs Episode Number
    def moving_avg(data, window=20):
        """Moving average used to smooth the curves.

        The window is clamped to the number of samples: with mode='valid'
        np.convolve swaps its operands when the kernel is longer than the
        data, which would return MORE points than episodes (and mismatched
        x/y lengths in plot 3). Empty input, which np.convolve rejects,
        yields an empty result.
        """
        data = np.asarray(data, dtype=float)
        if data.size == 0:
            return data
        window = min(window, data.size)
        return np.convolve(data, np.ones(window) / window, mode='valid')

    smoothed_rewards = moving_avg(callback_5.episode_rewards)
    smoothed_episodes = range(len(smoothed_rewards))

    plt.subplot(1, 3, 2)
    step = 25  # plot every 25 steps
    plt.plot(smoothed_episodes[::step], smoothed_rewards[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Total Reward per Episode")
    plt.grid(True)

    # Plot 3: Normalized Episode Reward vs Episode Number
    # (episode reward divided by episode length)
    normalized_rewards = [
        reward / length if length > 0 else 0
        for reward, length in zip(callback_5.episode_rewards, callback_5.episode_lengths)
    ]

    smoothed_normalized = moving_avg(normalized_rewards)
    # align each smoothed value with the last episode of its window
    smoothed_episode_numbers = callback_5.episode_number[len(callback_5.episode_number) - len(smoothed_normalized):]

    plt.subplot(1, 3, 3)
    plt.plot(smoothed_episode_numbers[::step], smoothed_normalized[::step], markersize=2)
    plt.xlabel("Episode")
    plt.ylabel("Reward Averaged over Timesteps")
    plt.title("Normalized Reward per Episode")
    plt.grid(True)

    save_path = f"Stage 5- {cycle_number}.png"
    plt.savefig(save_path)

    return model

In [None]:
# Run the five curriculum stages in order. Each stage after the first keeps
# training the model returned by the previous one. Increase the range to
# repeat the whole curriculum several times and keep the best plots.
for i in range(1):
    model = stage1(i)
    for later_stage in (stage2, stage3, stage4, stage5):
        model = later_stage(i, model)

PPO Only

In [None]:
def ppo_reward(env):
    """Sparse reward for the PPO-only baseline (no shaping terms).

    +1 when the paddle returns the ball at the right edge, -1 when it misses
    (which ends the episode and re-initialises the environment with random
    ball position/velocity and a paddle size of 0.3), otherwise 0.

    Returns:
        ``(reward, done)``.
    """
    speeds = [-0.2, -0.1, 0.1, 0.2]
    positions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    def jitter():
        # small random velocity perturbation applied on bounces
        return np.random.uniform(-0.002, 0.002)

    reward = 0
    done = False

    # top / bottom wall
    if env.ball_y >= 1.0 or env.ball_y <= 0.0:
        env.ball_vy *= -1
        env.ball_vx += jitter()

    # right edge: hit or miss
    if env.ball_x >= 1.0:
        returned = abs(env.ball_y - env.paddle_y) < env.paddle_size / 2

        if returned:
            reward += 1.0
            env.ball_vx *= -1

            # noise on both velocity components to introduce variability
            env.ball_vx += jitter()
            env.ball_vy += jitter()
        else:
            reward -= 1.0
            done = True

            # restart from a random ball state at the baseline difficulty
            next_vx = np.random.choice(speeds)
            next_vy = np.random.choice(speeds)
            next_x = np.random.choice(positions)
            next_y = np.random.choice(positions)
            env.setDifficulty(ball_vx=next_vx, ball_vy=next_vy, paddle_size=0.3, ball_x=next_x, ball_y=next_y, paddle_speed=0.3)

    # left wall
    if env.ball_x <= 0.0:
        env.ball_vx *= -1
        env.ball_vy += jitter()

    return reward, done


# PPO-only baseline: train a fresh agent directly on the hardest setup
# (sparse reward, paddle size 0.3) without any curriculum.

# initialize new pong environment and set difficulty
ppo_callback = RewardCallback()
ppo_env = CustomPongEnv(reward_fn=ppo_reward)
ppo_env.setDifficulty(ball_vx=0.2, ball_vy=0.2, paddle_size=0.3, ball_x=0.5, ball_y=0.5, paddle_speed=0.3)

# initialize new PPO model and learn for 600,000 timesteps
model = PPO("MlpPolicy", make_vec_env(lambda: ppo_env, n_envs=1), verbose=1)
model.learn(total_timesteps=600000, callback=ppo_callback)

plt.figure(figsize=(20, 5))

# Plot 1: Cumulative Reward vs Timesteps
plt.subplot(1, 3, 1)
plt.plot(ppo_callback.timesteps, ppo_callback.rewards)
plt.xlabel("Timesteps")
plt.ylabel("Cumulative Reward")
plt.title("Cumulative Reward vs Timesteps")
plt.grid(True)

# Plot 2: Episode Reward vs Episode Number
def moving_avg(data, window=500):
    """Moving average used to smooth the curves.

    The window is clamped to the number of samples: with mode='valid'
    np.convolve swaps its operands when the kernel is longer than the data,
    which would return MORE points than episodes (and mismatched x/y lengths
    in plot 3). Empty input, which np.convolve rejects, yields an empty
    result.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        return data
    window = min(window, data.size)
    return np.convolve(data, np.ones(window) / window, mode='valid')

smoothed_rewards = moving_avg(ppo_callback.episode_rewards)
smoothed_episodes = range(len(smoothed_rewards))

plt.subplot(1, 3, 2)
step = 100  # plot every 100th smoothed point
plt.plot(smoothed_episodes[::step], smoothed_rewards[::step], markersize=2)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Total Reward per Episode")
plt.grid(True)

# Plot 3: Normalized Episode Reward vs Episode Number
# (episode reward divided by episode length)
normalized_rewards = []
for i in range(len(ppo_callback.episode_rewards)):
    length = ppo_callback.episode_lengths[i]
    reward = ppo_callback.episode_rewards[i]
    normalized = reward / length if length > 0 else 0
    normalized_rewards.append(normalized)

smoothed_normalized = moving_avg(normalized_rewards)
# align each smoothed value with the last episode of its window
smoothed_episode_numbers = ppo_callback.episode_number[len(ppo_callback.episode_number) - len(smoothed_normalized):]

plt.subplot(1, 3, 3)
plt.plot(smoothed_episode_numbers[::step], smoothed_normalized[::step], markersize=2)
plt.xlabel("Episode")
plt.ylabel("Reward Averaged over Timesteps")
plt.title("Normalized Reward per Episode")
plt.grid(True)