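# In [None]:  (Jupyter cell marker left over from the notebook export; kept as a comment so the file parses as Python)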
from collections import deque
import matplotlib.pyplot as plt
import random
import numpy as np
import sys
import math
import torch
from client import Client, DEFAULT_PORT, JOINTS
from ddpg import DDPG  # Make sure DDPG class is properly imported

# Table geometry constants. None of them is referenced by the code visible in
# this file. Units are not stated here -- presumably metres; TODO confirm
# against the simulator.
TABLE_HEIGHT = 1.0
TABLE_THICKNESS = 0.08
TABLE_LENGTH = 2.4
TABLE_WIDTH = 1.4


class ActionSpace:
    """Per-dimension lower and upper bounds of an action vector.

    Attributes:
        low: NumPy array built from the ``low`` argument.
        high: NumPy array built from the ``high`` argument.
        shape: Shape of ``low``.
    """

    def __init__(self, low, high):
        lower, upper = np.array(low), np.array(high)
        self.low = lower
        self.high = upper
        # The shape is taken from the lower bound only; ``high`` is not
        # checked against it.
        self.shape = lower.shape

# Bounds of the 10 action dimensions. Joint 1 is not part of the action
# space: the actions drive joints 0, 2, 3, ..., 10 (in that order).
ACTION_SPACE = ActionSpace(
    low=[-0.1, -0.001, -0.7, -0.005, -0.7, -0.01, -0.7, -0.05, -1.4, -0.1],
    high=[0.5, 0.001, 1.5, 0.005, 0.7, 0.01, 0.7, 0.05, 0.7, 0.1],
)


def get_neutral_joint_position():
    """Return a fresh list of ``JOINTS`` joint values for the neutral pose.

    Every joint is 0.0 except the ones listed in the override table below.
    A new list is built on each call, so callers may modify the result.
    """
    shared_angle = math.pi / 3.8  # joints 5 and 7 use the same value
    overrides = {
        0: -0.2,
        2: math.pi,
        3: 0.1,
        5: shared_angle,
        7: shared_angle,
        9: math.pi / 3.5,
        10: math.pi / 2,
    }
    pose = [0.0] * JOINTS
    for joint_index, value in overrides.items():
        pose[joint_index] = value
    return pose


class OrnsteinUhlenbeckNoise:
    """Exploration noise following a discrete Ornstein-Uhlenbeck update.

    Each call to :meth:`noise` moves the internal state by
    ``theta * (mu - state) + sigma * N(0, 1)`` and returns the new state.
    """

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size    # number of noise dimensions
        self.mu = mu        # value the state is pulled towards (default 0.0)
        self.theta = theta  # strength of the pull towards mu (default 0.15)
        self.sigma = sigma  # scale of the Gaussian perturbation (default 0.2)
        self.reset()

    def reset(self):
        """Set every component of the state back to ``mu``."""
        self.state = self.mu * np.ones(self.size)

    def noise(self):
        """Advance the process by one step and return the updated state."""
        pull = self.theta * (self.mu - self.state)
        kick = self.sigma * np.random.randn(self.size)
        # A new array is bound on every step; earlier return values are
        # therefore never modified afterwards.
        self.state = self.state + (pull + kick)
        return self.state




# Sizes handed to the DDPG agent. run() reads state entries up to index 35,
# which fits inside the 38 inputs declared here.
num_inputs = 38
hidden_size = [256, 256]
# gamma and tau are passed straight to DDPG -- presumably the discount factor
# and the target-network soft-update rate; confirm in ddpg.py.
gamma = 0.7
tau = 0.1

# Build the agent once at import time, then load its checkpoint.
actor = DDPG(
    gamma=gamma,
    tau=tau,
    hidden_size=hidden_size,
    num_inputs=num_inputs,
    action_space=ACTION_SPACE,
)
actor.load_checkpoint()
def run(cli):
    """Train the module-level ``actor`` against the simulator behind ``cli``.

    For every step the actor proposes an action, the action is converted to
    joint targets by ``calc_action`` and sent with ``cli.send_joints``; the
    resulting transition ``(state, action, reward, next_state)`` is stored in
    a bounded replay buffer, from which mini-batches are sampled for
    ``actor.update_params``. A checkpoint is saved every 500 episodes and
    once more after the last episode.

    Args:
        cli: Connected client exposing ``get_state()`` and
            ``send_joints(joints)``.

    Returns:
        None.
    """
    buffer_size = 10000        # maximum number of stored transitions
    max_episodes = 5000        # number of training episodes
    max_steps = 400            # upper bound on steps per episode
    min_buffer_fill = 100      # updates start once the buffer exceeds this
    batch_size = 32
    checkpoint_interval = 500  # episodes between checkpoints

    replay_buffer = deque(maxlen=buffer_size)
    episode_rewards = []       # per-episode reward totals (plain floats)

    action_dim = ACTION_SPACE.low.shape[0]
    noise = OrnsteinUhlenbeckNoise(size=action_dim)

    for episode in range(max_episodes):
        episode_reward = 0.0
        # Reset the noise once per episode. Resetting before every step (as
        # the code did previously) puts the state back to mu each time, so
        # the OU process degenerates into uncorrelated Gaussian noise.
        # NOTE(review): assumes actor.calc_action draws from noise.noise().
        noise.reset()
        state = torch.FloatTensor(cli.get_state()).unsqueeze(0)

        for _ in range(max_steps):
            action = actor.calc_action(state, noise)

            # .numpy() shares memory with the tensor and calc_action scales
            # its argument; copy so the action stored in the replay buffer
            # stays exactly what the actor produced.
            raw_action = action.cpu().numpy()[0].copy()
            # State entry 17 is forwarded as the value for joint 1.
            joint_targets = calc_action(raw_action, state[0][17])
            cli.send_joints(joint_targets)

            next_state = torch.FloatTensor(cli.get_state()).unsqueeze(0)
            reward = get_reward(next_state[0], state[0])

            replay_buffer.append((state, action, reward, next_state))
            # Count the reward before the terminal check so the reward of
            # the final (scoring) step is part of the episode total.
            episode_reward += float(reward)

            # End the episode once either of state entries 34 / 35 has
            # increased (a point was scored).
            if state[0][35] < next_state[0][35] or state[0][34] < next_state[0][34]:
                break

            state = next_state

            # Train only once the buffer holds enough transitions.
            if len(replay_buffer) > min_buffer_fill:
                batch = random.sample(replay_buffer, batch_size)
                actor.update_params(batch)

        print(f"Episode {episode + 1}: Reward = {episode_reward}")
        episode_rewards.append(episode_reward)

        if (episode + 1) % checkpoint_interval == 0:
            actor.save_checkpoint(episode + 1, replay_buffer)

    # Final checkpoint after the last episode.
    actor.save_checkpoint(max_episodes, replay_buffer)


def get_reward(states, old_states):
    """Compute the step reward from the current and the previous state.

    The result is ``-(2 * versor + 2.5 * planar_dist + adjustment)`` while
    ``states[28]`` is truthy (game still playing, per the original comment)
    and ``torch.tensor(0.0)`` otherwise. ``adjustment`` is a cost, so
    negative adjustments increase the reward.

    Args:
        states: Current state vector (indexable, at least 35 entries).
        old_states: State vector of the previous step.

    Returns:
        The reward; a 0-d tensor when the inputs are tensors.
    """
    # Squared deviation of entries 14..16 from the fixed target values.
    target = (0.02, 0.90, 0.43)
    versor = sum((states[14 + k] - goal) ** 2 for k, goal in enumerate(target))

    # Squared per-axis differences between entries 11..13 and 17..19.
    dx2, dy2, dz2 = [(states[11 + k] - states[17 + k]) ** 2 for k in range(3)]
    planar_dist = dx2 + 0 * dy2 + dz2  # the Y term is deliberately weighted 0
    full_dist = dx2 + dy2 + dz2

    adjustment = 5 if states[13] < 0 else 0

    # Ball is hitting the paddle and did not hit the robot before.
    if full_dist < 0.05 and states[31] < 1:
        adjustment -= 10

    past_game_start = states[22] != 0

    # Ball enters the enemy area above the plate, but not at game start.
    if old_states[32] < states[32]:
        if states[19] > 0 and past_game_start:
            adjustment -= 60

    # Ball hits the enemy plate, but not at game start.
    if old_states[33] < states[33]:
        if past_game_start:
            adjustment -= 80
            print("Crazy! a Hit!")

    # Entry 34 increased: a point was scored.
    if old_states[34] < states[34]:
        adjustment -= 120
        print("Crazy! a Point!")

    # Further ideas for adjustments:
    #   - reward good ball speed after a hit
    #   - scale the adjustment up on the second ball hit

    if not states[28]:
        return torch.tensor(0.0)
    return -(2 * versor + 2.5 * planar_dist + adjustment)



# (scale, offset) applied to each raw network output, in action order. For raw
# values in [-1, 1] the results span the low/high bounds of ACTION_SPACE.
_ACTION_SCALE_OFFSET = (
    (0.3, 0.2),
    (0.001, 0.0),
    (1.1, 0.4),
    (0.005, 0.0),
    (0.7, 0.0),
    (0.01, 0.0),
    (0.7, 0.0),
    (0.05, 0.0),
    (1.05, -0.35),
    (0.1, 0.0),
)


def calc_action(action, y):
    """Turn a raw network action into absolute joint targets.

    The first ten entries of ``action`` are rescaled and added to the
    neutral joint position: entry 0 drives joint 0 and entries 1.. drive
    joints 2.. . Joint 1 is not controlled by the network and is set to
    ``y`` directly.

    The input is never modified. The caller obtains ``action`` from
    ``tensor.numpy()``, which shares memory with the tensor it stores in the
    replay buffer, so scaling in place would corrupt the stored action.

    Args:
        action: Sequence of at least 10 raw action values (the networks
            give values between -1 and 1, per the original comment).
        y: Value written to joint 1 unchanged.

    Returns:
        A new list of joint values based on ``get_neutral_joint_position()``.
    """
    scaled = np.array(action)  # np.array copies, so the caller's buffer is safe
    if not np.issubdtype(scaled.dtype, np.floating):
        # Integer input would otherwise truncate the scaled values.
        scaled = scaled.astype(float)
    for index, (scale, offset) in enumerate(_ACTION_SCALE_OFFSET):
        scaled[index] = scaled[index] * scale + offset

    # Map the scaled offsets onto the neutral pose.
    joints = get_neutral_joint_position()
    joints[1] = y
    joints[0] = joints[0] + scaled[0]
    for joint_index, delta in enumerate(scaled[1:], start=2):
        joints[joint_index] = joints[joint_index] + delta
    return joints




def main():
    """Parse the optional CLI arguments, connect a client and start training.

    Usage: ``script [name [port [host]]]``. Missing arguments fall back to
    ``'Product Client'``, ``DEFAULT_PORT`` and ``'localhost'``.

    Raises:
        ValueError: If the port argument cannot be converted to the type of
            ``DEFAULT_PORT`` (for example a non-numeric port when the
            default is an integer).
    """
    cli_args = sys.argv[1:]

    name = cli_args[0] if len(cli_args) > 0 else 'Product Client'

    # Command-line arguments are always strings. Coerce the port to the type
    # of DEFAULT_PORT so Client receives the same type whether or not the
    # port was given on the command line.
    if len(cli_args) > 1:
        port = type(DEFAULT_PORT)(cli_args[1])
    else:
        port = DEFAULT_PORT

    host = cli_args[2] if len(cli_args) > 2 else 'localhost'

    run(Client(name, host, port))



# Script entry point: start training only when the file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
