Starter.py content

In [20]:
import sys

import pylab as plb
import numpy as np
import mountaincar

In [21]:
# FROM Project 1!

# To calculate the gaussian
def gauss(x,p):
    """Evaluate an unnormalised Gaussian bump at x.

    p[0] is the mean and p[1] the standard deviation. The curve is scaled
    so that its peak value, reached at x == p[0], equals 1.
    """
    mean = p[0]
    std = p[1]
    offset = x - mean
    return np.exp(-(offset**2) / (2 * std**2))

# To calculate the distance between two vectors
def distance(vec1,vec2):
    """Row-wise Euclidean distance between two equally shaped arrays.

    Both arguments must have the same shape; the squared differences are
    summed along axis 1, so the result holds one distance per row.
    """
    assert np.shape(vec1)==np.shape(vec2)
    return np.sqrt(np.sum(np.power(vec1 - vec2, 2), axis=1))

# To assign data to the closest center
def assign_cluster(centers, datas):
    """Label every row of `datas` with the index of its closest center.

    Parameters
    ----------
    centers -- sequence of center vectors, one per cluster
    datas   -- 2-D array with one sample per row

    Returns
    -------
    1-D float array with one entry per row of `datas`, holding the index of
    the nearest center. When two centers are equally close, the one with
    the larger index wins because the comparison below uses `<=`. A row
    keeps the value -1 if no center could be assigned to it (for example
    when `centers` is empty).
    """
    n_points = np.shape(datas)[0]
    cluster_assignment = np.full(n_points, -1.0)
    # Start from +inf so the first center is handled by the same loop as the
    # others; no separate seeding pass over centers[0] is needed.
    min_distance = np.full(n_points, np.inf)

    for i, center in enumerate(centers):
        # Read-only broadcast view instead of an np.tile copy; distance()
        # requires both operands to have the same shape.
        current_c = np.broadcast_to(center, np.shape(datas))
        eucl_distance = distance(current_c, datas)
        is_closer = eucl_distance <= min_distance
        # np.where selects values directly, avoiding the inf * 0 -> NaN
        # pitfall of arithmetic masking.
        min_distance = np.where(is_closer, eucl_distance, min_distance)
        cluster_assignment = np.where(is_closer, float(i), cluster_assignment)
    return cluster_assignment

In [129]:
class DummyAgent():
    """A not so good agent for the mountain-car task.

    It drives the car with random forces; `learn` is still a placeholder.
    """

    def __init__(self, mountain_car = None, parameter1 = 3.0):
        """Store the environment and the agent parameter.

        Parameters
        ----------
        mountain_car -- environment to act on; a new mountaincar.MountainCar
                        is created when None is given
        parameter1   -- stored on the instance; not used by the methods
                        defined in this class
        """
        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        self.parameter1 = parameter1

    def visualize_trial(self, n_steps = 200):
        """Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        """

        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for _ in range(n_steps):
            # '\r' plus end='' rewrites the same console line on every step.
            # The flush must be a keyword of print (not a positional
            # argument), otherwise its None result is printed as well.
            print('\rt =', self.mountain_car.t, end='', flush=True)

            # choose a random action: randint(3) - 1 is one of -1, 0, +1
            self.mountain_car.apply_force(np.random.randint(3) - 1)
            # simulate 100 sub-steps of 0.01 each, i.e. one time unit per
            # loop iteration
            self.mountain_car.simulate_timesteps(100, 0.01)

            # update the visualization
            mv.update_figure()
            plb.draw()

            # check for rewards
            if self.mountain_car.R > 0.0:
                print ("\rreward obtained at t = ", self.mountain_car.t)
                break

    def learn(self):
        """Placeholder for the learning algorithm (currently does nothing)."""
        # This is your job!

        # Implement Sarsa Lambda: Reinforcement Learning Algo
        # TO DO: http://www.cse.unsw.edu.au/~cs9417ml/RL1/algorithms.html
        #
        # Initialize Q(s,a) arbitrarily
        # Repeat (for each episode)
        #   Initialize s
        #   Choose a from s using policy derived from Q (e.g. epsilon-greedy)
        #   Repeat (for each step of episode)
        #     Take action a, observe r, s'
        #     Choose a' from s' using policy derived from Q (e.g. epsilon-greedy)
        #     Q(s,a) <-- Q(s,a) + Alpha(r + Gamma*Q(s',a') - Q(s,a));
        #     s <-- s';      a <-- a';
        #   until s is terminal
        pass

# My work starts here

In [137]:
# INITIALIZATION

# Environment instance; the learning loop reads its x and x_d attributes.
mountain_car = mountaincar.MountainCar()

# Arguments for the simulate_timesteps call
n_steps = 100  # number of timesteps
dt = 0.01  # time of each step

# Number of times the trial is repeated
n_trials = 10

# The neuron grid is nNeurons x nNeurons
nNeurons = 20

# Number of possible actions at each step: left, right or nothing
Actions = 3

# Reward discount factor (gamma)
Gamma = 0.95

# Eligibility decay rate (lambda), between 0 and 1
Lambda = 0.5

# Learning rate (eta), should be << 1
Eta = 0.1

# Parameter tau
Tau = 1

# Ranges covered by the grid
x_min, x_max = -150, 30  # position
xPoint_min, xPoint_max = -15, 15  # velocity

# Evenly spaced neuron centers along each axis
pos = np.linspace(x_min, x_max, nNeurons)
vel = np.linspace(xPoint_min, xPoint_max, nNeurons)
pos_c = pos.copy()
vel_c = vel.copy()

# Gaussian widths along the position and velocity axes.
# NOTE(review): these divide the range by nNeurons, whereas the linspace
# spacing above is range / (nNeurons - 1) -- confirm which one is intended.
sigma_x = (abs(x_max) + abs(x_min)) / nNeurons
sigma_xPoint = (abs(xPoint_max) + abs(xPoint_min)) / nNeurons

# Neuron activity matrix, one entry per grid cell
r = np.zeros((nNeurons, nNeurons))

# Weights, eligibility traces and Q(s,a), all zero-initialised with
# shape nNeurons x nNeurons x Actions (20x20x3)
w = np.zeros((nNeurons, nNeurons, Actions))
e = np.zeros_like(w)
Q = np.zeros_like(w)

In [141]:
# Training loop (work in progress): for every trial, simulate the car and,
# AT EACH TIME STEP, evaluate the neuron activities, the per-action Q values
# and the softmax action probabilities. The SARSA(lambda) weight update
# itself is still to be written (see the TODO notes below).

for i in range(0, n_trials):
    # Start every trial from a reset environment and a cleared exit flag;
    # otherwise only the first trial would ever enter the while loop.
    mountain_car.reset()
    cond = 0

    # Per-trial placeholders for s and a, plus fresh eligibility traces.
    state = np.zeros((nNeurons, nNeurons))  # Dim of STATE: Pos X Vel
    actions = np.zeros((1, Actions))  # Actions = 3 --> "left", "right", "none"
    e = np.zeros_like(w)  # same Pos X Vel X Actions layout as the weights

    # For each TIME STEP of one trial
    # NOTE(review): no force is applied to the car inside this loop yet, so
    # it only terminates if the car reaches x > 0 on its own.
    while cond <= 0:
        mountain_car.simulate_timesteps(n_steps, dt)

        # Activity r_j(s): Gaussian bump around each (position, velocity)
        # center, computed for the whole grid at once via broadcasting.
        r = np.exp(
            -((pos_c[:, np.newaxis] - mountain_car.x) / sigma_x) ** 2
            - ((vel_c[np.newaxis, :] - mountain_car.x_d) / sigma_xPoint) ** 2
        )

        # Per-neuron contribution w_aj * r_j, then Q(s,a) as the sum over the
        # grid for EACH action (one value per action).
        Q = w * r[:, :, np.newaxis]
        Q_a = Q.sum(axis=(0, 1))

        # Softmax over actions: P(a) = exp(Q_a/Tau) / sum_a' exp(Q_a'/Tau).
        # Subtracting the maximum leaves P unchanged but avoids overflow.
        P = np.exp((Q_a - Q_a.max()) / Tau)
        P = P / P.sum()

        # Most probable action (first one in case of ties).
        index = int(np.argmax(P))

        # TODO: redo P(a*=a) for s' instead of s to calculate Q(s',a')
        #   - define the new state resulting from the action we take
        #   - recalculate r_j(s') with "_simulate_single_timestep(self, dt)"
        #     --> get x and x_d to recalculate Q(s',a'), keeping the same
        #         weights
        #   - create a check_reward function (checks whether x > 0)
        #     --> returns r=0 or r=1

        # TODO: compute Delta_t
        #   Delta_t = r_(t+1) - [Q(s,a) - Gamma*Q(s',a')]

        # TODO: compute the eligibility traces

        # TODO: update the weights
        #   NewWeights = OldWeights + LearningRate*Delta_t*Eligibility

        # WHILE EXIT CONDITION
        if mountain_car.x > 0:
            cond = 1


########################################################################################################

SARSA algorithm (below), taken from: https://github.com/studywolf/blog/blob/master/RL/SARSA%20vs%20Qlearn%20cliff/sarsa.py

In [5]:
import random


class Sarsa:
    """Tabular SARSA learner with an epsilon-greedy policy.

    Q values live in a dict keyed by (state, action); pairs that were never
    updated are read as 0.0.
    """

    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
        """Remember the action list and the hyper-parameters.

        actions -- sequence of available actions
        epsilon -- probability of picking a random action
        alpha   -- learning rate
        gamma   -- discount factor
        """
        self.actions = actions
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.q = {}

    def getQ(self, state, action):
        """Return the stored Q value for (state, action), or 0.0 if unseen."""
        key = (state, action)
        return self.q.get(key, 0.0)

    def learnQ(self, state, action, reward, value):
        """Move Q(state, action) towards `value`.

        The first visit of a pair stores `reward` directly; later visits
        shift the stored value by alpha * (value - stored value).
        """
        key = (state, action)
        previous = self.q.get(key)
        if previous is not None:
            self.q[key] = previous + self.alpha * (value - previous)
        else:
            self.q[key] = reward

    def chooseAction(self, state):
        """Pick an action for `state` using the epsilon-greedy rule.

        With probability epsilon a random action is returned; otherwise the
        action with the highest Q value, with ties broken at random.
        """
        if random.random() < self.epsilon:
            return random.choice(self.actions)

        values = [self.getQ(state, candidate) for candidate in self.actions]
        top = max(values)
        if values.count(top) > 1:
            # Several actions share the maximum: choose among them at random.
            tied = [k for k, v in enumerate(values) if v == top]
            chosen = random.choice(tied)
        else:
            chosen = values.index(top)
        return self.actions[chosen]

    def learn(self, state1, action1, reward, state2, action2):
        """Apply the SARSA update for (state1, action1, reward, state2, action2)."""
        target = reward + self.gamma * self.getQ(state2, action2)
        self.learnQ(state1, action1, reward, target)