# Assignment - SARSA Tabular

### Import libraries and relevant files

In [None]:
## IMPORT
import numpy as np
import matplotlib.pyplot as plt
import sys

# Board size handed to Chess_Env when the environment is created further down.
size_board = 4

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime, then make the project folder
# importable so the environment modules stored there can be loaded.
drive.mount('/content/drive')
chessboard_dir = '/content/drive/MyDrive/UZH/Introduction to Reinforcement Learning/Chessboard/'
sys.path.append(chessboard_dir)

Mounted at /content/drive


In [None]:
from degree_freedom_queen import *
from degree_freedom_king1 import *
from degree_freedom_king2 import *
from generate_game import *
from Chess_env import *

## The Environment

You can find the environment in the file Chess_env, which contains the class Chess_Env. To define an object, you need to provide the board size as input. In our example, size_board=4. 
Chess_Env consists of the following methods:

1. Initialise_game. The method initialises an episode by placing the three pieces considered (Agent's king and queen, enemy's king) in the chess board. The outputs of the method are described below in order.

     self.Board: A matrix representing the board locations filled with 4 numbers: 0, no piece in that position; 1, location of the 
     agent's king; 2 location of the queen; 3 location of the enemy king.
     
     X: The features, that is, the input to the neural network. See the assignment for more information regarding the definition of the features adopted. To personalise this, go into the Features method of the class Chess_Env() and change it accordingly.
     
     allowed_a: The allowed actions that the agent can make. The agent is moving a king, with a total number of 8 possible actions, and a queen, with a total number of $(board_{size}-1)\times 8$ actions. The total number of possible actions corresponds to the sum of the two, but not all actions are allowed in a given position (movements to locations outside the borders or against chess rules). Thus, the variable allowed_a is a vector that is one (zero) for an action that the agent can (can't) make. Be careful to apply the policy only to the actions that are allowed.
     

2. OneStep. The method performs a one step update of the system. Given as input the action selected by the agent, it updates the chess board by performing that action and the response of the enemy king (which is a random allowed action in the settings considered). The first three outputs are the same as for the Initialise_game method, but the variables are computed for the position reached after the update of the system. The fourth and fifth outputs are:

     R: The reward. To change this, look at the OneStep method of the class where the rewards are set.
     
     Done: A variable that is 1 if the episode has ended (checkmate or draw).
     
     
3. Features. Given the chessboard position, the method computes the features.

This information and a quick analysis of the class should be all you need to get going. The other functions that the class relies on are uncommented and constitute an example of how not to write Python code. You can take a look at them if you want, but it is not necessary.






In [None]:
## INITIALISE THE ENVIRONMENT
# Build the chess environment for the configured board size.
env = Chess_Env(size_board)

# Start a first game. Per the description above, the three outputs are the
# board matrix (S), the feature vector (X) and the allowed-action mask
# (allowed_a). The training loop calls Initialise_game() again per episode.
S, X, allowed_a = env.Initialise_game()                       # INITIALISE GAME

### Epsilon-Greedy Policy

In [None]:
## EPSILON-GREEDY POLICY

def EpsilonGreedy_Policy(q_val, epsilon):
    """Pick an index into ``q_val`` with an epsilon-greedy rule.

    Args:
        q_val: 1-D sequence (list or array) of Q-values to choose among.
        epsilon: Exploration threshold compared against a uniform draw
            from [0, 1).

    Returns:
        The index of the largest Q-value when exploiting, otherwise an
        index drawn uniformly from ``0 .. len(q_val) - 1``.
    """
    n_choices = np.shape(q_val)[0]

    # The uniform draw always happens first; randint is only consumed on
    # the exploration branch, so the global RNG stream order is fixed.
    explore = np.random.uniform(0, 1) < epsilon

    if not explore:
        return np.argmax(q_val)             # exploit: greedy choice

    return np.random.randint(0, n_choices)  # explore: uniform random choice

### Other Functions

In [None]:
## OTHER FUNCTIONS

def checkAllow(allowed_a, q_val):
    """Restrict the Q-values to the actions flagged as allowed.

    Args:
        allowed_a: Sequence of rows whose first element is 1 when the
            action with that row index is allowed.
        q_val: Q-values indexed by the same action index.

    Returns:
        A tuple ``(q_val_dict, q_val_lst, allowed_lst)``:
        ``q_val_dict`` maps each allowed action index to its Q-value,
        ``q_val_lst`` holds those Q-values in ascending index order, and
        ``allowed_lst`` holds the matching action indices.
    """
    q_val_dict = {
        action: q_val[action]
        for action, flag in enumerate(allowed_a)
        if flag[0] == 1
    }
    # Dicts keep insertion order, so values and keys line up position-wise.
    return q_val_dict, list(q_val_dict.values()), list(q_val_dict)

def checkPosition(S):
    """Locate the three pieces on the flattened board.

    Args:
        S: Board array containing the codes 1, 2 and 3 (per the notebook
            text: agent's king, queen and enemy king).

    Returns:
        A tuple with the 0-based flat index of the first cell equal to 1,
        2 and 3, in that order.

    Raises:
        ValueError: If any of the three codes is absent from the board.
    """
    cells = S.ravel().tolist()
    return tuple(cells.index(piece) for piece in (1, 2, 3))

### Parameters

In [None]:
# Hyperparameters (chosen for a 4x4 board)

epsilon_0 = 0.8       # initial exploration rate of the epsilon-greedy policy
beta = 0.0005         # decay speed of epsilon: epsilon_0 / (1 + beta * episode)
gamma = 0.01          # discount factor applied to the next state's Q-value
eta = 0.1             # learning rate of the Q-table update

N_episodes = 120000   # number of games to play
N_s = 16              # number of cells of the flattened board (per piece)
N_a = 32              # number of action slots in the Q-table
# NOTE(review): N_s and N_a are hard-coded; presumably N_s = size_board**2 and
# N_a = 8 + 8*(size_board - 1) -- confirm before changing the board size.

# Per-episode logs, one row per game
R_save = np.zeros((N_episodes, 1))          # total reward collected in the episode
N_moves_save = np.zeros((N_episodes, 1))    # number of moves played in the episode

# Q-table indexed as [action, pos_piece_1, pos_piece_2, pos_piece_3], filled
# with small random values.
# NOTE(review): this draw happens before the training cell calls
# np.random.seed, so the initial table is not covered by that seed.
q_table = np.random.randn(N_a, N_s, N_s, N_s) / 1000

### Main Function

In [None]:
## TABULAR SARSA TRAINING LOOP
#
# Each episode plays one game. The Q-table is indexed as
# [action, pos_1, pos_2, pos_3], where the three positions are the flat board
# indices returned by checkPosition. Actions are picked epsilon-greedily among
# the allowed ones only: EpsilonGreedy_Policy returns an index into the
# allowed list, and allowed_lst[...] maps it back to the action index.
#
# Fix over the previous version: the current-state indices (pos_1, pos_2,
# pos_3) are now advanced after every move. Before, they kept the values of
# the episode's initial position, so every update after the first move wrote
# to the wrong Q-table entry. The allowed-action bookkeeping is also carried
# over from the values already computed for the next state, instead of being
# recomputed with the stale Q-values of the initial state.

np.random.seed(919)                             # seed NumPy's global RNG

for n in range(N_episodes):

    epsilon_f = epsilon_0 / (1 + beta * n)      # exploration rate decays with the episode index
    Done = 0
    total_R = 0
    i = 1                                       # move counter; the first move is counted up front

    S, X, allowed_a = env.Initialise_game()     # start a new game
    pos_1, pos_2, pos_3 = checkPosition(S)      # flat positions of the three pieces
    q_val = np.copy(q_table[:, pos_1, pos_2, pos_3])
    q_val_dict, q_val_lst, allowed_lst = checkAllow(allowed_a, q_val)
    a = EpsilonGreedy_Policy(q_val_lst, epsilon_f)      # index into allowed_lst
    if n % 5000 == 0:
        print(n)                                # progress indicator

    while Done == 0:

        action = allowed_lst[a]                 # action index in the full action space
        S_next, X_next, allowed_a_next, R, Done = env.OneStep(action)
        pos_1_next, pos_2_next, pos_3_next = checkPosition(S_next)

        total_R += R

        if Done == 1:
            # Terminal transition: there is no next action, the target is R alone.
            delta = R - q_table[action, pos_1, pos_2, pos_3]
            q_table[action, pos_1, pos_2, pos_3] += eta * delta
            break

        # Choose the next action first (on-policy), then bootstrap from its Q-value.
        q_val_next = np.copy(q_table[:, pos_1_next, pos_2_next, pos_3_next])
        q_val_dict_next, q_val_lst_next, allowed_lst_next = checkAllow(allowed_a_next, q_val_next)
        a_next = EpsilonGreedy_Policy(q_val_lst_next, epsilon_f)
        action_next = allowed_lst_next[a_next]

        delta = (R
                 + gamma * q_table[action_next, pos_1_next, pos_2_next, pos_3_next]
                 - q_table[action, pos_1, pos_2, pos_3])
        q_table[action, pos_1, pos_2, pos_3] += eta * delta

        # The next state and everything derived from it become the current state.
        S = np.copy(S_next)
        X = np.copy(X_next)
        allowed_a = np.copy(allowed_a_next)
        pos_1, pos_2, pos_3 = pos_1_next, pos_2_next, pos_3_next
        q_val = q_val_next
        q_val_dict, q_val_lst, allowed_lst = q_val_dict_next, q_val_lst_next, allowed_lst_next
        a = a_next
        i += 1                                  # one more move played

    R_save[n] = total_R
    N_moves_save[n] = i

### Saving R_save and N_moves_save as csv

In [None]:
from numpy import savetxt

# Write the per-episode reward and move-count logs to CSV files on the
# mounted Drive, one comma-delimited file per array.
for csv_path, episode_log in (
    ('/content/drive/MyDrive/UZH/Introduction to Reinforcement Learning/Chessboard/csv_files/table_R_v10.csv', R_save),
    ('/content/drive/MyDrive/UZH/Introduction to Reinforcement Learning/Chessboard/csv_files/table_N_v10.csv', N_moves_save),
):
    savetxt(csv_path, episode_log, delimiter=',')