# In [None]:
import gym
import numpy as np
import time
import matplotlib.pyplot as plt
import time
import random
from envs.test_env import Electric_Car

class TabularQLearning():
    '''
    Tabular Q-learning agent for the Electric_Car environment.

    The first two entries of an observation (the bin arrays are named
    "battery" and "price") are discretized into equally spaced bins and
    used as indices into a Q-table of shape
    (bin_size - 1, bin_size - 1, number_of_actions).
    '''

    def __init__(self, data_path, discount_rate = 0.95, bin_size = 100):
        '''
        Params:
        data_path = path handed to Electric_Car as path_to_test_data
        discount_rate = discount rate used for future rewards
        bin_size = number of bin edges used for discretizing each state
                   feature (this yields bin_size - 1 bins per feature)
        '''
        self.discount_rate = discount_rate
        self.bin_size = bin_size
        self.env = Electric_Car(path_to_test_data=data_path)
        # NOTE(review): this value is used as the size of the last Q-table
        # axis in create_Q_table, so it presumably is an integer number of
        # discrete actions -- confirm against the environment.
        self.action_space = self.env.continuous_action_space

        self.low = self.env.observation_space.low
        self.high = self.env.observation_space.high

        # Equally spaced bin edges between the observation bounds.
        self.bins_battery = np.linspace(self.low[0], self.high[0], self.bin_size)
        self.bins_price = np.linspace(self.low[1], self.high[1], self.bin_size)

        self.bins = [self.bins_battery, self.bins_price]

    def discretize_state(self, state, cont_features = (0, 1)):
        '''
        Params:
        state = state observation that needs to be discretized
        cont_features = indices of the observation entries to discretize;
                        entry k is digitized with self.bins[k]

        Returns:
        list with one bin index per entry of cont_features, each clipped to
        the valid Q-table range [0, number_of_edges - 2]
        '''
        # Kept because the original implementation exposed the last raw
        # observation on the instance.
        self.state = state

        digitized_state = []
        for feature in cont_features:
            edges = self.bins[feature]
            raw_index = np.digitize(state[feature], edges) - 1
            # np.digitize yields -1 below the first edge and len(edges) - 1 at
            # or above the last edge; both fall outside the Q-table, so clamp.
            last_bin = max(len(edges) - 2, 0)
            digitized_state.append(int(np.clip(raw_index, 0, last_bin)))

        return digitized_state

    def create_Q_table(self):
        '''
        Allocates self.Qtable filled with zeros, with shape
        (bin_size - 1, bin_size - 1, self.action_space).

        Returns:
        the freshly created Q-table
        '''
        # bin_size edges delimit bin_size - 1 bins.
        self.state_space = self.bin_size - 1
        self.Qtable = np.zeros((self.state_space, self.state_space, self.action_space))
        return self.Qtable

    def train(self, simulations, learning_rate, epsilon = 0.05, epsilon_decay = 1000, adaptive_epsilon = False,
              adapting_learning_rate = False):
        '''
        Runs epsilon-greedy Q-learning and fills self.Qtable.

        Params:
        simulations = number of episodes of a game to run
        learning_rate = learning rate for the update equation (ignored when
                        adapting_learning_rate is True, which starts from 1)
        epsilon = epsilon value for epsilon-greedy algorithm
        epsilon_decay = number of full episodes (games) over which the epsilon value will decay to its final value
        adaptive_epsilon = boolean that indicates if the epsilon rate will decay over time or not
                           (linearly from 1 to 0.05 over epsilon_decay episodes)
        adapting_learning_rate = boolean that indicates if the learning rate should be adaptive or not
                                 (episode i uses 1 / sqrt(i + 1))

        Side effects:
        self.average_rewards receives the mean episode reward of every
        completed window of 100 episodes; self.rewards holds the episode
        rewards of the current, not yet completed window.
        '''
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.epsilon_start = 1
        self.epsilon_end = 0.05

        self.rewards = []
        self.average_rewards = []
        self.create_Q_table()

        if adapting_learning_rate:
            self.learning_rate = 1
        # Fixed reference for the decay schedule, so the rate does not
        # compound from episode to episode.
        base_learning_rate = self.learning_rate

        for i in range(simulations):
            if adaptive_epsilon:
                self.epsilon = np.interp(i, [0, self.epsilon_decay], [self.epsilon_start, self.epsilon_end])

            if adapting_learning_rate:
                self.learning_rate = base_learning_rate / np.sqrt(i + 1)

            state = self.discretize_state(self.env.reset()[0])
            total_rewards = 0
            done = False

            while not done:
                if np.random.uniform(0,1) > 1-self.epsilon:
                    # NOTE(review): the sampled action is used as a Q-table
                    # index below, so it presumably is a discrete action
                    # index -- confirm against the environment.
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(self.Qtable[state[0], state[1], :])

                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated

                next_state = self.discretize_state(next_state)

                # A terminal state has no future return; only bootstrap from
                # the next state when the episode continues or was truncated.
                if terminated:
                    best_next_value = 0.0
                else:
                    best_next_value = np.max(self.Qtable[next_state[0], next_state[1]])

                Q_target = reward + self.discount_rate * best_next_value
                current_value = self.Qtable[state[0], state[1], action]
                self.Qtable[state[0], state[1], action] = (
                    current_value + self.learning_rate * (Q_target - current_value)
                )

                total_rewards += reward
                state = next_state

            self.rewards.append(total_rewards)

            # Average the score over each completed window of 100 episodes.
            if (i + 1) % 100 == 0:
                self.average_rewards.append(np.mean(self.rewards))

                # Start a new window, otherwise the average would reflect all rewards.
                self.rewards = []

        print('The simulation is done!')

    def visualize_rewards(self):
        '''Placeholder: not implemented yet.'''
        pass

    def play_game(self):
        '''Placeholder: not implemented yet.'''
        pass
