In [42]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits import mplot3d
import time
import random
import pandas as pd
import time
import torch 
import torch.nn as nn
import torch.optim as optim

import torch.nn.functional as F
from collections import deque
from mpl_toolkits import mplot3d

# Seed every random number generator this notebook draws from.
# NumPy drives the car-availability draw and the random exploration actions,
# `random` drives the epsilon test and replay sampling, and torch drives the
# network weight initialisation, so all three need a fixed seed for
# reproducible runs.
SEED = 10
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

def add_features(df):
    '''
    Replace the 'PRICES' datetime column with integer calendar features.

    Params:
    df = DataFrame with a datetime-typed 'PRICES' column

    Returns:
    A new DataFrame without 'PRICES' and with the columns 'month' (0-based),
    'day' (0-based day of month), 'year' and 'day_of_week' (Monday = 0)
    appended. The input frame is left untouched, so re-running the calling
    cell never sees half-processed data.
    '''
    timestamps = df['PRICES']
    calendar_features = {
        'month': timestamps.dt.month - 1,
        'day': timestamps.dt.day - 1,
        'year': timestamps.dt.year,
        'day_of_week': timestamps.dt.dayofweek.astype(int),
    }
    return df.drop(columns=["PRICES"]).assign(**calendar_features)

# Load the training and validation price tables; 'PRICES' is parsed as a date column
train = pd.read_excel("train.xlsx", parse_dates=['PRICES'])
val = pd.read_excel("validate.xlsx", parse_dates=['PRICES'])

# Swap the date column for the integer calendar features built by add_features
train = add_features(train)
val = add_features(val)

# Sanity check of the resulting column layout
print(train.head(3))  

/kaggle/input/smartgridtrain/train.xlsx
/kaggle/input/smartgridvalidate/validate.xlsx
   Hour 01  Hour 02  Hour 03  Hour 04  Hour 05  Hour 06  Hour 07  Hour 08  \
0    24.31    24.31    21.71     8.42     0.01     0.01     0.02     0.01   
1    16.01    11.00     9.01     7.50     9.00     7.45    16.50    28.01   
2    28.00    24.50    24.15    18.99    15.50    23.11    29.01    39.73   

   Hour 09  Hour 10  ...  Hour 19  Hour 20  Hour 21  Hour 22  Hour 23  \
0     0.01     6.31  ...    37.99    33.11    37.99    33.00    36.48   
1    29.96    39.60  ...    59.69    50.09    50.00    36.22    31.09   
2    43.81    49.09  ...    60.99    55.51    51.77    34.51    39.31   

   Hour 24  month  day  year  day_of_week  
0    30.65      0    0  2007            0  
1    29.84      0    1  2007            1  
2    38.05      0    2  2007            2  

[3 rows x 28 columns]


In [43]:
class SmartGridEnv(gym.Env):
    '''
    Battery trading environment driven by a table of hourly electricity prices.

    The first 24 columns of `price_data` are read positionally as the prices of
    hours 0..23 of one day (one row per day); the columns 'month', 'day' and
    'day_of_week' are read by name.

    The agent passes an action *index* in 0..n-1 to `step`, which maps it to a
    signed amount in -max_power..max_power (positive = buy/charge,
    negative = sell/discharge).

    Note that `step` returns the 4-tuple (state, reward, done, available) and
    `reset` returns only the state; the rest of this notebook relies on that.
    '''

    def __init__(self, price_data, battery_capacity=50, max_power=25, efficiency=0.9):
        '''
        Params:
        price_data = DataFrame of hourly prices plus calendar features
        battery_capacity = maximum battery level
        max_power = largest amount that can be bought or sold in one step
        efficiency = charge/discharge efficiency applied to every transaction
        '''
        super(SmartGridEnv, self).__init__()
        # Resets NumPy's global RNG every time an environment is created
        np.random.seed(10)

        self.price_data = price_data
        self.battery_capacity = battery_capacity
        self.max_power = max_power
        self.efficiency = efficiency
        # One action per integer amount in [-max_power, max_power]
        power_levels = int(max_power)
        self.action_space = gym.spaces.Discrete(2 * power_levels + 1, start=-power_levels)
        # At hour index `time_constraint` the mask only allows actions that
        # leave at least `battery_time_constraint` in the battery
        self.time_constraint = 6
        self.battery_time_constraint = 20
        self.initialize_params()

    def initialize_params(self):
        '''Put the environment back at hour 0 of the first row of the price table.'''
        self.current_step = 0
        self.current_month = 0
        self.current_day = 0
        self.current_hour = 0
        self.day_of_week = int(self.price_data.iloc[self.current_step]['day_of_week'])
        self.current_battery = 20
        self.available = True
        self.current_price = self.price_data.iloc[self.current_step, self.current_hour]
        self.past_prices = np.zeros(24)
        self.done = False
        self.profit = 0
        self.current_battery_norm = self.current_battery / self.battery_capacity
        self.current_hour_norm = self.current_hour / 23
        self.past_prices_max = 2500
        self.past_prices_norm = self.past_prices / self.past_prices_max
        self.current_state = (self.current_hour, self.current_battery, self.current_price)
        self.no_features = len(self.current_state)

    def car_available(self):
        '''
        Returns:
        False with probability 0.5 when the current hour index is 7, True otherwise
        '''
        if self.current_hour == 7:
            return np.random.uniform(0, 1) < 0.5
        return True

    def update_state(self):
        '''Advance the clock and refresh every price/calendar attribute and the state tuple.'''
        if not self.available:
            # The car is away: jump straight to hour index 17 and come back
            # with 20 less in the battery
            self.current_hour = 17
            self.current_battery -= 20
        else:
            self.current_hour = int((self.current_hour + 1) % 24)
            if self.current_hour == 0:
                self.current_step += 1

        today = self.price_data.iloc[self.current_step]
        self.current_month = int(today['month'])
        self.current_day = int(today['day'])
        self.day_of_week = int(today['day_of_week'])
        self.current_price = self.price_data.iloc[self.current_step, self.current_hour]

        # Rolling 24-hour window: the tail of yesterday followed by the hours
        # of today that have already passed. Both halves come from this
        # environment's own table (not from a notebook-level frame), so an
        # environment built on validation data sees validation prices.
        if self.current_step > 0:
            yesterday_tail = np.array(self.price_data.iloc[self.current_step - 1, self.current_hour:24])
            today_head = np.array(self.price_data.iloc[self.current_step, 0:self.current_hour])
            self.past_prices = np.concatenate([yesterday_tail, today_head])

        # Whenever (current_step + 1) is a multiple of 100, raise the
        # normalisation constant if the preceding 99 days had a larger price
        if (self.current_step + 1) % 100 == 0:
            local_max = np.max(self.price_data.iloc[self.current_step - 99:self.current_step, 0:24].values)
            if local_max > self.past_prices_max:
                self.past_prices_max = local_max

        self.current_state = (self.current_hour, self.current_battery, self.current_price)

        # The episode ends once the last row of the price table is reached
        if self.current_step >= len(self.price_data) - 1:
            self.done = True

    def to_discrete(self, action):
        '''
        Params:
        action = action index in 0..n-1

        Returns:
        the signed amount the index stands for, i.e. action_space.start + action
        '''
        # Direct offset instead of sampling the action space with a one-hot
        # mask: same value, no use of the action space's random generator
        return int(self.action_space.start + action)

    def reward(self, action, charge_cost):
        '''
        Returns:
        the negated cost of the transaction (`action` is currently unused)
        '''
        return -charge_cost

    def step(self, action):
        '''
        Params:
        action = action index in 0..n-1; ignored (treated as amount 0) when the
                 car turns out to be unavailable this hour

        Returns:
        (state, reward, done, available)
        '''
        self.available = self.car_available()

        # Convert the index to a signed amount, or do nothing if the car is away
        action = self.to_discrete(action) if self.available else 0

        # Buying is billed at twice the listed price, selling earns the listed price
        energy_rate = self.price_data.iloc[self.current_step, self.current_hour]
        charge_cost = action if action < 0 else action * 2
        charge_cost *= energy_rate / 1000

        # Efficiency loss: charging stores less than was bought, discharging
        # drains more than was sold
        if action > 0:
            actual_charge = np.round(self.efficiency * action).astype(int)
        else:
            actual_charge = np.round(action / self.efficiency).astype(int)
        self.current_battery += actual_charge

        self.profit += -charge_cost

        reward = self.reward(action, charge_cost)
        self.update_state()

        return self.current_state, reward, self.done, self.available

    def mask(self):
        '''
        Returns:
        boolean array with one entry per action index; True marks amounts that
        keep the battery within [0, battery_capacity] and, at the constrained
        hour, at or above `battery_time_constraint`
        '''
        # Largest purchase that still fits in the battery after efficiency loss
        headroom = np.floor((self.battery_capacity - self.current_battery) / self.efficiency)
        upper_bound = min(headroom, self.max_power)

        if self.current_hour == self.time_constraint:
            shortfall = self.battery_time_constraint - self.current_battery
            if shortfall <= 0:
                # Enough stored: may sell, but only down to the required level
                lower_bound = max(np.ceil(shortfall * self.efficiency), -self.max_power)
            else:
                # Too little stored: must buy at least the shortfall
                lower_bound = min(np.ceil(shortfall / self.efficiency), self.max_power)
        else:
            # May sell at most what is currently stored
            lower_bound = max(np.ceil(-self.current_battery * self.efficiency), -self.max_power)

        amounts = np.arange(self.action_space.start, self.action_space.start + self.action_space.n)
        return (amounts >= lower_bound) & (amounts <= upper_bound)

    def reset(self):
        '''
        Returns:
        the initial state tuple after re-initialising the environment
        '''
        self.initialize_params()
        return self.current_state

    def normalize(self):
        '''
        Returns:
        (battery / battery_capacity, hour / 23, past_prices / past_prices_max),
        also stored on the corresponding *_norm attributes
        '''
        self.current_battery_norm = self.current_battery / self.battery_capacity
        self.current_hour_norm = self.current_hour / 23
        self.past_prices_norm = self.past_prices / self.past_prices_max
        return self.current_battery_norm, self.current_hour_norm, self.past_prices_norm
        
# Training environment built on the training price table (default battery settings)
train_env=SmartGridEnv(train)

In [44]:
class DQN(nn.Module):
    '''
    Fully connected Q-network: three tanh hidden layers of width 64 followed by
    a linear layer with one output per action. The module owns its optimizer.
    '''

    def __init__(self, env, learning_rate):
        '''
        Params:
        env = environment that the agent needs to play; its current_state sets
              the input width and its action_space.n the output width
        learning_rate = learning rate used in the update
        '''
        super().__init__()
        hidden_width = 64
        n_inputs = len(env.current_state)
        n_actions = env.action_space.n

        self.dense1 = nn.Linear(n_inputs, hidden_width)
        self.dense2 = nn.Linear(hidden_width, hidden_width)
        self.dense3 = nn.Linear(hidden_width, hidden_width)
        self.dense4 = nn.Linear(hidden_width, n_actions)

        # Adam is used here; other optimizers such as RMSprop would also work
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        '''
        Params:
        x = observation

        Returns:
        one Q-value per action
        '''
        for hidden_layer in (self.dense1, self.dense2, self.dense3):
            x = torch.tanh(hidden_layer(x))
        return self.dense4(x)
    

In [45]:
class ExperienceReplay:
    '''
    Fixed-size buffer of (state, action, reward, done, new_state, new_state_mask)
    transitions, pre-filled with random valid transitions on construction,
    plus a rolling buffer of the last 100 episode rewards.
    '''

    def __init__(self, env, buffer_size, min_replay_size = 1000, device = None):
        '''
        Params:
        env = environment that the agent needs to play
        buffer_size = max number of transitions that the experience replay buffer can store
        min_replay_size = number of random transitions the buffer is filled with on construction
        device = device the sampled tensors are created on; when omitted, CUDA is
                 used if available and the CPU otherwise
        '''
        self.env = env
        self.min_replay_size = min_replay_size
        self.replay_buffer = deque(maxlen=buffer_size)
        # Starts with a single 0.0 so that averaging works before the first
        # episode ends; that placeholder stays in the average until 100 real
        # episode rewards have pushed it out
        self.reward_buffer = deque([0.0], maxlen = 100)
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        print('Please wait, the experience replay buffer will be filled with random transitions')

        state = self.env.reset()
        for _ in range(self.min_replay_size):
            # `action` is an index into the action space, drawn uniformly from
            # the indices the environment currently allows
            mask = self.env.mask()
            action = np.random.choice(np.arange(self.env.action_space.n)[mask])
            new_state, reward, done, available = self.env.step(action)

            # The mask of the successor state is stored with the transition so
            # that learning can ignore invalid next actions
            new_state_mask = self.env.mask()
            self.replay_buffer.append((state, action, reward, done, new_state, new_state_mask))

            state = self.env.reset() if done else new_state

        print('Initialization with random transitions is done!')

    def add_data(self, data):
        '''
        Params:
        data = transition tuple (state, action, reward, done, new_state, new_state_mask)
        '''
        self.replay_buffer.append(data)

    def sample(self, batch_size):
        '''
        Params:
        batch_size = number of transitions that will be sampled

        Returns:
        tensors of observations, actions, rewards, done flags, next observations
        and next-state masks; actions, rewards and done flags have a trailing
        dimension of size 1
        '''
        transitions = random.sample(self.replay_buffer, batch_size)

        def column(position, dtype):
            # Stack one field of every sampled transition into a tensor on self.device
            values = np.asarray([transition[position] for transition in transitions])
            return torch.as_tensor(values, dtype=dtype, device=self.device)

        observations_t = column(0, torch.float32)
        actions_t = column(1, torch.int64).unsqueeze(-1)
        rewards_t = column(2, torch.float32).unsqueeze(-1)
        dones_t = column(3, torch.float32).unsqueeze(-1)
        new_observations_t = column(4, torch.float32)
        new_masks_t = column(5, torch.bool)

        return observations_t, actions_t, rewards_t, dones_t, new_observations_t, new_masks_t

    def add_reward(self, reward):
        '''
        Params:
        reward = reward that the agent earned during an episode of a game
        '''
        self.reward_buffer.append(reward)
        

In [46]:
class vanilla_DQNAgent:
    '''
    DQN agent without a target network: a single online network provides both
    the predictions and the bootstrap targets. Action selection and targets
    respect the environment's action mask.
    '''

    def __init__(self, env, device, epsilon_decay,
                 epsilon_start, epsilon_end, discount_rate, lr, buffer_size, seed = 123,
                 min_replay_size = 1000):
        '''
        Params:
        env = environment that the agent needs to play
        device = set up to run CUDA operations
        epsilon_decay = Decay period until epsilon start -> epsilon end
        epsilon_start = starting value for the epsilon value
        epsilon_end = ending value for the epsilon value
        discount_rate = discount rate for future rewards
        lr = learning rate
        buffer_size = max number of transitions that the experience replay buffer can store
        seed = currently unused; kept so existing calls keep working
        min_replay_size = number of random transitions the replay buffer is pre-filled with
        '''
        self.env = env
        self.device = device
        self.epsilon_decay = epsilon_decay
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.discount_rate = discount_rate
        self.learning_rate = lr
        self.buffer_size = buffer_size

        self.replay_memory = ExperienceReplay(self.env, self.buffer_size, min_replay_size)
        # Sampled batches must live on the same device as the network; the
        # buffer otherwise picks its device on its own
        self.replay_memory.device = self.device
        self.online_network = DQN(self.env, self.learning_rate).to(self.device)

    def choose_action(self, step, observation, greedy = False):
        '''
        Params:
        step = the specific step number, used to interpolate epsilon
        observation = observation input
        greedy = if True, always take the best valid action (no exploration)

        Returns:
        action: index of the chosen action (either random or greedy), always
                one that the environment's current mask allows
        epsilon: the epsilon value that was used
        '''
        epsilon = np.interp(step, [0, self.epsilon_decay], [self.epsilon_start, self.epsilon_end])
        random_sample = random.random()
        mask = self.env.mask()

        if (random_sample <= epsilon) and not greedy:
            #Random action among the valid indices
            action = np.random.choice(np.arange(self.env.action_space.n)[mask])
        else:
            #Greedy action: push invalid entries to the lowest representable
            #value and take the argmax over the full vector, so the result is
            #a true action index whatever the shape of the mask
            obs_t = torch.as_tensor(observation, dtype = torch.float32, device=self.device)
            mask_t = torch.as_tensor(mask, dtype = torch.bool, device=self.device)
            with torch.no_grad():
                q_values = self.online_network(obs_t.unsqueeze(0)).squeeze(0)
            q_values = q_values.masked_fill(~mask_t, torch.finfo(q_values.dtype).min)
            action = int(torch.argmax(q_values).item())
        return action, epsilon

    def learn(self, batch_size):
        '''
        One gradient descent step on a random batch from the replay memory.

        Params:
        batch_size = number of transitions that will be sampled
        '''
        #Sample random transitions with size = batch size
        observations_t, actions_t, rewards_t, dones_t, new_observations_t, new_masks_t = self.replay_memory.sample(batch_size)

        #Bootstrap targets need no gradient. Invalid next actions are set to
        #the lowest representable value (not a fixed number that a real
        #Q-value could fall below) so they can never be the maximum
        with torch.no_grad():
            next_q_values = self.online_network(new_observations_t)
            next_q_values = next_q_values.masked_fill(~new_masks_t, torch.finfo(next_q_values.dtype).min)
            max_next_q_values = next_q_values.max(dim=1, keepdim=True)[0]
            targets = rewards_t + self.discount_rate * (1-dones_t) * max_next_q_values

        #Q-values of the actions that were actually taken
        q_values = self.online_network(observations_t)
        action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)

        #Loss:  Huber loss
        loss = F.smooth_l1_loss(action_q_values, targets)
        #Uncomment this line to use the standard MSE loss
        #loss = F.mse_loss(action_q_values, targets)

        #Gradient descent
        self.online_network.optimizer.zero_grad()
        loss.backward()
        self.online_network.optimizer.step()

    def predict(self):
        '''
        Returns:
        the greedy action index for the current state of self.env
        '''
        action = self.choose_action(self.env.current_step, self.env.current_state, True)[0]

        return action
    
        

In [47]:
#Set the hyperparameters

#Discount rate applied to future rewards
discount_rate = 0.9
#Number of transitions sampled from the replay buffer for each update
batch_size = 32
#Maximum number of transitions that we store in the buffer
buffer_size = 5000
#Minimum number of random transitions stored in the replay buffer
#NOTE: not forwarded by the constructor call below, so the replay buffer uses its own default
min_replay_size = 1000
#Starting value of epsilon
epsilon_start = 1.0
#End value (lowest value) of epsilon
epsilon_end = 0.1
#Decay period (in steps) until epsilon start -> epsilon end
epsilon_decay = 10000

#Passed to training_loop, where it bounds the number of environment steps
max_episodes = 250000

#Learning_rate
lr = 7e-4


#Use the GPU when one is available; building the agent also fills its replay buffer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vanilla_agent = vanilla_DQNAgent(train_env, device, epsilon_decay, epsilon_start, epsilon_end, discount_rate, lr, buffer_size)


Please wait, the experience replay buffer will be filled with random transitions
Initialization with random transitions is done!


In [48]:
def training_loop(env, agent, max_episodes, target_ = False, seed=42,
                  learn_batch_size=None, target_update_frequency=250):
    
    '''
    Params:
    env = the environment that the agent needs to play
    agent= which agent is used to train
    max_episodes = number of environment steps to run (one learning update per step)
    target_ = boolean variable indicating if a target network is used; the agent
              must then provide update_target_network()
    seed = seed for the action space's random number generator
    learn_batch_size = number of transitions sampled per update; when omitted the
                       notebook-level `batch_size` is used
    target_update_frequency = number of steps between target network updates
    
    Returns:
    average_reward_list = mean of the agent's episode reward buffer, recorded every 100 steps
    '''

    if learn_batch_size is None:
        #Fall back to the hyperparameter defined at notebook level
        learn_batch_size = batch_size

    env.action_space.seed(seed)
    obs = env.reset()
    average_reward_list = [0.0]
    episode_reward = 0.0
    
    for step in range(max_episodes):
        
        action, epsilon = agent.choose_action(step, obs)
       
        new_obs, reward, done, available = env.step(action)
        #The mask of the successor state is stored alongside the transition
        new_mask = env.mask()
        
        agent.replay_memory.add_data((obs, action, reward, done, new_obs, new_mask))
        obs = new_obs
    
        episode_reward += reward
    
        if done:
            obs = env.reset()
            agent.replay_memory.add_reward(episode_reward)
            #Reinitialize the reward to 0.0 after the episode is over
            episode_reward = 0.0

        #Learn
        agent.learn(learn_batch_size)

        #Every 100 steps record the current average of the episode reward buffer
        if (step+1) % 100 == 0:
            average_reward_list.append(np.mean(agent.replay_memory.reward_buffer))
        
        #Sync the target network of the agent being trained (not some other
        #notebook-level agent)
        if target_ and step % target_update_frequency == 0:
            agent.update_target_network()
    
        #Print some output
        if (step+1) % 10000 == 0:
            print(20*'--')
            print('Step', step)
            print('Epsilon', epsilon)
            print('Avg Rew', np.mean(agent.replay_memory.reward_buffer))
            print()
            
    return average_reward_list

# Train the vanilla agent on the training environment for max_episodes steps (no target network)
average_rewards_vanilla_dqn = training_loop(train_env, vanilla_agent, max_episodes)

----------------------------------------
Step 9999
Epsilon 0.1000899999999999
Avg Rew 0.0

----------------------------------------
Step 19999
Epsilon 0.1
Avg Rew 0.0

----------------------------------------
Step 29999
Epsilon 0.1
Avg Rew -1751.3271300000013

----------------------------------------
Step 39999
Epsilon 0.1
Avg Rew -1751.3271300000013

----------------------------------------
Step 49999
Epsilon 0.1
Avg Rew -1882.1347666666663

----------------------------------------
Step 59999
Epsilon 0.1
Avg Rew -1882.1347666666663

----------------------------------------
Step 69999
Epsilon 0.1
Avg Rew -1962.2977349999992

----------------------------------------
Step 79999
Epsilon 0.1
Avg Rew -1962.2977349999992

----------------------------------------
Step 89999
Epsilon 0.1
Avg Rew -2005.888204

----------------------------------------
Step 99999
Epsilon 0.1
Avg Rew -2005.888204

----------------------------------------
Step 109999
Epsilon 0.1
Avg Rew -2012.9473899999994

--------

In [49]:
# Evaluate the trained agent on a fresh environment built from the validation prices
eval_env = SmartGridEnv(val)

# Point the agent at the evaluation environment so that its action masks and
# states come from the validation data
vanilla_agent.env = eval_env
i=0
while not vanilla_agent.env.done:
    current_state = tuple(vanilla_agent.env.current_state)
    # predict() always takes the greedy action (no exploration)
    action = vanilla_agent.predict()
    # Log only the first 100 decisions to keep the output short
    if i<100:
        print(f"at time {vanilla_agent.env.current_hour +  1} agent transacts {vanilla_agent.env.to_discrete(action)} KWh, battery is {vanilla_agent.env.current_battery}")
    next_state, reward, done, available = vanilla_agent.env.step(action)
    i=i+1

# profit accumulates the negated transaction cost of every step
print("Profit on validation set: ", eval_env.profit)

at time 1 agent transacts -18 KWh, battery is 20
at time 2 agent transacts 8 KWh, battery is 0
at time 3 agent transacts 19 KWh, battery is 7
at time 4 agent transacts 8 KWh, battery is 24
at time 5 agent transacts 8 KWh, battery is 31
at time 6 agent transacts 8 KWh, battery is 38
at time 7 agent transacts 1 KWh, battery is 45
at time 8 agent transacts 1 KWh, battery is 46
at time 18 agent transacts 19 KWh, battery is 26
at time 19 agent transacts 1 KWh, battery is 43
at time 20 agent transacts 1 KWh, battery is 44
at time 21 agent transacts 1 KWh, battery is 45
at time 22 agent transacts 1 KWh, battery is 46
at time 23 agent transacts 1 KWh, battery is 47
at time 24 agent transacts 1 KWh, battery is 48
at time 1 agent transacts -18 KWh, battery is 49
at time 2 agent transacts -18 KWh, battery is 29
at time 3 agent transacts -1 KWh, battery is 9
at time 4 agent transacts -1 KWh, battery is 8
at time 5 agent transacts 19 KWh, battery is 7
at time 6 agent transacts 8 KWh, battery is 24


In [50]:
# class DDQNAgent:
    
#     def __init__(self, env, device, epsilon_decay, 
#                  epsilon_start, epsilon_end, discount_rate, lr, buffer_size, seed = 123):
#         '''
#         Params:
#         env = environment that the agent needs to play
#         device = set up to run CUDA operations
#         epsilon_decay = Decay period until epsilon start -> epsilon end
#         epsilon_start = starting value for the epsilon value
#         epsilon_end = ending value for the epsilon value
#         discount_rate = discount rate for future rewards
#         lr = learning rate
#         buffer_size = max number of transitions that the experience replay buffer can store
#         seed = seed for random number generator for reproducibility
#         '''
#         self.env = env
#         self.device = device
#         self.epsilon_decay = epsilon_decay
#         self.epsilon_start = epsilon_start
#         self.epsilon_end = epsilon_end
#         self.discount_rate = discount_rate
#         self.learning_rate = lr
#         self.buffer_size = buffer_size
        
#         self.replay_memory = ExperienceReplay(self.env, self.buffer_size)
#         self.online_network = DQN(self.env, self.learning_rate).to(self.device)
        
#         self.target_network = DQN(self.env, self.learning_rate).to(self.device)
#         self.target_network.load_state_dict(self.online_network.state_dict())
    
#     def choose_action(self, step, observation, greedy = False):
        
#         '''
#         Params:
#         step = the specific step number 
#         observation = observation input
#         greedy = boolean that
        
#         Returns:
#         action: action chosen (either random or greedy)
#         epsilon: the epsilon value that was used 
#         '''
        
#         epsilon = np.interp(step, [0, self.epsilon_decay], [self.epsilon_start, self.epsilon_end])
#         random_sample = random.random()
#         mask = self.env.mask()
        
#         if (random_sample <= epsilon) and not greedy:
#             #Random action
#             action = np.random.choice(np.arange(self.env.action_space.n)[mask])
#         else:
#             #Greedy action
#             obs_t = torch.as_tensor(observation, dtype = torch.float32, device=self.device)
#             q_values = self.online_network(obs_t.unsqueeze(0))
#             max_q_index = torch.as_tensor(torch.argmax(q_values.squeeze(0)[mask]).item() + np.where(mask)[0][0])
#             action = max_q_index.detach().item()  
#         return action, epsilon
    
#     def learn(self, batch_size):
        
#         '''
#         Here we do gradient descent      
#         Params:
#         batch_size = number of transitions that will be sampled
#         '''
        
        
#         #Sample random transitions with size = batch size
#         observations_t, actions_t, rewards_t, dones_t, new_observations_t, new_masks_t = self.replay_memory.sample(batch_size)
#         target_q_values = self.online_network(new_observations_t)
#         target_q_values[~new_masks_t] = -50
#         max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0] 
#         targets = rewards_t + self.discount_rate * (1-dones_t) * max_target_q_values
        

#         #Compute loss
#         q_values = self.online_network(observations_t)
        
#         action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)
#         #Loss:  Huber loss
#         loss = F.smooth_l1_loss(action_q_values, targets.detach())
#         #Uncomment this line to use the standard MSE loss
#         #loss = F.mse_loss(action_q_values, targets.detach())

#         #Solution:
#         #Gradient descent
#         self.online_network.optimizer.zero_grad()
#         loss.backward()
#         self.online_network.optimizer.step()
        
#     def predict(self):
#         '''
#         Params:
#         step = the number of the step within the epsilon decay that is used for the epsilon value of epsilon-greedy
#         seed = seed for random number generator for reproducibility
#         '''
        
#         action = self.choose_action(self.env.current_step, self.env.current_state, True)[0] 
        
#     def update_target_network(self):
#         self.target_network.load_state_dict(self.online_network.state_dict())
    
    

In [51]:
# dagent = DDQNAgent(train_env, device, epsilon_decay, epsilon_start, epsilon_end, discount_rate, lr, buffer_size)


In [52]:
# average_rewards_ddqn = training_loop(train_env, dagent, max_episodes, target_ = True) 

In [53]:
# eval_env = SmartGridEnv(val)

# dagent.env = eval_env
# i=0
# while not dagent.env.done:
#     current_state = tuple(dagent.env.current_state)
#     action = dagent.predict()
#     if i<100:
#         print(f"at time {dagent.env.current_hour +  1} agent transacts {dagent.env.to_discrete(action)} KWh, battery is {dagent.env.current_battery}")
#     next_state, reward, done, available = dagent.env.step(action)
#     i=i+1

# print("Profit on validation set: ", eval_env.profit)