In [33]:
import random
from typing import List

class SampleEnvironment:
    """Toy environment that pays out random rewards for a limited number of steps.

    The observation is a fixed placeholder and the reward depends only on the
    chosen action and the random number generator; the environment has no
    real internal dynamics yet.
    """

    def __init__(self, max_steps: int = 20):
        """Create an environment whose episode lasts ``max_steps`` reward requests.

        Args:
            max_steps: How many times ``get_reward`` may be called before the
                episode is over. Defaults to 20.
        """
        self.steps_left = max_steps  # remaining reward requests in this episode

    def get_observation(self) -> List[float]:
        """Return the current observation (hardcoded placeholder of three zeros)."""
        return [0.0, 0.0, 0.0]

    def get_actions(self) -> int:
        """Return one action, picked uniformly at random from ``0`` and ``1``.

        Despite the plural name, a single action (not a list) is returned,
        ready to be passed straight to ``get_reward``.
        """
        return random.choice([0, 1])

    def is_done(self) -> bool:
        """Return True once no steps remain in the episode."""
        # `<=` rather than `==` so that a non-positive step budget also counts
        # as finished instead of letting a driver loop run forever.
        return self.steps_left <= 0

    def get_reward(self, action: int) -> float:
        """Consume one step and return the random reward for ``action``.

        Args:
            action: ``0`` yields a reward in ``[0, 1)``; any other value
                yields a reward in ``[0, 2)``.

        Returns:
            The reward drawn for this step.

        Raises:
            Exception: If the episode is already over.
        """
        if self.is_done():
            raise Exception('Game is Over')
        self.steps_left -= 1
        if action == 0:
            return random.random()
        else:
            return 2*random.random()
        

In [38]:
class Agent:
    """Minimal agent that interacts with an environment and tallies its rewards."""

    def __init__(self):
        # Running sum of every reward collected so far.
        self.total_reward = 0.0

    def step(self, env: SampleEnvironment):
        """Perform one interaction with ``env`` and log each stage to stdout.

        The agent reads the observation, asks the environment for an action,
        exchanges that action for a reward, and adds the reward to
        ``total_reward``.

        Args:
            env: The environment to interact with.
        """
        observation = env.get_observation()
        print('Observation:', observation)

        chosen_action = env.get_actions()
        print('Action:', chosen_action)

        # Requesting the reward is what advances the environment by one step.
        step_reward = env.get_reward(chosen_action)
        print('Reward:', step_reward)

        self.total_reward = self.total_reward + step_reward
        print('Accumulated Reward:', self.total_reward)

In [39]:
if __name__ == '__main__':
    # Run a single episode: keep stepping the agent until the environment
    # reports that it is done, then show the reward collected overall.
    env = SampleEnvironment()
    agent = Agent()

    step_number = 0
    while not env.is_done():
        step_number += 1
        print('.........')
        print('Step:', step_number)
        agent.step(env)

    print('----------------------')
    print('Total reward:', agent.total_reward)

.........
Step: 1
Observation: [0.0, 0.0, 0.0]
Action: 0
Reward: 0.05329683873870794
Accumulated Reward: 0.05329683873870794
.........
Step: 2
Observation: [0.0, 0.0, 0.0]
Action: 1
Reward: 0.03166725033558504
Accumulated Reward: 0.08496408907429298
.........
Step: 3
Observation: [0.0, 0.0, 0.0]
Action: 0
Reward: 0.556438996862566
Accumulated Reward: 0.641403085936859
.........
Step: 4
Observation: [0.0, 0.0, 0.0]
Action: 0
Reward: 0.3146600047697795
Accumulated Reward: 0.9560630907066385
.........
Step: 5
Observation: [0.0, 0.0, 0.0]
Action: 0
Reward: 0.6420326379648259
Accumulated Reward: 1.5980957286714643
.........
Step: 6
Observation: [0.0, 0.0, 0.0]
Action: 1
Reward: 1.4502170889496833
Accumulated Reward: 3.048312817621148
.........
Step: 7
Observation: [0.0, 0.0, 0.0]
Action: 0
Reward: 0.7297008695084356
Accumulated Reward: 3.7780136871295835
.........
Step: 8
Observation: [0.0, 0.0, 0.0]
Action: 0
Reward: 0.4019478114198054
Accumulated Reward: 4.179961498549389
.........
Step: 

In [36]:
# import random
# random.random()