In [1]:
#https://python.langchain.com.cn/docs/use_cases/agent_simulations/gymnasium


In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
from dotenv import load_dotenv  

import inspect
import tenacity

from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage,
    BaseMessage,
)
from langchain.output_parsers import RegexParser
load_dotenv('API_KEYS.env')  

llm=ChatOpenAI(temperature=0.2)

llm = ChatGroq(
    model="llama3-70b-8192",                  # Correct model name
    temperature=0.7,
    max_tokens=512
)

### The Agent

In [3]:
class GymnasiumAgent:
    @classmethod
    def get_docs(cls, env):
        return env.unwrapped.__doc__

    def __init__(self, model, env):
        self.model = model
        self.env = env
        self.docs = self.get_docs(env)

        self.instructions = """
Your goal is to maximize your return, i.e. the sum of the rewards you receive.
You are playing blackjack in the environment gymnasium for reinforcement learning
The game starts with the dealer having one face up and one face down card, while the player has two face up cards. All cards are drawn from an infinite deck (i.e. with replacement).

The card values are:

    Face cards (Jack, Queen, King) have a point value of 10.

    Aces can either count as 11 (called a ‘usable ace’) or 1.

    Numerical cards (2-10) have a value equal to their number.

The player has the sum of cards held. The player can request additional cards (hit) until they decide to stop (stick) or exceed 21 (bust, immediate loss).

After the player sticks, the dealer reveals their facedown card, and draws cards until their sum is 17 or greater. If the dealer goes bust, the player wins.

If neither the player nor the dealer busts, the outcome (win, lose, draw) is decided by whose sum is closer to 21.

I will give you an observation, reward, terminiation flag, truncation flag, and the return so far, formatted as:

Observation: <observation>
Reward: <reward>
Termination: <termination>
Truncation: <truncation>
Return: <sum_of_rewards>

You will respond with an action, formatted as:

Action: <action>

where you replace <action> with your actual action.
Do nothing else but return the action. don't explain your reasoning
Respond a number 0 is Stick. 1 is hit 
"""
        self.action_parser = RegexParser(
            regex=r"Action: (.*)", output_keys=["action"], default_output_key="action"
        )

        self.message_history = []
        self.ret = 0

    def random_action(self):
        action = self.env.action_space.sample()
        return action

    def reset(self):
        self.message_history = [
            SystemMessage(content=self.docs),
            SystemMessage(content=self.instructions),
        ]

    def observe(self, obs, rew=0, term=False, trunc=False, info=None):
        self.ret += rew

        obs_message = f"""
Observation: {obs}
Reward: {rew}
Termination: {term}
Truncation: {trunc}
Return: {self.ret}
        """
        self.message_history.append(HumanMessage(content=obs_message))
        return obs_message

    def _act(self):

        act_message = self.model.invoke(self.message_history)
        self.message_history.append(act_message)

#        action = int(self.action_parser.parse(act_message)["action"])
        action = int(self.action_parser.parse(act_message.content)["action"]) # para ChatGPT
        return action

    def act(self):
        try:
            for attempt in tenacity.Retrying(
                stop=tenacity.stop_after_attempt(2),
                wait=tenacity.wait_none(),  # No waiting time between retries
                retry=tenacity.retry_if_exception_type(ValueError),
                before_sleep=lambda retry_state: print(
                    f"ValueError occurred: {retry_state.outcome.exception()}, retrying..."
                ),
            ):
                with attempt:
                    action = self._act()
        except tenacity.RetryError as e:
            action = self.random_action()
        return action

In [4]:
env = gym.make("Blackjack-v1")
#agent = GymnasiumAgent(model=ChatOpenAI(temperature=0.2), env=env)
agent = GymnasiumAgent(model=llm, env=env)


In [5]:
won, lost, draw = 0,0,0

for i_episode in range(100):
    agent.reset()
    obs, _ = env.reset()
    obs_message = agent.observe(obs)
    print(obs_message)
    
    while True:
        action = agent.act()
        print(f"Action: {action}")
        
        observation, reward, termination, truncation, info = env.step(action)
        obs_message = agent.observe(observation, reward, termination, truncation, info)
        
        print(obs_message)

        if termination or truncation:
            print(f"End game!  episode: {i_episode}, Reward:{reward}")
            if reward > 0:
                print('You won :)\n')
                won= won+1 
            elif reward == 0:
                print ('Draw')
                draw = draw + 1
            else:
                print('You lost :(\n')
                lost = lost + 1
            break
            
        
print(i_episode, 'total won', won, 'total lost', lost, 'total draw', draw)


Observation: (18, 5, 1)
Reward: 0
Termination: False
Truncation: False
Return: 0
        
Action: 0

Observation: (18, 5, 1)
Reward: 1.0
Termination: True
Truncation: False
Return: 1.0
        
End game!  episode0, Reward:1.0
You won :)


Observation: (14, 8, 0)
Reward: 0
Termination: False
Truncation: False
Return: 1.0
        
Action: 1

Observation: (22, 8, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: 0.0
        
End game!  episode1, Reward:-1.0
You lost :(


Observation: (8, 1, 0)
Reward: 0
Termination: False
Truncation: False
Return: 0.0
        
Action: 1

Observation: (14, 1, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: 0.0
        
Action: 1

Observation: (15, 1, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: 0.0
        
Action: 1

Observation: (25, 1, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: -1.0
        
End game!  episode2, Reward:-1.0
You lost :(


Observation: (17, 10, 0)
Reward: 0
Termination: False
Truncati

Action: 1

Observation: (17, 1, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: -9.0
        
Action: 0

Observation: (17, 1, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: -10.0
        
End game!  episode28, Reward:-1.0
You lost :(


Observation: (20, 4, 0)
Reward: 0
Termination: False
Truncation: False
Return: -10.0
        
Action: 0

Observation: (20, 4, 0)
Reward: 1.0
Termination: True
Truncation: False
Return: -9.0
        
End game!  episode29, Reward:1.0
You won :)


Observation: (7, 9, 0)
Reward: 0
Termination: False
Truncation: False
Return: -9.0
        
Action: 1

Observation: (10, 9, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: -9.0
        
Action: 1

Observation: (12, 9, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: -9.0
        
Action: 1

Observation: (22, 9, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: -10.0
        
End game!  episode30, Reward:-1.0
You lost :(


Observation: (16, 6, 0)
Reward: 0
T

Action: 1

Observation: (17, 10, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: -2.0
        
Action: 0

Observation: (17, 10, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: -3.0
        
End game!  episode55, Reward:-1.0
You lost :(


Observation: (12, 10, 0)
Reward: 0
Termination: False
Truncation: False
Return: -3.0
        
Action: 1

Observation: (22, 10, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: -4.0
        
End game!  episode56, Reward:-1.0
You lost :(


Observation: (13, 7, 1)
Reward: 0
Termination: False
Truncation: False
Return: -4.0
        
Action: 1

Observation: (18, 7, 1)
Reward: 0.0
Termination: False
Truncation: False
Return: -4.0
        
Action: 0

Observation: (18, 7, 1)
Reward: 1.0
Termination: True
Truncation: False
Return: -3.0
        
End game!  episode57, Reward:1.0
You won :)


Observation: (21, 10, 1)
Reward: 0
Termination: False
Truncation: False
Return: -3.0
        
Action: 0

Observation: (21, 10, 1)
Reward: 1

Action: 1

Observation: (19, 8, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: -3.0
        
Action: 0

Observation: (19, 8, 0)
Reward: 1.0
Termination: True
Truncation: False
Return: -2.0
        
End game!  episode82, Reward:1.0
You won :)


Observation: (10, 4, 0)
Reward: 0
Termination: False
Truncation: False
Return: -2.0
        
Action: 1

Observation: (20, 4, 0)
Reward: 0.0
Termination: False
Truncation: False
Return: -2.0
        
Action: 0

Observation: (20, 4, 0)
Reward: -1.0
Termination: True
Truncation: False
Return: -3.0
        
End game!  episode83, Reward:-1.0
You lost :(


Observation: (18, 10, 0)
Reward: 0
Termination: False
Truncation: False
Return: -3.0
        
Action: 0

Observation: (18, 10, 0)
Reward: 1.0
Termination: True
Truncation: False
Return: -2.0
        
End game!  episode84, Reward:1.0
You won :)


Observation: (19, 3, 0)
Reward: 0
Termination: False
Truncation: False
Return: -2.0
        
Action: 0

Observation: (19, 3, 0)
Reward: 1.0
Term

In [7]:
print(f"Total episodes: {i_episode+1} Won: {(won/i_episode):.2%}, Lost: {(lost/i_episode):.2%}, Draw: {(draw/i_episode):.2%}")

Total episodes: 100 Won: 45.45%, Lost: 49.49%, Draw: 6.06%
