#### Python Code for Local DeepSeek Chat versions via LM Studio

##### Source code: https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py

##### Import libraries

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import requests
import time
import pygame
import re
import json

##### LLM-Based Agent  DeepSeek Chat (Prompt Engineering)

In [24]:
class LMStudio_Agent:
    def __init__(self, env_name):
        self.env_name = env_name
        self.api_url = "http://localhost:1234/v1/completions"  # LM Studio API Endpoint

    def select_action(self, state):
        # Shorter prompt
        prompt = f"""
        Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {state[0]:.3f}
        Cart Velocity: {state[1]:.3f}
        Pole Angle: {state[2]:.3f}
        Pole Angular Velocity: {state[3]:.3f}

        Select the best action:
        (0) Move LEFT
        (1) Move RIGHT

        ONLY reply with `0` or `1` on a new line.
        DO NOT leave the response blank.
        DO NOT include explanations.
        """

        print("Sending Prompt:", prompt)  # Debugging

        headers = {"Content-Type": "application/json"}
        data = {
            "model": "deepseek-llm-7B-chat-GGUF",  # Ensure it's an instruction-tuned model
            "prompt": prompt,
            "max_tokens": 5,  # Ensures a short response
            "temperature": 0.7,  # Make the model more willing to respond
            "top_p": 0.9,  # Increase sampling variety
            "stop": ["\n"]  # Prevents unnecessary text generation
        }


        try:
            response = requests.post(self.api_url, headers=headers, json=data)

            # Check if API call was successful
            if response.status_code != 200:
                print(f"API Error: {response.status_code} - {response.text}")
                return 0  # Default action on failure
            
            response_json = response.json()
            print("API Response:", response_json)  # Debug response

            # Extract action using regex
            match = re.search(r'\b[01]\b', response_json.get("choices", [{}])[0].get("text", ""))
            action = int(match.group(0)) if match else 0

            print(f"Selected Action: {action}")  # Debug action
            return action

        except Exception as e:
            print(f"Error communicating with LM Studio: {e}")
            return 0  # Default safe action


##### Run Experiment

In [25]:
def run_experiment():
    env = gym.make("CartPole-v1")
    llm_agent = LMStudio_Agent("CartPole-v1")

    episodes = 10
    llm_scores = []

    print("\nRunning LLM Agent...")
    for episode in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = llm_agent.select_action(state)
            print(f"Action: {action}")
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            total_reward += reward
            time.sleep(0.1)
        llm_scores.append(total_reward)
        print(f"LLM Episode {episode + 1}: Score = {total_reward}")

    env.close()


In [26]:
if __name__ == "__main__":
    run_experiment()


Running LLM Agent...
Sending Prompt: 
        Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: -0.043
        Cart Velocity: -0.008
        Pole Angle: -0.037
        Pole Angular Velocity: 0.042

        Select the best action:
        (0) Move LEFT
        (1) Move RIGHT

        ONLY reply with `0` or `1` on a new line.
        DO NOT leave the response blank.
        DO NOT include explanations.
        
API Response: {'id': 'cmpl-2v6afhgicjnkme0o2xj0oq', 'object': 'text_completion', 'created': 1741178158, 'model': 'deepseek-llm-7b-chat', 'choices': [{'index': 0, 'text': '', 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 135, 'completion_tokens': 0, 'total_tokens': 135}}
Selected Action: 0
Action: 0
Sending Prompt: 
        Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: -0.044
        Cart Velocity: -0.203
        

KeyboardInterrupt: 

#### Second code attempt from ChatGPT

##### Import libraries

In [13]:
import gymnasium as gym
import requests
import json
import time
import matplotlib.pyplot as plt

##### LLM API details

In [14]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "deepseek-llm-7B-chat-GGUF"  # Adjust if needed

##### Definition of query for LLM

In [16]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    You are controlling a CartPole in OpenAI Gymnasium.
    The current state: [cart_position, cart_velocity, pole_angle, pole_velocity].
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
    try:
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        raw_result = response_json["choices"][0]["message"]["content"].strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        print(f"Final Action: {action}")
    except Exception as e:
        print("Error processing LLM response:", e)
        action = 0  # Default action in case of failure

    return action

#### Running carpole

In [17]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")




In [19]:
if __name__ == "__main__":
    run_cartpole()


LLM Response: {'id': 'chatcmpl-c3vmyqdkyqg1ootj2tcphn', 'object': 'chat.completion', 'created': 1741181876, 'model': 'deepseek-llm-7b-chat', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '1'}}], 'usage': {'prompt_tokens': 138, 'completion_tokens': 1, 'total_tokens': 139}, 'system_fingerprint': 'deepseek-llm-7b-chat'}
Raw LLM Response: 1
Final Action: 1
LLM Response: {'id': 'chatcmpl-j3bdkm94u8pp8n45ztlhk', 'object': 'chat.completion', 'created': 1741181883, 'model': 'deepseek-llm-7b-chat', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'length', 'message': {'role': 'assistant', 'content': 'The observation provided is for the CartPole'}}], 'usage': {'prompt_tokens': 137, 'completion_tokens': 9, 'total_tokens': 146}, 'system_fingerprint': 'deepseek-llm-7b-chat'}
Raw LLM Response: The observation provided is for the CartPole
Final Action: 0
LLM Response: {'id': 'chatcmpl-oc619q1kkuc256207wiil1', 'object': 'chat

##### Change in Prompt -- details of the observation

In [20]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f}
        Cart Velocity: {observation[1]:.3f}
        Pole Angle: {observation[2]:.3f}
        Pole Angular Velocity: {observation[3]:.3f}
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        raw_result = response_json["choices"][0]["message"]["content"].strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        print(f"Final Action: {action}")
    except Exception as e:
        print("Error processing LLM response:", e)
        action = 0  # Default action in case of failure

    return action

In [21]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [22]:
if __name__ == "__main__":
    run_cartpole()



    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: 0.024
        Cart Velocity: 0.037
        Pole Angle: -0.044
        Pole Angular Velocity: 0.002
    
    Observation: [ 0.0243403   0.03736877 -0.0442324   0.00231141]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-o38lvf5pwsk5h9kcwsonr', 'object': 'chat.completion', 'created': 1741204879, 'model': 'deepseek-llm-7b-chat', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '0'}}], 'usage': {'prompt_tokens': 172, 'completion_tokens': 1, 'total_tokens': 173}, 'system_fingerprint': 'deepseek-llm-7b-chat'}
Raw LLM Response: 0
Final Action: 0

    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: 0.025
        Cart Velocity: -0.157
        Pole Angle: -0.044
        Pole Angular Velocity: 0.28

#### LLM-Agent DeepSeek Coder

In [30]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "DeepSeek-Coder-V2-Lite-Instruct-GGUF"  # Adjust if needed
# MODEL_NAME = "deepseek-llm-7B-chat-GGUF"  # Adjust if needed


In [31]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [32]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [33]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: 0.049 # Min=-4.8 and Max=4.8
        Cart Velocity: -0.005 # Min=-Inf and Max=Inf
        Pole Angle: -0.009 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: 0.043 # Min=-Inf and Max=Inf
    
    Observation: [ 0.04863442 -0.00488335 -0.00856156  0.04293367]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-rnfk9z0w8s3o3q0rqx7v5', 'object': 'chat.completion', 'created': 1741257763, 'model': 'deepseek-coder-v2-lite-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': ' 1'}}], 'usage': {'prompt_tokens': 223, 'completion_tokens': 2, 'total_tokens': 225}, 'system_fingerprint': 'deepseek-coder-v2-lite-instruct'}
Raw LLM Response: 1
Final Action: 1

    Your task is to control a cart in CartPole-v1 to keep the pole 

#### LLM-Agent Vicuna

In [34]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "vicuna-13B-v1.5-GGUF"  # Adjust if needed

In [35]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [36]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [37]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: 0.049 # Min=-4.8 and Max=4.8
        Cart Velocity: -0.018 # Min=-Inf and Max=Inf
        Pole Angle: -0.000 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: -0.041 # Min=-Inf and Max=Inf
    
    Observation: [ 0.04862015 -0.01751299 -0.00034378 -0.04132786]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-scr242x4nps2d6yccrxukp', 'object': 'chat.completion', 'created': 1741258139, 'model': 'vicuna-13b-v1.5', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': ''}}], 'usage': {'prompt_tokens': 239, 'completion_tokens': 0, 'total_tokens': 239}, 'system_fingerprint': 'vicuna-13b-v1.5'}
Raw LLM Response: 
Error processing LLM response: list index out of range

    Your task is to control a cart in CartPole-v1 to keep the

#### LLM-Agent Qwen2.5 7b Coder instruct

In [38]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "Qwen2.5-Coder-7B-Instruct-GGUF"  # Adjust if needed

In [39]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [40]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [41]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: 0.033 # Min=-4.8 and Max=4.8
        Cart Velocity: 0.023 # Min=-Inf and Max=Inf
        Pole Angle: 0.026 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: -0.017 # Min=-Inf and Max=Inf
    
    Observation: [ 0.03283882  0.02259768  0.02591376 -0.01703708]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-rmb9sz12hmb9eaka0eg4aw', 'object': 'chat.completion', 'created': 1741258909, 'model': 'qwen2.5-coder-7b-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '1'}}], 'usage': {'prompt_tokens': 216, 'completion_tokens': 1, 'total_tokens': 217}, 'system_fingerprint': 'qwen2.5-coder-7b-instruct'}
Raw LLM Response: 1
Final Action: 1

    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

  

#### LLM Agent Mistral AI 7b Instruct

In [54]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "Mistral-7B-Instruct-v0.2-GGUF"  # Adjust if needed

In [55]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [56]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [57]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: -0.021 # Min=-4.8 and Max=4.8
        Cart Velocity: 0.005 # Min=-Inf and Max=Inf
        Pole Angle: -0.034 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: 0.022 # Min=-Inf and Max=Inf
    
    Observation: [-0.02088822  0.0048689  -0.03419824  0.02169391]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-01uitcahk53bmgaw19sy7wq', 'object': 'chat.completion', 'created': 1741367672, 'model': 'mistral-7b-instruct-v0.2', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'length', 'message': {'role': 'assistant', 'content': ' Based on the current state and observation, I'}}], 'usage': {'prompt_tokens': 218, 'completion_tokens': 9, 'total_tokens': 227}, 'system_fingerprint': 'mistral-7b-instruct-v0.2'}
Raw LLM Response: Based on the current state and observation, I
Final 

#### Qwen2.5-14B-DeepSeek-R1-1M

In [61]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "Qwen2.5-14B-DeepSeek-R1-1M"  # Adjust if needed

In [62]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [63]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [65]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: -0.047 # Min=-4.8 and Max=4.8
        Cart Velocity: -0.016 # Min=-Inf and Max=Inf
        Pole Angle: 0.038 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: -0.020 # Min=-Inf and Max=Inf
    
    Observation: [-0.04731366 -0.01620284  0.03819104 -0.01988509]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-zt786u9j8wz7ikkeyc6h', 'object': 'chat.completion', 'created': 1741549402, 'model': 'qwen2.5-14b-deepseek-r1-1m', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'length', 'message': {'role': 'assistant', 'content': "Alright, I'm trying to figure out whether"}}], 'usage': {'prompt_tokens': 216, 'completion_tokens': 9, 'total_tokens': 225}, 'system_fingerprint': 'qwen2.5-14b-deepseek-r1-1m'}
Raw LLM Response: Alright, I'm trying to figure out whether
Final Action:

#### Qwen2.5 7b Instruct

In [66]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "Qwen2.5-7B-Instruct-1M-GGUF"  # Adjust if needed

In [67]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [68]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [69]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: -0.028 # Min=-4.8 and Max=4.8
        Cart Velocity: 0.014 # Min=-Inf and Max=Inf
        Pole Angle: 0.035 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: 0.023 # Min=-Inf and Max=Inf
    
    Observation: [-0.02828094  0.01398263  0.03481926  0.02295804]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-7hoptx03ue3u4jw4io1ymn', 'object': 'chat.completion', 'created': 1741688516, 'model': 'qwen2.5-7b-instruct-1m', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '1'}}], 'usage': {'prompt_tokens': 216, 'completion_tokens': 1, 'total_tokens': 217}, 'system_fingerprint': 'qwen2.5-7b-instruct-1m'}
Raw LLM Response: 1
Final Action: 1

    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        

#### Qwen2.5-1.5B-Instruct-GGUF

In [70]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME = "Qwen2.5-1.5B-Instruct-GGUF"  # Adjust if needed

In [71]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    prompt = f"""
    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: {observation[0]:.3f} # Min=-4.8 and Max=4.8
        Cart Velocity: {observation[1]:.3f} # Min=-Inf and Max=Inf
        Pole Angle: {observation[2]:.3f} # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: {observation[3]:.3f} # Min=-Inf and Max=Inf
    
    Observation: {observation}
    
    Respond with ONLY '0' or '1' and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": "You are a reinforcement learning agent for CartPole."},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n"],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        #action = int(raw_result[0]) if raw_result and raw_result[0] in ["0", "1"] else 0
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # ✅ Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [72]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    observation, info = env.reset()
    
    total_reward = 0
    for step in range(200):  # Maximum steps per episode
        action = query_llm(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        env.render()
        if terminated or truncated:
            break  # End episode if the pole falls or max steps reached

        time.sleep(0.05)  # Small delay for better visualization

    env.close()
    print(f"Total reward: {total_reward}")

In [75]:
if __name__ == "__main__":
    run_cartpole()


    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

        Current State:
        Cart Position: -0.007 # Min=-4.8 and Max=4.8
        Cart Velocity: -0.040 # Min=-Inf and Max=Inf
        Pole Angle: -0.045 # Min=-0.418 (-24 degrees) rad and Max=0.418 rad (24 degrees)
        Pole Angular Velocity: -0.022 # Min=-Inf and Max=Inf
    
    Observation: [-0.00696013 -0.04000566 -0.04463327 -0.02249608]
    
    Respond with ONLY '0' or '1' and nothing else.
    
LLM Response: {'id': 'chatcmpl-9a8zpsydp5uqe6sxzywsoi', 'object': 'chat.completion', 'created': 1741722420, 'model': 'qwen2.5-1.5b-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '1'}}], 'usage': {'prompt_tokens': 213, 'completion_tokens': 1, 'total_tokens': 214}, 'system_fingerprint': 'qwen2.5-1.5b-instruct'}
Raw LLM Response: 1
Final Action: 1

    Your task is to control a cart in CartPole-v1 to keep the pole balanced.

       