#### Results CartPole 

##### Libraries

In [13]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import requests
import time
import pygame
import re
import json
import matplotlib.pyplot as plt
import pandas as pd

##### Model

In [48]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
#MODEL_NAME = "Qwen2.5-7B-Instruct-1M-GGUF"  
#MODEL_NAME ="Mistral-Nemo-Instruct-2407-GGUF"
#MODEL_NAME = "Phi-4-mini-instruct-GGUF"
#MODEL_NAME = "deepseek-llm-7B-chat-GGUF"
#MODEL_NAME ="Qwen2-0.5B-Instruct-GGUF"
MODEL_NAME = "Qwen2.5-1.5B-Instruct-GGUF"

In [49]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    
    instruction= f"""
        You are a reinforcement learning agent for CartPole. Your task is to control a cart in CartPole to keep the pole balanced.
    """
    
    prompt = f"""
    
            Given the following CartPole state, decide the optimal action:

            Observation: {observation}

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": instruction},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10, 
        "stop": ["\n", ".", " "],
    }

    try:
        print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [50]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    EPISODES = 10  # Run multiple episodes
    rewards = []

    for episode in range(EPISODES):
        observation, info = env.reset()
        total_reward = 0
        for step in range(200):
            action = query_llm(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
        rewards.append(total_reward)
        print(f"Episode {episode+1}: Total Reward = {total_reward}")

    # Close the environment after all episodes
    env.close()
    
    print("Reward History:", rewards)
    avg_reward = sum(rewards) / len(rewards)
    print(f"\nAverage Reward over {EPISODES} episodes: {avg_reward}")


##### Result Qwen2.5-7B-Instruct-1M-GGUF

In [None]:
if __name__ == "__main__":
    run_cartpole()


    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.03420064  0.04103522  0.00607424  0.00414078]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-4po9llwld8gfoxg4m5fb9', 'object': 'chat.completion', 'created': 1742241684, 'model': 'qwen2.5-7b-instruct-1m', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '1'}}], 'usage': {'prompt_tokens': 130, 'completion_tokens': 1, 'total_tokens': 131}, 'system_fingerprint': 'qwen2.5-7b-instruct-1m'}
Raw LLM Response: 1
Final Action: 1

    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.03337994  0.23606953  0.00615706 -0.28661948]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-gg0l7o87wjt3rbi028r4ca', 'object': 'chat.completion', 'created': 

: 

##### Result Mistral-Nemo-Instruct-2407-GGUF

In [90]:
if __name__ == "__main__":
    run_cartpole()


    
            Given the following CartPole state, decide the optimal action:

            Observation: [0.02349716 0.00533033 0.02967248 0.00565439]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-qxz4xlxcrzikhiqs86zx', 'object': 'chat.completion', 'created': 1742241052, 'model': 'mistral-nemo-instruct-2407', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': 'Based'}}], 'usage': {'prompt_tokens': 121, 'completion_tokens': 1, 'total_tokens': 122}, 'system_fingerprint': 'mistral-nemo-instruct-2407'}
Raw LLM Response: Based
Final Action: 0

    
            Given the following CartPole state, decide the optimal action:

            Observation: [ 0.02360376 -0.19020432  0.02978557  0.3075497 ]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-tx0sat4rvzq4kifk5qtfb', 'object': 'chat.completion', '

##### Phi-4-mini-instruct-GGUF

In [86]:
if __name__ == "__main__":
    run_cartpole()


    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.01366367  0.01126581 -0.03066746  0.03633359]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-1i8gn9ds5eeomqu25v8c1o', 'object': 'chat.completion', 'created': 1742240626, 'model': 'phi-4-mini-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': 'Based'}}], 'usage': {'prompt_tokens': 100, 'completion_tokens': 1, 'total_tokens': 101}, 'system_fingerprint': 'phi-4-mini-instruct'}
Raw LLM Response: Based
Final Action: 0

    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.01343836 -0.18340324 -0.02994079  0.31918505]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-cbz8kd3nwmselde5242q', 'object': 'chat.completion', 'created':

##### deepseek-llm-7B-chat-GGUF

In [82]:
if __name__ == "__main__":
    run_cartpole()


    
            Given the following CartPole state, decide the optimal action:

            Observation: [ 0.00971532 -0.04946095  0.04153443  0.02915428]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-4jd14f4smj4h1lx8mozyll', 'object': 'chat.completion', 'created': 1742231536, 'model': 'deepseek-llm-7b-chat', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': 'The'}}], 'usage': {'prompt_tokens': 144, 'completion_tokens': 1, 'total_tokens': 145}, 'system_fingerprint': 'deepseek-llm-7b-chat'}
Raw LLM Response: The
Final Action: 0

    
            Given the following CartPole state, decide the optimal action:

            Observation: [ 0.0087261  -0.24515316  0.04211751  0.33464706]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-ip4g3hifttkvxgzz62px8', 'object': 'chat.completion', 'created': 

##### Qwen2-0.5B-Instruct-GGUF

In [47]:
if __name__ == "__main__":
    run_cartpole()


    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.02544669 -0.04006143  0.02590476  0.00880475]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-ljll65sjh7itpqosdjpmqn', 'object': 'chat.completion', 'created': 1742388689, 'model': 'qwen2-0.5b-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '0'}}], 'usage': {'prompt_tokens': 129, 'completion_tokens': 1, 'total_tokens': 130}, 'system_fingerprint': 'qwen2-0.5b-instruct'}
Raw LLM Response: 0
Final Action: 0

    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.02624792 -0.23554513  0.02608085  0.30954707]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-4bdhdqjnathusi95jwj5ug', 'object': 'chat.completion', 'created': 17423

##### Qwen2.5-1.5B-Instruct-GGUF

In [51]:
if __name__ == "__main__":
    run_cartpole()


    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.03587938 -0.02144035 -0.01114344 -0.00821782]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-lddiwec72krsleo3q728nh', 'object': 'chat.completion', 'created': 1742389264, 'model': 'qwen2.5-1.5b-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '1'}}], 'usage': {'prompt_tokens': 127, 'completion_tokens': 1, 'total_tokens': 128}, 'system_fingerprint': 'qwen2.5-1.5b-instruct'}
Raw LLM Response: 1
Final Action: 1

    
            Given the following CartPole state, decide the optimal action:

            Observation: [-0.03630819  0.17383963 -0.0113078  -0.3043957 ]

            Respond with either '0' (Left) or '1' (Right), and nothing else.
    
LLM Response: {'id': 'chatcmpl-w9b8v9iq3gphluzlhypzi9', 'object': 'chat.completion', 'created': 1

#### Results with CoT prompt

In [52]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
#MODEL_NAME = "Qwen2.5-7B-Instruct-1M-GGUF"  
#MODEL_NAME ="Mistral-Nemo-Instruct-2407-GGUF"
#MODEL_NAME = "Phi-4-mini-instruct-GGUF"
#MODEL_NAME = "deepseek-llm-7B-chat-GGUF"
#MODEL_NAME ="Qwen2-0.5B-Instruct-GGUF"
MODEL_NAME="Qwen2.5-1.5B-Instruct-GGUF"

In [53]:
instruction= f"""
        You are a reinforcement learning agent controlling a cart in the CartPole-v1 environment. 
        Your goal is to keep the pole balanced by choosing the optimal action.

        ## Rules:
        - If the pole **leans right** (positive pole angle), moving **right (1)** may help balance it.
        - If the pole **leans left** (negative pole angle), moving **left (0)** may help balance it.
        - If the pole is **falling right** (positive pole angular velocity), move **right (1)** to counteract.
        - If the pole is **falling left** (negative pole angular velocity), move **left (0)** to counteract.
        - If the cart is **moving too fast right**, move **left (0)** to slow it down.
        - If the cart is **moving too fast left**, move **right (1)** to slow it down.

        ## Examples:

        Example 1:
        State:
        Cart Position:  0.100
        Cart Velocity: -0.020
        Pole Angle:     0.050
        Pole Angular Velocity: -0.030
        Reasoning:
        - The pole is leaning **right** (0.050) but falling **left** (-0.030).
        - To stabilize, move **right** (1).
        Response: 1

        Example 2:
        State:
        Cart Position: -0.100
        Cart Velocity:  0.030
        Pole Angle:    -0.060
        Pole Angular Velocity:  0.040
        Reasoning:
        - The pole is leaning **left** (-0.060) and falling **right** (0.040).
        - To stabilize, move **left** (0).
        Response: 0

        ## Response Format:
        - Respond only with **'0' or '1'**.
        - Do **not** include explanations.
    """




In [54]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    
    prompt = f"""

        Now given the following state:
        Cart Position: {observation[0]:.3f}
        Cart Velocity: {observation[1]:.3f}
        Pole Angle: {observation[2]:.3f}
        Pole Angular Velocity: {observation[3]:.3f}
            
            Respond only with '0' or '1'.
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": instruction},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10, 
        "stop": ["\n", ".", " "],
    }

    try:
        #print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        #print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [55]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    
    EPISODES = 10  # Run multiple episodes
    rewards = []

    for episode in range(EPISODES):
        observation, info = env.reset()
        total_reward = 0
        for step in range(200):
            action = query_llm(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
        
        rewards.append(total_reward)
        print(f"Episode {episode+1}: Total Reward = {total_reward}")

    # Close the environment after all episodes
    env.close()    

    # Print the rewards and average rewards 
    print("Reward History:", rewards)
    avg_reward = sum(rewards) / len(rewards)
    print(f"\nAverage Reward over {EPISODES} episodes: {avg_reward}")


##### Qwen2.5-7B-Instruct-1M-GGUF

In [10]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Ac

##### Mistral-Nemo-Instruct-2407-GGU

In [15]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 19.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action:

##### Phi-4-mini-instruct-GGUF

In [25]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 12.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action:

##### deepseek-llm-7B-chat-GGUF

In [12]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 9.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 2: Total Reward = 9.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw 

##### Qwen2-0.5B-Instruct-GGUF

In [43]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 13.0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Episode 2: Total Reward = 11.0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Ra

##### Qwen2.5-1.5B-Instruct-GGUF

In [56]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 8.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 2: Total Reward = 10.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw

#### Results Zero Shot CoT

In [57]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
#MODEL_NAME = "Qwen2.5-7B-Instruct-1M-GGUF"  
#MODEL_NAME ="Mistral-Nemo-Instruct-2407-GGUF"
#MODEL_NAME = "Phi-4-mini-instruct-GGUF"
#MODEL_NAME = "deepseek-llm-7B-chat-GGUF"
#MODEL_NAME ="Qwen2-0.5B-Instruct-GGUF"
MODEL_NAME="Qwen2.5-1.5B-Instruct-GGUF"

In [58]:
instruction= f"""
        You are a reinforcement learning agent controlling a cart in the CartPole-v1 environment. 
        Your goal is to keep the pole balanced by choosing the optimal action.

    """


In [59]:
def query_llm(observation):
    """Sends observation to the LLM and retrieves an action (0 or 1)."""
    
    prompt = f"""

        Given the following state:
        Cart Position: {observation[0]:.3f}
        Cart Velocity: {observation[1]:.3f}
        Pole Angle: {observation[2]:.3f}
        Pole Angular Velocity: {observation[3]:.3f}
            
        Respond only with '0' or '1'.
        
        Let's think step by step
    """

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": instruction},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10, 
        "stop": ["\n", ".", " "],
    }

    try:
        #print(prompt)  # Debugging 
        response = requests.post(LLM_API_URL, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
        response_json = response.json()
        #print("LLM Response:", response_json)

        # Extract raw response
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        print(f"Raw LLM Response: {raw_result}")

        # Extract the first valid action (0 or 1)
        raw_result = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        raw_result = raw_result.split()[0]  # Ensure we take the first valid word
        action = 1 if raw_result == "1" else 0  # Force a binary output
        
        print(f"Final Action: {action}")
        return action  # Ensure the function returns an action

    except Exception as e:
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure

In [60]:
def run_cartpole():
    """Runs the CartPole-v1 environment with the LLM-based agent."""
    env = gym.make("CartPole-v1", render_mode="human")  # Change render_mode to 'None' if no visualization needed
    
    EPISODES = 10  # Run multiple episodes
    rewards = []

    for episode in range(EPISODES):
        observation, info = env.reset()
        total_reward = 0
        for step in range(200):
            action = query_llm(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
        
        rewards.append(total_reward)
        print(f"Episode {episode+1}: Total Reward = {total_reward}")

    # Close the environment after all episodes
    env.close()    

    # Print the rewards and average rewards 
    print("Reward History:", rewards)
    avg_reward = sum(rewards) / len(rewards)
    print(f"\nAverage Reward over {EPISODES} episodes: {avg_reward}")


##### Qwen2.5-7B-Instruct-1M-GGUF

In [18]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 17.0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action:

##### Mistral-Nemo-Instruct-2407-GGUF

In [23]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: -
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Given
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Episode 1: Total Reward = 10.0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Given
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Episode 2: Total Reward = 9.0
Raw LLM Response: Given
Final Action: 0
Raw LLM Response: Given
Final Action: 0
Raw LLM Response: Given
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: Based
Final Ac

##### Phi-4-mini-instruct-GGUF

In [28]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: To
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: In
Final Action: 0
Episode 1: Total Reward = 10.0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: In
Final Action: 0
Episode 2: Total Reward = 9.0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: To
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM Response: In
Final Action: 0
Raw LLM 

##### deepseek-llm-7B-chat-GGUF

In [33]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Episode 1: Total Reward = 9.0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Since
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Given
Final Action: 0
Episode 2: Total Reward = 9.0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Based
Final Action: 0
Raw LLM Response: Ba

##### Qwen2-0.5B-Instruct-GGUF

In [38]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Episode 1: Total Reward = 14.0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Episode 2: Total Reward = 9.0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw

##### Qwen2.5-1.5B-Instruct-GGUF

In [61]:
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Episode 1: Total Reward = 10.0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: 0
Raw LLM Response: 1
Final Action: 1
Raw LLM Response: 0
Final Action: