#### Results: CartPole-v1 with an LLM agent

##### Libraries

In [27]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import requests
import time
import pygame
import re
import json
import matplotlib.pyplot as plt
import pandas as pd

##### Model

In [28]:
# Chat-completions endpoint of the LLM server (LM Studio in the original setup).
# Point this at your own server if it is not listening on localhost:1234.
LLM_API_URL = "http://localhost:1234/v1/chat/completions"

# Model identifier sent with every request. Keep exactly one line active.
# MODEL_NAME = "Mistral-Nemo-Instruct-2407-GGUF"
# MODEL_NAME = "Phi-4-mini-instruct-GGUF"
MODEL_NAME = "Qwen2-0.5B-Instruct-GGUF"

In [29]:
def _parse_action(reply_text, default=0):
    """Return the first standalone 0 or 1 in ``reply_text``, else ``default``.

    A digit counts as standalone when it is not part of a longer number, so
    "10", "0.5" or an echoed observation value such as "-0.0123" never match,
    while "1", "1.", "**1**" and "Action: 1" all do.
    """
    match = re.search(r"(?<![\d.])[01](?!\.?\d)", reply_text or "")
    return int(match.group()) if match else default


def query_llm(observation, timeout=30):
    """Send an observation to the LLM and return the chosen action (0 or 1).

    Args:
        observation: The current CartPole observation; it is interpolated
            verbatim into the user prompt.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the episode loop indefinitely.

    Returns:
        1 if the reply contains a standalone "1", 0 if it contains a
        standalone "0". Falls back to 0 when the reply cannot be parsed or
        when the request fails for any reason (a message is printed in both
        cases so the fallback is visible in the log).
    """
    
    instruction = """
    You are a reinforcement learning agent for the CartPole-v1 environment.

    Your goal is to keep the pole balanced by choosing the correct action at each time step.

    Each observation is a list of four numbers in this order:
    1. Cart Position (x)
    2. Cart Velocity (x_dot)
    3. Pole Angle (theta)
    4. Pole Angular Velocity (theta_dot)

    Use these values to decide the best action to take:
    - Action 0 = Move cart to the LEFT
    - Action 1 = Move cart to the RIGHT

    Respond with only the action number (0 or 1), and nothing else.
    """

    prompt = f"""
    Observation: {observation}

    Think step-by-step:
    - Is the pole leaning left or right? (Check angle)
    - Is it falling quickly or slowly? (Check angular velocity)
    - Should we move the cart left or right to balance it?

    Then respond with:
    - 0 to move LEFT
    - 1 to move RIGHT

    Just respond with the action number.
    """

    # NOTE(review): the " " and "\n" stop sequences presumably cut a reply that
    # starts with whitespace down to an empty string, which then falls back to
    # action 0 -- confirm against the server's behavior before trusting runs
    # that answer 0 on every step.
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": instruction},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.5,
        "max_tokens": 20,
        "stop": ["\n", ".", " "]
    }

    try:
        response = requests.post(
            LLM_API_URL,
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload),
            timeout=timeout,
        )
        # Surface HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        response_json = response.json()

        # Tolerate a missing/empty "choices" list and a null "content" field.
        choices = response_json.get("choices") or [{}]
        raw_result = (choices[0].get("message", {}).get("content") or "").strip()

        action = _parse_action(raw_result, default=None)
        if action is None:
            print(f"Unparseable LLM reply {raw_result!r}; defaulting to action 0")
            action = 0

        # Logging
        print(f"Observation: {observation}")
        print(f"LLM Response: {action}")

        return action

    except Exception as e:
        # Deliberate best-effort: a failed request must not abort the episode.
        print("Error processing LLM response:", e)
        return 0  # Default action in case of failure


In [30]:
def run_cartpole(episodes=10, max_steps=200, render_mode="human"):
    """Run the CartPole-v1 environment with the LLM-based agent.

    Plays ``episodes`` episodes, asking ``query_llm`` for an action at every
    step, and prints the reward and wall-clock time of each episode followed
    by the averages over all episodes.

    Args:
        episodes: Number of episodes to play; must be at least 1.
        max_steps: Maximum number of steps per episode. An episode also ends
            early when the environment reports ``terminated`` or ``truncated``.
        render_mode: Passed to ``gym.make``. Use ``None`` (the Python value,
            not the string ``'None'``) if no visualization is needed.

    Raises:
        ValueError: If ``episodes`` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")

    env = gym.make("CartPole-v1", render_mode=render_mode)
    rewards = []
    time_history = []

    try:
        for episode in range(episodes):
            observation, _ = env.reset()
            total_reward = 0
            start_time = time.time()
            for _ in range(max_steps):
                action = query_llm(observation)
                observation, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                if terminated or truncated:
                    break
            # Measured after the loop so it is always defined and always
            # belongs to this episode, including when the step limit is
            # reached without the environment ending the episode.
            elapsed = time.time() - start_time

            rewards.append(total_reward)
            print(f"Episode {episode+1}: Total Reward = {total_reward}")

            time_history.append(elapsed)
            print(f" Elapsed Time: {elapsed:.2f} seconds")
    finally:
        # Close the environment even if an episode raised.
        env.close()

    print("Time History:", time_history)
    avg_time = sum(time_history) / len(time_history)
    print(f"\nAverage Time over {episodes} episodes: {avg_time:.2f} seconds")

    print("Reward History:", rewards)
    avg_reward = sum(rewards) / len(rewards)
    print(f"\nAverage Reward over {episodes} episodes: {avg_reward}")


##### Results: Mistral-Nemo-Instruct-2407-GGUF

In [25]:
# Runs the CartPole episodes with whichever MODEL_NAME is active in the
# configuration cell.
# NOTE(review): this section is labeled as the Mistral-Nemo run, so MODEL_NAME
# presumably has to be switched to that model before re-running -- confirm.
if __name__ == "__main__":
    run_cartpole()

Observation: [-0.01236809  0.04198546 -0.00319995 -0.02485636]
LLM Response: 0
Observation: [-0.01152838 -0.15309046 -0.00369708  0.26681522]
LLM Response: 1
Observation: [-0.01459019  0.04208406  0.00163922 -0.02703149]
LLM Response: 1
Observation: [-0.01374851  0.23718247  0.00109859 -0.3191968 ]
LLM Response: 0
Observation: [-0.00900486  0.04204489 -0.00528534 -0.0261676 ]
LLM Response: 0
Observation: [-0.00816396 -0.15300088 -0.00580869  0.26484308]
LLM Response: 0
Observation: [-1.1223977e-02 -3.4803945e-01 -5.1183254e-04  5.5568826e-01]
LLM Response: 1
Observation: [-0.01818477 -0.1529103   0.01060193  0.2628441 ]
LLM Response: 0
Observation: [-0.02124297 -0.34818196  0.01585881  0.558852  ]
LLM Response: 1
Observation: [-0.02820661 -0.15328617  0.02703585  0.2712074 ]
LLM Response: 0
Observation: [-0.03127233 -0.34878328  0.03246     0.57229346]
LLM Response: 0
Observation: [-0.038248   -0.54434496  0.04390587  0.87502307]
LLM Response: 1
Observation: [-0.0491349  -0.34984654  0

##### Results: Phi-4-mini-instruct-GGUF

In [20]:
# Runs the CartPole episodes with whichever MODEL_NAME is active in the
# configuration cell.
# NOTE(review): this section is labeled as the Phi-4-mini run, so MODEL_NAME
# presumably has to be switched to that model before re-running -- confirm.
if __name__ == "__main__":
    run_cartpole()

Observation: [-0.01894447  0.02054997  0.01171449  0.00109933]
LLM Response: 0
Observation: [-0.01853347 -0.174738    0.01173647  0.2974552 ]
LLM Response: 0
Observation: [-0.02202823 -0.37002528  0.01768558  0.59381634]
LLM Response: 0
Observation: [-0.02942874 -0.5653903   0.02956191  0.8920173 ]
LLM Response: 0
Observation: [-0.04073654 -0.7609005   0.04740225  1.1938444 ]
LLM Response: 0
Observation: [-0.05595455 -0.95660317  0.07127914  1.5009998 ]
LLM Response: 0
Observation: [-0.07508662 -1.1525147   0.10129914  1.8150592 ]
LLM Response: 0
Observation: [-0.09813691 -1.3486078   0.13760032  2.1374218 ]
LLM Response: 0
Observation: [-0.12510906 -1.5447968   0.18034875  2.469251  ]
LLM Response: 0
Episode 1: Total Reward = 9.0
 Elapsed Time: 26.25 seconds
Observation: [ 0.04190437 -0.00379333  0.03659861 -0.00890986]
LLM Response: 0
Observation: [ 0.04182851 -0.19942053  0.03642041  0.29509202]
LLM Response: 0
Observation: [ 0.0378401  -0.39504227  0.04232225  0.59903526]
LLM Respo

##### Results: Qwen2-0.5B-Instruct-GGUF

In [31]:
# Runs the CartPole episodes with whichever MODEL_NAME is active in the
# configuration cell (the Qwen2 model labeled in this section's header is the
# one left active there).
if __name__ == "__main__":
    run_cartpole()

Observation: [ 0.01479883 -0.00262138  0.04406992  0.02106171]
LLM Response: 0
Observation: [ 0.0147464  -0.19834672  0.04449115  0.32731703]
LLM Response: 0
Observation: [ 0.01077947 -0.39407292  0.05103749  0.6336919 ]
LLM Response: 0
Observation: [ 0.00289801 -0.5898683   0.06371133  0.94200104]
LLM Response: 0
Observation: [-0.00889936 -0.78578824  0.08255135  1.2540032 ]
LLM Response: 0
Observation: [-0.02461512 -0.9818647   0.10763142  1.5713588 ]
LLM Response: 0
Observation: [-0.04425241 -1.1780939   0.13905859  1.8955818 ]
LLM Response: 0
Observation: [-0.06781429 -1.3744226   0.17697023  2.2279837 ]
LLM Response: 0
Episode 1: Total Reward = 8.0
 Elapsed Time: 24.93 seconds
Observation: [ 0.02316907  0.0154818  -0.00691485  0.03421474]
LLM Response: 0
Observation: [ 0.0234787  -0.17954032 -0.00623055  0.32470796]
LLM Response: 0
Observation: [ 1.9887896e-02 -3.7457299e-01  2.6360783e-04  6.1541957e-01]
LLM Response: 0
Observation: [ 0.01239644 -0.56969863  0.012572    0.9081855