#### Results Zero Shot with modified instruction

##### Import libraries

In [15]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import requests
import time
import pygame
import re
import json
import matplotlib.pyplot as plt
import pandas as pd

##### Selected models

In [16]:
# Model identifier sent in the "model" field of every chat-completion request.
# Alternative used for the Mistral runs:
#   MODEL_NAME = "Mistral-Nemo-Instruct-2407-GGUF"
MODEL_NAME = "Phi-4-mini-instruct-GGUF"

# OpenAI-compatible chat-completions endpoint (point this at your LM Studio server).
LLM_API_URL = "http://localhost:1234/v1/chat/completions"

##### Creating the instruction

In [17]:
# System prompt sent with every request. It is a plain string (no f-prefix)
# because the text contains no placeholders to interpolate.
instruction = """
        You are a reinforcement learning agent controlling a cart in the CartPole-v1 environment. 
        Your goal is to keep the pole balanced by choosing the optimal action.

        ## Rules:
        - If the pole **leans right** (positive pole angle), moving **right (1)** may help balance it.
        - If the pole **leans left** (negative pole angle), moving **left (0)** may help balance it.
        - If the pole is **falling right** (positive pole angular velocity), move **right (1)** to counteract.
        - If the pole is **falling left** (negative pole angular velocity), move **left (0)** to counteract.
        - If the cart is **moving too fast right**, move **left (0)** to slow it down.
        - If the cart is **moving too fast left**, move **right (1)** to slow it down.
        - If the cart is **stationary**, the pole's angle and angular velocity are more important than the cart's position.
        
        ## Response Format:
        - Respond only with **'0' or '1'**.
        - Do **not** include explanations.
    """

##### Creating the prompt and querying the LLM

In [18]:
def _post_chat_completion(payload, timeouts=(20, 40)):
    """POST ``payload`` to ``LLM_API_URL``, retrying on timeout.

    Each entry of ``timeouts`` is one attempt (seconds). A
    ``requests.exceptions.Timeout`` on any attempt but the last triggers a
    retry with the next, longer timeout; on the last attempt it propagates.
    """
    body = json.dumps(payload)
    last_attempt = len(timeouts) - 1
    for attempt, timeout in enumerate(timeouts):
        try:
            return requests.post(
                LLM_API_URL,
                headers={"Content-Type": "application/json"},
                data=body,
                timeout=timeout,
            )
        except requests.exceptions.Timeout:
            if attempt == last_attempt:
                raise
            print("Timeout occurred — retrying once with longer timeout...")


def _extract_action(text):
    """Return the first standalone ``0``/``1`` digit in ``text`` as int, or None."""
    # Lookarounds reject digits that are part of a longer number such as "10".
    found = re.search(r"(?<!\d)[01](?!\d)", text)
    return int(found.group()) if found else None


def query_llm(observation):
    """Send an observation to the LLM and return the action it chose.

    Parameters
    ----------
    observation : indexable of 4 numbers
        Elements 0..3 are formatted into the prompt as cart position, cart
        velocity, pole angle and pole angular velocity.

    Returns
    -------
    int
        ``1`` or ``0`` as read from the model's reply. Falls back to ``0``
        when the request fails, the reply is empty, or the reply contains no
        standalone ``0``/``1`` digit; each fallback prints a message first.
    """
    prompt = f"""

        Now given the following state:
        Cart Position: {observation[0]:.3f}
        Cart Velocity: {observation[1]:.3f}
        Pole Angle: {observation[2]:.3f}
        Pole Angular Velocity: {observation[3]:.3f}
            
        Think step-by-step:
            - Is the pole leaning left or right? (Check angle)
            - Is it falling quickly or slowly? (Check angular velocity)
            - Should we move the cart left or right to balance it?

    Then respond with:
    - 0 to move LEFT
    - 1 to move RIGHT

    Just respond with the action number.
    """

    # NOTE(review): max_tokens=10 plus stop sequences on whitespace and "."
    # cut the reply after its first token, so the "think step-by-step" part
    # of the prompt cannot actually be emitted by the model.
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "system", "content": instruction},
                     {"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_tokens": 10,
        "stop": ["\n", ".", " "],
    }

    try:
        response = _post_chat_completion(payload)
        # Surface HTTP errors explicitly instead of parsing an error body.
        response.raise_for_status()

        # `or` guards cover both a missing key and an explicit empty list / null.
        choices = response.json().get("choices") or [{}]
        content = choices[0].get("message", {}).get("content") or ""
        raw_result = content.strip()

        print(f"Raw LLM Response: '{raw_result}'")

        if not raw_result:
            print("Warning: Empty response from LLM.")
            return 0  # Default fallback action

        action = _extract_action(raw_result)
        if action is None:
            # Make the fallback visible rather than silently choosing LEFT.
            print(f"Warning: No 0/1 action found in '{raw_result}'; defaulting to 0.")
            action = 0

        print(f"Final Action: {action}")
        return action

    except Exception as e:
        # Deliberate best-effort: one bad reply must not abort the whole run.
        print("Error processing LLM response:", e)
        return 0

##### Running the experiment

In [19]:
def run_cartpole(episodes=50, max_steps=200, render_mode="human"):
    """Run CartPole-v1 with the LLM-based agent and print per-episode statistics.

    Parameters
    ----------
    episodes : int, default 50
        Number of episodes to play.
    max_steps : int, default 200
        Maximum number of LLM queries (environment steps) per episode.
    render_mode : str or None, default "human"
        Passed to ``gym.make``; use ``None`` (the object, not the string
        ``'None'``) to disable visualisation.

    Returns
    -------
    None
        Rewards and wall-clock times are printed, not returned.
    """
    env = gym.make("CartPole-v1", render_mode=render_mode)
    rewards = []
    time_history = []

    try:
        for episode in range(episodes):
            observation, _ = env.reset()
            total_reward = 0
            start_time = time.perf_counter()

            for _ in range(max_steps):
                action = query_llm(observation)
                observation, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                if terminated or truncated:
                    break

            # Measure after the loop so the time is also recorded when the
            # episode reaches max_steps without terminating or truncating.
            elapsed = time.perf_counter() - start_time

            rewards.append(total_reward)
            time_history.append(elapsed)
            print(f"Episode {episode+1}: Total Reward = {total_reward}")
            print(f" Elapsed Time: {elapsed:.2f} seconds")
    finally:
        # Always release the environment (and its render window), even on error.
        env.close()

    if not rewards:
        # Avoid dividing by zero in the averages below.
        print("No episodes were run.")
        return

    print("Time History:", time_history)
    avg_time = sum(time_history) / len(time_history)
    print(f"\nAverage Time over {episodes} episodes: {avg_time:.2f} seconds")

    print("Reward History:", rewards)
    avg_reward = sum(rewards) / len(rewards)
    print(f"\nAverage Reward over {episodes} episodes: {avg_reward}")


##### Result Mistral NeMo (10 episodes)

In [6]:
# Launch the experiment and stream its log below.
# NOTE(review): this call passes no arguments, so the model that answers is
# whatever MODEL_NAME was set to when the cell ran, and the episode count is
# run_cartpole's own setting — confirm both match this section's heading.
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Resp

##### Result Phi-4 mini (10 episodes)

In [8]:
# Launch the experiment and stream its log below.
# NOTE(review): this call passes no arguments, so the model that answers is
# whatever MODEL_NAME was set to when the cell ran, and the episode count is
# run_cartpole's own setting — confirm both match this section's heading.
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Episode 1: Total Reward = 8.0
 Elapsed Time: 23.42 seconds
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Episode 2: Total Reward = 9.0
 Elapsed Time: 24.87 seconds
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM 

##### Result Mistral NeMo (50 episodes)

In [14]:
# Launch the experiment and stream its log below.
# NOTE(review): this call passes no arguments, so the model that answers is
# whatever MODEL_NAME was set to when the cell ran, and the episode count is
# run_cartpole's own setting — confirm both match this section's heading.
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '0'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Resp

##### Result Phi-4-Mini (50 episodes)

In [None]:
# Launch the experiment and stream its log below.
# NOTE(review): this call passes no arguments, so the model that answers is
# whatever MODEL_NAME was set to when the cell ran, and the episode count is
# run_cartpole's own setting — confirm both match this section's heading.
if __name__ == "__main__":
    run_cartpole()

Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '-'
Final Action: 0
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Episode 1: Total Reward = 11.0
 Elapsed Time: 32.56 seconds
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Episode 2: Total Reward = 9.0
 Elapsed Time: 25.63 seconds
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM Response: '1'
Final Action: 1
Raw LLM

: 