In [4]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

def record_audio(duration, sample_rate=16000):
    """
    Record audio from microphone
    
    Args:
        duration (int): Recording duration in seconds
        sample_rate (int): Audio sample rate, default is 16000Hz which Whisper expects
    
    Returns:
        numpy.ndarray: Recorded audio array
    """
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()  # Wait until recording is finished
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """
    Transcribe audio using Whisper model
    
    Args:
        audio_array (numpy.ndarray): Audio array to transcribe
        sample_rate (int): Sample rate of the audio
    
    Returns:
        str: Transcribed text
    """
    # Load model and processor
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    # Process audio
    input_features = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt"
    ).input_features
    
    # Generate tokens and decode to text
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    
    return transcription[0]

def main():
    # Set recording duration (in seconds)
    DURATION = 5
    SAMPLE_RATE = 16000
    
    try:
        # Record audio
        audio_array = record_audio(DURATION, SAMPLE_RATE)
        
        # Transcribe audio
        print("Transcribing...")
        text = transcribe_audio(audio_array, SAMPLE_RATE)
        
        print("\nTranscription:")
        print(text)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [5]:
main()

Recording for 5 seconds...
Recording finished!
Transcribing...


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Transcription:
 Hi, what's up? How's it going?


In [2]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [3]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [9]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

def record_audio(duration, sample_rate=16000):
    """
    Record audio from microphone
    
    Args:
        duration (int): Recording duration in seconds
        sample_rate (int): Audio sample rate, default is 16000Hz which Whisper expects
    
    Returns:
        numpy.ndarray: Recorded audio array
    """
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()  # Wait until recording is finished
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """
    Transcribe audio using Whisper model
    
    Args:
        audio_array (numpy.ndarray): Audio array to transcribe
        sample_rate (int): Sample rate of the audio
    
    Returns:
        str: Transcribed text
    """
    # Load model and processor
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    # Process audio
    # Process audio with explicit language setting and attention mask
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    input_features = inputs.input_features
    attention_mask = inputs.attention_mask
    
    # Generate tokens and decode to text
    # Generate with explicit language setting and attention mask
    predicted_ids = model.generate(
        input_features,
        attention_mask=attention_mask,
        language="en",  # Set English explicitly
        task="transcribe"  # Explicitly set to transcription task
    )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    
    return transcription[0]

In [11]:
DURATION = 5
SAMPLE_RATE = 16000

try:
    # Record audio
    audio_array = record_audio(DURATION, SAMPLE_RATE)
    
    # Transcribe audio
    print("Transcribing...")
    text = transcribe_audio(audio_array, SAMPLE_RATE)
    
    print("\nTranscription:")
    print(text)
    
except Exception as e:
    print(f"An error occurred: {str(e)}")

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:
 Turn on the lamp.


In [12]:
text

' Turn on the lamp.'

In [6]:
import sounddevice as sd
import numpy as np
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from IPython.display import Audio

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    # Process audio with explicit language setting and attention mask
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    # Generate with explicit settings
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro"""
    # Initialize Kokoro pipeline
    pipeline = KPipeline(lang_code='a')  # American English
    
    # Generate audio
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    # Get the first (and only) audio segment
    for _, _, audio in generator:
        return Audio(data=audio, rate=24000, autoplay=True)

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Convert transcription to speech and play it
        print("\nGenerating speech...")
        audio_output = speak_text(transcribed_text)
        
        # Display the audio widget (if in Jupyter notebook)
        try:
            from IPython.display import display
            display(audio_output)
        except:
            print("Unable to display audio widget. Are you running in a Jupyter notebook?")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Hi, what's going on? I'm Irma. How are you?

Generating speech...


  WeightNorm.apply(module, name, dim)


af_heart.pt:   0%|          | 0.00/523k [00:00<?, ?B/s]

In [3]:
pipeline = KPipeline(lang_code='a') # <= make sure lang_code matches voice

In [5]:
# Generate audio
generator = pipeline(
    "Hi, what's going on",
    voice='af_heart',
    speed=1,
    split_pattern=r'\n+'
)

In [8]:
import sounddevice as sd
import numpy as np
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from IPython.display import Audio

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    # Process audio with explicit language setting and attention mask
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    # Generate with explicit settings
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    # Initialize Kokoro pipeline
    pipeline = KPipeline(lang_code='a')  # American English
    
    # Generate audio
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    # Get the first (and only) audio segment and play it immediately
    for _, _, audio in generator:
        # Play audio directly using sounddevice
        sd.play(audio, samplerate=24000)
        sd.wait()  # Wait until audio is finished playing

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Convert transcription to speech and play it automatically
        print("\nGenerating and playing speech...")
        speak_text(transcribed_text)
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Please drive slowly.

Generating and playing speech...


In [13]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama
import json

class DrivingCommands:
    def __init__(self):
        self.throttle = 0
        self.brake = 0
        self.steer = 0
        self.left_signal = 0
        self.right_signal = 0

    def to_dict(self):
        return {
            "Throttle": self.throttle,
            "Brake": self.brake,
            "Steer": self.steer,
            "LeftSignal": self.left_signal,
            "RightSignal": self.right_signal
        }

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def extract_driving_commands(text):
    """Extract driving commands using Ollama"""
    llm = Ollama(model="deepseek-r1:latest")
    
    system_prompt = """
    You are a driving command interpreter. Convert natural language commands into specific control values.
    Output should be JSON with these possible fields (only include relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)

    Examples:
    "drive slowly" -> {"Throttle": 20}
    "brake immediately" -> {"Brake": 100}
    "you're too far left" -> {"Steer": 10}
    "turn on left signal" -> {"LeftSignal": 1}
    "drive at 50 kph" -> {"Throttle": 50}
    "brake 75 percent" -> {"Brake": 75}
    "steer 20 degrees right" -> {"Steer": 20}
    
    Only output the JSON, nothing else.
    THINK FAST, DON'T OVERTHINK IT!
    """
    
    prompt = f"Convert this driving command to control values: '{text}'"
    
    response = llm.invoke(system_prompt + "\n" + prompt)
    
    try:
        # Try to find JSON-like structure in the response
        start_idx = response.find('{')
        end_idx = response.rfind('}') + 1
        if start_idx >= 0 and end_idx > start_idx:
            json_str = response[start_idx:end_idx]
            return json.loads(json_str)
        return {}
    except:
        print("Error parsing LLM response")
        return {}

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Extract driving commands
        print("\nExtracting driving commands...")
        commands = extract_driving_commands(transcribed_text)
        print("Driving commands:", commands)
        
        # Generate confirmation speech
        confirmation = "Executing commands: "
        for key, value in commands.items():
            confirmation += f"{key} {value}, "
        
        # Speak the confirmation
        print("\nGenerating and playing speech...")
        speak_text(confirmation)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  10 degrees to practice

Extracting driving commands...
Error parsing LLM response
Driving commands: {}

Generating and playing speech...


  WeightNorm.apply(module, name, dim)


In [14]:
llm = Ollama(model="deepseek-r1:latest")
text = "10 degrees left"
    
system_prompt = """
You are a driving command interpreter. Convert natural language commands into specific control values.
Output should be JSON with these possible fields (only include relevant ones):
- Throttle (0-100)
- Brake (0-100)
- Steer (-45 to 45, negative for left, positive for right)
- LeftSignal (0 or 1)
- RightSignal (0 or 1)

Examples:
"drive slowly" -> {"Throttle": 20}
"brake immediately" -> {"Brake": 100}
"you're too far left" -> {"Steer": 10}
"turn on left signal" -> {"LeftSignal": 1}
"drive at 50 kph" -> {"Throttle": 50}
"brake 75 percent" -> {"Brake": 75}
"steer 20 degrees right" -> {"Steer": 20}

Only output the JSON, nothing else.
THINK FAST, DON'T OVERTHINK IT!
"""

prompt = f"Convert this driving command to control values: '{text}'"

response = llm.invoke(system_prompt + "\n" + prompt)

In [15]:
response

'<think>\nOkay, so I need to figure out how to convert the command "10 degrees left" into the appropriate control values. Let me break it down step by step.\n\nFirst, looking at the examples provided, I see that commands related to steering are mapped to the "Steer" field, which ranges from -45 to 45. Positive values mean turning right, and negative values mean turning left. \n\nIn this case, the command is "10 degrees left." The key here is "left," which indicates a turn to the left. Since it\'s specified as an angle of 10 degrees, I should represent that numerically. Because left turns are indicated by negative values in the Steer field, -10 would make sense.\n\nI don\'t see any mention of throttle or brake in this command, so those fields can remain at 0. Similarly, there\'s no indication of signals (left or right), so LeftSignal and RightSignal should both be 0.\n\nPutting it all together, the JSON output should have "Steer" set to -10 and the other fields as 0 where applicable.\n<

In [17]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama
import json

class DrivingCommands:
    def __init__(self):
        self.throttle = 0
        self.brake = 0
        self.steer = 0
        self.left_signal = 0
        self.right_signal = 0

    def to_dict(self):
        return {
            "Throttle": self.throttle,
            "Brake": self.brake,
            "Steer": self.steer,
            "LeftSignal": self.left_signal,
            "RightSignal": self.right_signal
        }

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def extract_driving_commands(text):
    """Extract driving commands using Ollama"""
    llm = Ollama(model="deepseek-r1:latest")
    
    system_prompt = """You are a driving command interpreter. Your task is to output ONLY a JSON object with driving control values.

    Rules:
    1. Output ONLY valid JSON, no explanations or thinking
    2. Use these fields only when relevant:
       - Throttle (0-100)
       - Brake (0-100)
       - Steer (-45 to 45, negative for left, positive for right)
       - LeftSignal (0 or 1)
       - RightSignal (0 or 1)
    3. NEVER include any text before or after the JSON
    4. Process commands instantly without overthinking

    Examples:
    Input: "drive slowly" 
    Output: {"Throttle": 20}

    Input: "brake immediately"
    Output: {"Brake": 100}

    Input: "10 degrees left"
    Output: {"Steer": -10}

    Input: "turn on left signal"
    Output: {"LeftSignal": 1}
    """
    
    prompt = f"Convert this driving command to control values: '{text}'"
    
    response = llm.invoke(system_prompt + "\n" + prompt)
    
    try:
        # Clean up the response by finding the JSON part
        response_text = response.strip()
        
        # If response contains any JSON-like structure, extract it
        if '{' in response_text and '}' in response_text:
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1
            json_str = response_text[start_idx:end_idx]
            
            # Parse the JSON and validate the values
            parsed = json.loads(json_str)
            
            # Ensure values are in correct ranges
            if 'Throttle' in parsed:
                parsed['Throttle'] = max(0, min(100, parsed['Throttle']))
            if 'Brake' in parsed:
                parsed['Brake'] = max(0, min(100, parsed['Brake']))
            if 'Steer' in parsed:
                parsed['Steer'] = max(-45, min(45, parsed['Steer']))
            if 'LeftSignal' in parsed:
                parsed['LeftSignal'] = 1 if parsed['LeftSignal'] else 0
            if 'RightSignal' in parsed:
                parsed['RightSignal'] = 1 if parsed['RightSignal'] else 0
                
            return parsed
        
        print("No valid JSON found in response:", response_text)
        return {}
    except Exception as e:
        print(f"Error parsing LLM response: {str(e)}")
        print("Raw response:", response_text)
        return {}

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Extract driving commands
        print("\nExtracting driving commands...")
        commands = extract_driving_commands(transcribed_text)
        print("Driving commands:", commands)
        
        # Generate confirmation speech
        confirmation = "Executing commands: "
        for key, value in commands.items():
            confirmation += f"{key} {value}, "
        
        # Speak the confirmation
        print("\nGenerating and playing speech...")
        speak_text(confirmation)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Drive 10 degrees to left.

Extracting driving commands...
Error parsing LLM response: Extra data: line 1 column 17 (char 16)
Raw response: <think>
Alright, let's tackle this problem. The user has given me a command and wants me to convert it into specific JSON output based on certain rules. First, I need to understand the input and what each part means.

The command is "Drive 10 degrees to left." So, breaking it down, the main action here is driving, but with a direction specified—left. The phrase "10 degrees" probably refers to how much to turn the steering wheel. 

Looking at the rules, I'm only supposed to use specific fields: Throttle, Brake, Steer, LeftSignal, and RightSignal. Since the command mentions turning left, that points towards the Steer field. Steer can take values from -45 to 45, with negative being left and positive right. So 10 degrees to the left would translate to a value of -10.

I shou

  WeightNorm.apply(module, name, dim)


In [19]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama
import json

class DrivingCommands:
    def __init__(self):
        self.throttle = 0
        self.brake = 0
        self.steer = 0
        self.left_signal = 0
        self.right_signal = 0

    def to_dict(self):
        return {
            "Throttle": self.throttle,
            "Brake": self.brake,
            "Steer": self.steer,
            "LeftSignal": self.left_signal,
            "RightSignal": self.right_signal
        }

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def extract_driving_commands(text):
    """Extract driving commands using Ollama"""
    llm = Ollama(model="deepseek-r1:latest")
    
    system_prompt = """You are a JSON-only driving command interpreter. ONLY OUTPUT VALID JSON - NO OTHER TEXT ALLOWED.

    Valid fields (include only relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)

    Input: "drive slowly" 
    {"Throttle": 20}

    Input: "brake immediately"
    {"Brake": 100}

    Input: "10 degrees left"
    {"Steer": -10}

    Input: "turn on left signal"
    {"LeftSignal": 1}

    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!"""
    
    prompt = f"Convert to JSON ONLY: '{text}'"
    
    prompt = f"Convert this driving command to control values: '{text}'"
    
    response = llm.invoke(system_prompt + "\n" + prompt)
    
    try:
        # Clean up the response by finding the JSON part
        response_text = response.strip()
        
        # If response contains any JSON-like structure, extract it
        if '{' in response_text and '}' in response_text:
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1
            json_str = response_text[start_idx:end_idx]
            
            # Parse the JSON and validate the values
            parsed = json.loads(json_str)
            
            # Ensure values are in correct ranges
            if 'Throttle' in parsed:
                parsed['Throttle'] = max(0, min(100, parsed['Throttle']))
            if 'Brake' in parsed:
                parsed['Brake'] = max(0, min(100, parsed['Brake']))
            if 'Steer' in parsed:
                parsed['Steer'] = max(-45, min(45, parsed['Steer']))
            if 'LeftSignal' in parsed:
                parsed['LeftSignal'] = 1 if parsed['LeftSignal'] else 0
            if 'RightSignal' in parsed:
                parsed['RightSignal'] = 1 if parsed['RightSignal'] else 0
                
            return parsed
        
        print("No valid JSON found in response:", response_text)
        return {}
    except Exception as e:
        print(f"Error parsing LLM response: {str(e)}")
        print("Raw response:", response_text)
        return {}

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Extract driving commands
        print("\nExtracting driving commands...")
        commands = extract_driving_commands(transcribed_text)
        print("Driving commands:", commands)
        
        # Generate confirmation speech
        confirmation = "Executing commands: "
        for key, value in commands.items():
            confirmation += f"{key} {value}, "
        
        # Speak the confirmation
        print("\nGenerating and playing speech...")
        speak_text(confirmation)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Drive 10 degrees to the left.

Extracting driving commands...
Error parsing LLM response: Extra data: line 1 column 15 (char 14)
Raw response: <think>
Okay, so I need to figure out how to convert the driving command "Drive 10 degrees to the left." into the correct JSON control values. Let me break down the problem step by step.

First, I'll look at the input command: "Drive 10 degrees to the left." The goal is to translate this into a JSON object with specific fields as outlined. The valid fields are Throttle, Brake, Steer, LeftSignal, and RightSignal.

The command mentions driving to the left, which directly relates to the 'Steer' field. Since it's 10 degrees to the left, I should set the Steer value accordingly. From the examples given earlier, when someone says "10 degrees left," the output was {"Steer": -10}. So applying that logic here, since moving left is negative, Steer would be -10.

Now, looking a

  WeightNorm.apply(module, name, dim)


In [21]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def get_llm_response(text):
    """Get driving commands interpretation from LLM"""
    llm = Ollama(model="deepseek-r1:latest")
    
    system_prompt = """You are a JSON-only driving command interpreter. ONLY OUTPUT VALID JSON - NO OTHER TEXT ALLOWED.

    Valid fields (include only relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)

    Input: "drive slowly" 
    {"Throttle": 20}

    Input: "brake immediately"
    {"Brake": 100}

    Input: "10 degrees left"
    {"Steer": -10}

    Input: "turn on left signal"
    {"LeftSignal": 1}

    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!"""
    
    prompt = f"Convert to JSON ONLY: '{text}'"
    return llm.invoke(system_prompt + "\n" + prompt)

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Get LLM response
        print("\nGetting LLM response...")
        response = get_llm_response(transcribed_text)
        print("Response:", response)
        
        # Speak the response
        print("\nGenerating and playing speech...")
        speak_text(response)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Drive 10 degrees to left

Getting LLM response...
Response: <think>
Okay, let's see. The user has given me a command to interpret and output only valid JSON based on their instructions. They mentioned that the input is "Drive 10 degrees to left" and provided an example response of {"Steer": -10}.

First, I need to parse the input correctly. The phrase "Drive 10 degrees to left" contains a command related to steering. Breaking it down, the user wants to steer 10 degrees to the left. 

Looking at the valid fields, Steer is allowed between -45 and 45, with negative values indicating left turns. So, steering left by 10 degrees translates directly to setting the Steer field to -10.

I should make sure that only the relevant field is included in the JSON output without any extra text or other fields like Throttle, Brake, etc., as per their instructions.

So, the correct JSON response would be {"Steer": -10} since

  WeightNorm.apply(module, name, dim)


In [24]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def get_llm_response(text):
    """Get driving commands interpretation from LLM"""
    llm = Ollama(model="deepseek-r1:latest")
    
    system_prompt = """You are a JSON-only driving command interpreter. ONLY OUTPUT VALID JSON - NO OTHER TEXT ALLOWED.

    Valid fields (include only relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)

    Input: "drive slowly" 
    {"Throttle": 20}

    Input: "brake immediately"
    {"Brake": 100}

    Input: "10 degrees left"
    {"Steer": -10}

    Input: "turn on left signal"
    {"LeftSignal": 1}

    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!"""
    
    prompt = f"Convert to JSON ONLY: '{text}'"
    return llm.invoke(system_prompt + "\n" + prompt)

def get_clean_response(response):
    """Extract JSON part from response, removing thinking section"""
    if '{' in response and '}' in response:
        start = response.find('{')
        end = response.rfind('}') + 1
        return response[start:end]
    return response

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Get LLM response
        print("\nGetting LLM response...")
        full_response = get_llm_response(transcribed_text)
        print("Full response:", full_response)
        
        # Clean response and speak only the JSON part
        clean_response = get_clean_response(full_response)
        print("\nSpeaking clean response:", clean_response)
        speak_text(clean_response)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  usual driving 10 degrees to left

Getting LLM response...
Full response: <think>
Okay, so the user has given me a query where I need to interpret a driving command and output only valid JSON. Let's see what they provided.

First, they mentioned that I should only output JSON without any other text. The valid fields are Throttle (0-100), Brake (0-100), Steer (-45 to 45), LeftSignal (0 or 1), and RightSignal (0 or 1).

Looking at the examples they gave, each input corresponds directly to a JSON object with specific fields. For instance, "drive slowly" translates to {"Throttle": 20}, which makes sense because throttle is set to a lower value for slow driving.

The next input is "brake immediately", leading to {"Brake": 100}. That's straightforward since the brake field takes values from 0-100, so full braking would be 100.

Then there's "10 degrees left" resulting in {"Steer": -10}. Steer can range from -45 to

  WeightNorm.apply(module, name, dim)


KeyboardInterrupt: 

In [26]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def get_llm_response(text):
    """Get driving commands interpretation from LLM"""
    llm = Ollama(model="deepseek-r1:latest")
    
    system_prompt = """You are a JSON-only driving command interpreter. ONLY OUTPUT VALID JSON - NO OTHER TEXT ALLOWED.

    Valid fields (include only relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)

    Input: "drive slowly" 
    {"Throttle": 20}

    Input: "brake immediately"
    {"Brake": 100}

    Input: "10 degrees left"
    {"Steer": -10}

    Input: "turn on left signal"
    {"LeftSignal": 1}

    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!"""
    
    prompt = f"Convert to JSON ONLY: '{text}'"
    return llm.invoke(system_prompt + "\n" + prompt)

def get_clean_response(response):
    """Get response after </think> tag"""
    if '</think>' in response:
        return response.split('</think>')[-1].strip()
    return response

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Get LLM response
        print("\nGetting LLM response...")
        full_response = get_llm_response(transcribed_text)
        print("Full response:", full_response)
        
        # Clean response and speak only the JSON part
        clean_response = get_clean_response(full_response)
        print("\nSpeaking clean response:", clean_response)
        speak_text(clean_response)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Please break immediately.

Getting LLM response...
Full response: <think>
Okay, so I need to figure out how to respond to the user's command. The user wrote, "Please break immediately." which means they want me to brake fully. Looking at the valid fields provided, Brake is a field that can take values from 0 to 100. 

I remember that in previous interactions, when someone asked to brake, I set Brake to 100, like in the example where the input was "brake immediately" and the output was {"Brake": 100}. So following that pattern, since the user is asking to break immediately now, the appropriate action would be to set Brake to its maximum value.

I should make sure not to include any other fields because the input only mentions breaking. There's no mention of throttle, steer, signals, etc., so those fields don't need to be included in the JSON response. 

So putting it all together, the correct JSON output wou

  WeightNorm.apply(module, name, dim)


In [29]:
llm = Ollama(model="deepseek-r1:1.5b")
text = "please brake immediately"
    
system_prompt = """
You are a driving command interpreter. Convert natural language commands into specific control values.
Output should be JSON with these possible fields (only include relevant ones):
- Throttle (0-100)
- Brake (0-100)
- Steer (-45 to 45, negative for left, positive for right)
- LeftSignal (0 or 1)
- RightSignal (0 or 1)

Examples:
"drive slowly" -> {"Throttle": 20}
"brake immediately" -> {"Brake": 100}
"you're too far left" -> {"Steer": 10}
"turn on left signal" -> {"LeftSignal": 1}
"drive at 50 kph" -> {"Throttle": 50}
"brake 75 percent" -> {"Brake": 75}
"steer 20 degrees right" -> {"Steer": 20}

Only output the JSON, nothing else.
THINK FAST, DON'T OVERTHINK IT!
"""

prompt = f"Convert this driving command to control values: '{text}'"

response = llm.invoke(system_prompt + "\n" + prompt)

In [30]:
response

'<think>\nOkay, so I need to convert the natural language command "please brake immediately" into specific control values. Let me start by understanding what each part of the command means and how it translates to the given fields.\n\nFirst, the user mentioned that the output should be JSON with certain fields only. The possible fields are Throttle (0-100), Brake (0-100), Steer (-45 to 45, left is negative, right positive), LeftSignal (0 or 1), and RightSignal (0 or 1). So I need to identify which of these values apply here.\n\nThe command given is "please brake immediately." Let me break this down. The main part of the sentence seems to be "brake immediately." So the action being taken is to brake, right? Now, looking at the Brake field, it needs to be set between 0 and 100. Since the user is asking to brake immediately, I can assume they\'re wanting a moderate braking force. Maybe 75% would make sense because "brake 75 percent" was provided as an example earlier.\n\nWait, but in that

In [1]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def get_llm_response(text):
    """Get driving commands interpretation from LLM"""
    llm = Ollama(model="deepseek-r1:1.5b")
    
    system_prompt = """You are a JSON-only driving command interpreter. ONLY OUTPUT VALID JSON - NO OTHER TEXT ALLOWED.

    Valid fields (include only relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)

    Input: "drive slowly" 
    {"Throttle": 20}

    Input: "brake immediately"
    {"Brake": 100}

    Input: "10 degrees left"
    {"Steer": -10}

    Input: "turn on left signal"
    {"LeftSignal": 1}

    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!"""
    
    prompt = f"Convert to JSON ONLY: '{text}'"
    return llm.invoke(system_prompt + "\n" + prompt)

def get_clean_response(response):
    """Get response after </think> tag"""
    if '</think>' in response:
        return response.split('</think>')[-1].strip()
    return response

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Get LLM response
        print("\nGetting LLM response...")
        full_response = get_llm_response(transcribed_text)
        print("Full response:", full_response)
        
        # Clean response and speak only the JSON part
        clean_response = get_clean_response(full_response)
        print("\nSpeaking clean response:", clean_response)
        speak_text(clean_response)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.



Transcription:  Please lightly break

Getting LLM response...


  llm = Ollama(model="deepseek-r1:1.5b")


Full response: <think>
Alright, let me try to figure out how to interpret the input string "Please lightly break" into the corresponding JSON fields. 

First, I'll look at each of the valid fields available:

- Throttle (0-100)
- Brake (0-100)
- Steer (-45 to 45, negative for left, positive for right)
- LeftSignal (0 or 1)
- RightSignal (0 or 1)

Now, breaking down the input string "Please lightly break". The main action here is "break", which could relate to either brake pedal or perhaps braking in general. However, looking at the valid fields, there's no direct field for a command like "break" itself. Instead, there are commands like brake and steer.

The phrase "lightly" suggests that the action isn't very vigorous. Since brake can handle more forceful actions with its 0-100 range, using brake might be appropriate here to indicate a controlled break. 

Additionally, since it's about breaking, maybe the throttle could be set slightly to allow some movement without overstepping. Throt

  WeightNorm.apply(module, name, dim)


In [11]:
import sounddevice as sd
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from kokoro import KPipeline
from langchain_community.llms import Ollama

def record_audio(duration, sample_rate=16000):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio = sd.rec(int(duration * sample_rate),
                  samplerate=sample_rate,
                  channels=1,
                  dtype=np.float32)
    sd.wait()
    print("Recording finished!")
    return audio.flatten()

def transcribe_audio(audio_array, sample_rate=16000):
    """Transcribe audio using Whisper"""
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    
    inputs = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    predicted_ids = model.generate(
        inputs.input_features,
        attention_mask=inputs.attention_mask,
        language="en",
        task="transcribe"
    )
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def speak_text(text):
    """Convert text to speech using Kokoro and play it immediately"""
    pipeline = KPipeline(lang_code='a')
    generator = pipeline(
        text,
        voice='af_heart',
        speed=1,
        split_pattern=r'\n+'
    )
    
    for _, _, audio in generator:
        sd.play(audio, samplerate=24000)
        sd.wait()

def get_llm_response(text):
    """Get driving commands interpretation from LLM"""
    llm = Ollama(model="deepseek-r1:1.5b")
    
    system_prompt = """You are a JSON-only driving command interpreter. ONLY OUTPUT VALID JSON - NO OTHER TEXT ALLOWED.

    Valid fields (include only relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)
    DON'T ADD ANY OTHER FIELDS. JUST CORRELATE THE COMMANDS TO THE FIELDS.
    BECAUSE MAYBE THE INPUT TEXT CONTAINS TYPOS

    Input: "drive slowly" 
    {"Throttle": 20}

    Input: "drive faster" 
    {"Throttle": 30-40} Choose any value between 30-40, don't write 30-40 but choose one value, for example 35

    Input: "brake immediately"
    {"Brake": 100}

    Input: "brake lightly"
    {"Brake": 20-30} Choose any value between 20-30, don't write 20-30 but choose one value, for example 25

    Input: "10 degrees left"
    {"Steer": -10}

    Input: "turn on left signal"
    {"LeftSignal": 1}

    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!
    JUST OUTPUT ONE COMMAND PER INPUT. I MEAN {"COMMAND": VALUE} ONLY.
    COMMAND IS JUST THROTTLE, BRAKE, STEER, LEFTSIGNAL, OR RIGHTSIGNAL.
    IF YOU CAN'T FIND ANY COMMAND, JUST OUTPUT AN EMPTY JSON {}
    """
    
    prompt = f"Convert to JSON ONLY: '{text}'"
    return llm.invoke(system_prompt + "\n" + prompt)

def get_clean_response(response):
    """Get response after </think> tag"""
    if '</think>' in response:
        return response.split('</think>')[-1].strip()
    return response

def main():
    try:
        # Record audio (5 seconds)
        audio_array = record_audio(5)
        
        # Transcribe the audio
        print("Transcribing...")
        transcribed_text = transcribe_audio(audio_array)
        print("\nTranscription:", transcribed_text)
        
        # Get LLM response
        print("\nGetting LLM response...")
        full_response = get_llm_response(transcribed_text)
        print("Full response:", full_response)
        
        # Clean response and speak only the JSON part
        clean_response = get_clean_response(full_response)
        print("\nSpeaking clean response:", clean_response)
        speak_text(clean_response)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Recording for 5 seconds...
Recording finished!
Transcribing...

Transcription:  Please lightly break.

Getting LLM response...
Full response: <think>
Okay, let's see. I need to convert the input string "Please lightly break." into a JSON command format using the fields Throttle, Brake, Steer, LeftSignal, and RightSignal. 

First, I'll break down the sentence. The main action here is "break," which relates to the brake field. The phrase "lightly" indicates that it's not a full stop or a sudden impact but rather something subtle. So, I should use the Brake field with a value between 20-30 since it's a slight brake press.

Wait, let me check if there are other commands that might apply here. The input is "Please lightly break," which doesn't mention anything about steering or signals. It's specifically addressing the action of breaking. So, sticking with Brake makes sense because it's the direct command to brake as needed.

I don't see any clues about throttle, steer, or right signal in t

  WeightNorm.apply(module, name, dim)


In [13]:
from langchain_community.llms import Ollama
from PIL import Image
import base64
import io

def encode_image_to_base64(image_path):
    """Convert an image file to base64 string"""
    with Image.open(image_path) as img:
        # Convert to RGB if necessary
        if img.mode != 'RGB':
            img = img.convert('RGB')
        
        # Convert to base64
        buffered = io.BytesIO()
        img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        return img_str

def process_driving_command_from_image(image_path):
    """Process an image to extract driving commands"""
    
    # Initialize the model
    llm = Ollama(model="deepseek-r1:latest")
    
    # Encode the image
    base64_image = encode_image_to_base64(image_path)
    
    system_prompt = """
    You are a driving command interpreter that analyzes images. Convert visual information into specific control values.
    The image might contain road signs, traffic signals, obstacles, or text commands.
    
    Output should be JSON with these possible fields (only include relevant ones):
    - Throttle (0-100)
    - Brake (0-100)
    - Steer (-45 to 45, negative for left, positive for right)
    - LeftSignal (0 or 1)
    - RightSignal (0 or 1)
    
    Examples of image interpretations:
    - Red traffic light -> {"Brake": 100}
    - Stop sign -> {"Brake": 100}
    - Right turn ahead sign -> {"RightSignal": 1, "Steer": 45}
    - Speed limit 50 sign -> {"Throttle": 50}
    - Object blocking right side -> {"Steer": -20}
    
    Only output the JSON, nothing else.
    THINK FAST, DON'T OVERTHINK IT!
    """
    
    # Create the prompt with the image
    prompt = f"""
    Analyze this image and convert it to driving control values:
    <image>{base64_image}</image>
    """
    
    # Get the response
    response = llm.invoke(system_prompt + "\n" + prompt)
    return response

# Example usage
if __name__ == "__main__":
    image_path = "/home/irman/Book-Recomender-using-LLM/Screenshot from 2025-01-07 14-09-22.png"
    result = process_driving_command_from_image(image_path)
    print(result)

<think>
Alright, let me try to figure out what's going on with this image. So, I see a bunch of text here that looks like it might be part of some kind of puzzle or maybe even an image generated from text using something like ASCII art. The user has provided the text as a base64 encoded string, which probably represents an image when decoded.

Looking at the text itself, there are mentions of "81j/4R81j/4R", "KKK", and "0ooooo0". Hmm, that makes me think it's some sort of grid or pattern. Maybe a maze? Or perhaps a hidden message where certain characters represent walls or paths.

I also notice some repeated phrases like "UUU" and "RRR", which might be part of a pattern as well. The text seems to have multiple lines with varying amounts of spacing, so maybe it's structured in sections that need to be aligned correctly.

Since the user mentioned ASCII art, I should probably convert this base64 string into an image. Let me try to decode it first. Base64 decoding involves taking the strin

In [16]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


In [19]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

class DrivingCommandInterpreter:
    def __init__(self):
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Initialize processor and model
        self.processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
        self.model = AutoModelForVision2Seq.from_pretrained(
            "HuggingFaceTB/SmolVLM-256M-Instruct",
            torch_dtype=torch.bfloat16,
        ).to(self.DEVICE)

    def process_image(self, image_path):
        """Process an image to extract driving commands"""
        # Load image
        if isinstance(image_path, str):
            image = load_image(image_path)
        else:
            image = image_path  # Assume it's already a PIL Image

        # Create input messages with driving command context
        # messages = [
        #     {
        #         "role": "user",
        #         "content": [
        #             {"type": "image"},
        #             {"type": "text", "text": """
        #             Analyze this image and output driving commands as JSON.
        #             Only include relevant fields from:
        #             - Throttle (0-100)
        #             - Brake (0-100)
        #             - Steer (-45 to 45, negative for left, positive for right)
        #             - LeftSignal (0 or 1)
        #             - RightSignal (0 or 1)
                    
        #             Examples:
        #             - Red light -> {"Brake": 100}
        #             - Stop sign -> {"Brake": 100}
        #             - Right turn ahead -> {"RightSignal": 1, "Steer": 45}
        #             - Speed limit 50 -> {"Throttle": 50}
        #             - Obstacle on right -> {"Steer": -20}
                    
        #             Output ONLY the JSON, nothing else.
        #             """}
        #         ]
        #     },
        # ]

        messages = "what is in that image? what is the minimum distance?"

        # Prepare inputs
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
        inputs = inputs.to(self.DEVICE)

        # Generate outputs
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=100)
        
        generated_text = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]

        return generated_text

# Example usage
if __name__ == "__main__":
    interpreter = DrivingCommandInterpreter()
    
    # Example with a local image
    result = interpreter.process_image("/home/irman/Book-Recomender-using-LLM/Screenshot from 2025-01-07 14-09-22.png")
    print(result)
    
    # Example with a URL
    # result = interpreter.process_image("https://example.com/traffic_sign.jpg")
    # print(result)

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


UndefinedError: 'str object' has no attribute 'content'

In [2]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

class DrivingCommandInterpreter:
    def __init__(self):
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Initialize processor and model
        self.processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
        self.model = AutoModelForVision2Seq.from_pretrained(
            "HuggingFaceTB/SmolVLM-Instruct",
            torch_dtype=torch.bfloat16,
        ).to(self.DEVICE)

    def process_image(self, image_path):
        """Process an image to extract driving commands"""
        # Load image
        if isinstance(image_path, str):
            image = load_image(image_path)
        else:
            image = image_path  # Assume it's already a PIL Image

        # Create input messages with proper structure
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": """
                    what is in that image? what is the minimum distance?

                    from the image and distance, determine the driving commands.
                    Valid fields (include only relevant ones):
                    - Throttle (0-100)
                    - Brake (0-100)
                    - Steer (-45 to 45, negative for left, positive for right)
                    
                    DON'T ADD ANY OTHER FIELDS. JUST CORRELATE THE COMMANDS TO THE FIELDS.
                    BECAUSE MAYBE THE INPUT TEXT CONTAINS TYPOS

                    Input: "drive slowly" 
                    {"Throttle": 20}

                    Input: "brake immediately"
                    {"Brake": 100}

                    Input: "10 degrees left"
                    {"Steer": -10}

                    REMEMBER: OUTPUT ONLY JSON, NO THINKING, NO EXPLANATIONS!
                     
                    JUST OUTPUT ONE COMMAND PER INPUT. I MEAN {"COMMAND": VALUE} ONLY.
                    COMMAND IS JUST THROTTLE, BRAKE, STEER.
                     """}
                ]
            }
        ]

        # Prepare inputs
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
        inputs = inputs.to(self.DEVICE)

        # Generate outputs
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=100)
        
        generated_text = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]

        return generated_text

# Example usage
if __name__ == "__main__":
    interpreter = DrivingCommandInterpreter()
    
    # Example with a local image
    result = interpreter.process_image("/home/irman/Book-Recomender-using-LLM/Screenshot from 2025-01-07 14-09-22.png")
    print(result)

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
