In [None]:
from llama_cpp import Llama

In [None]:
# Earlier attempt, kept for reference: the base (non-instruct) Mistral-7B checkpoint from a sibling llama.cpp checkout.
# llm = Llama(model_path="../llama.cpp/models/Mistral-7B-v0.1/mistral-7b-v0.1.q5-k-m.gguf", n_threads=4, verbose=True)
# Load the Mistral-7B-Instruct GGUF file from the local models/ directory, using 4 threads and verbose output.
llm = Llama(model_path="models/mistral-7b-instruct-v0.1.gguf", n_threads=4, verbose=True)

In [None]:
# Plain completion smoke test: at most 32 new tokens, no stop sequences.
# NOTE(review): echo=True presumably makes llama-cpp include the prompt in the returned text -- confirm against the API docs.
output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=[], echo=True)

In [None]:
# Show the completion object returned by the previous cell as this cell's output.
output

In [None]:
import os
import shlex

# Speak two phrases through the `say` command-line tool.
#
# Each phrase is wrapped with shlex.quote() so the shell receives it as one
# literal word. Without quoting, characters such as the "?" below (a glob
# pattern) or an apostrophe in generated text would be interpreted by the
# shell instead of being spoken.
greeting = "Hey, I am a llama. I am a language model that can generate text."
os.system(f"say {shlex.quote(greeting)}")

vocal_output = "What can I do for you?"

# Last expression of the cell: the exit status returned by os.system().
os.system(f"say {shlex.quote(vocal_output)}")

In [None]:
# Scratch cell: prints a fixed string (presumably a quick check that the kernel is responsive).
print("hey")

Ideas:
We want to talk to this AI, right? However, we don't want to have to wait a whopping 30 seconds every time we say something.
The first step is to detect when the user actually expects an interaction. We could feed just the transcribed text to Mistral, or ideally to a much lighter model, to see whether it's a question or a command.
If it is, then we feed it to a larger model that is going to think. And then, if an action is required, we're going to transfer to a model that is going to do the action.

Okay, so here's what we can do. Every 5 seconds, we run the fastest Whisper wrapper we have over the last 20-25 seconds of audio spoken by the user, and then we pass the text through a small LLM to see whether it's a question or a command. Say... a quantized Mistral? Or what other, smaller model could we use? Let's check the leaderboard.

In [None]:
from llama_cpp import Llama

# Rebind `llm` to the instruct model for the classification experiments below, this time with 6 threads and verbose output off.
llm = Llama(model_path="models/mistral-7b-instruct-v0.1.gguf", n_threads=6, verbose=False)

In [None]:
def is_instruction(text, completion_fn=None):
    """Ask the LLM whether ``text`` contains a direct instruction or request.

    Args:
        text: Transcript to classify. It is embedded verbatim, wrapped in
            single quotes, in the prompt.
        completion_fn: Optional callable that takes the prompt string and
            returns a mapping shaped like ``{"choices": [{"text": ...}]}``.
            When omitted, the notebook-level ``llm`` object is used.

    Returns:
        ``True`` or ``False`` according to whichever of the literal words
        "True" / "False" appears first in the reply, or ``None`` when the
        reply contains neither word.
    """
    llm_input = (
        "[INST]You are an AI that's designed to recognize when a user is calling you for assistance. If they don't ask you, it's NOT a request or instruction. "
        "Determine if the following transcript contains a direct instruction or request for your help:\n\n"
        f"'{text}'"
        "\n\nReply with 'True' if there's a direct instruction or request, and 'False' if not. Please be exact in your response.[/INST]"
    )

    if completion_fn is None:
        completion_fn = llm  # notebook-level model instance

    reply = completion_fn(llm_input)["choices"][0]["text"]

    # The reply may contain an explanation after the verdict (for example
    # "False. It is not True that ..."), so a plain `"True" in reply` test can
    # pick up the wrong word. Use whichever verdict word occurs first.
    true_at = reply.find("True")
    false_at = reply.find("False")
    if true_at < 0 and false_at < 0:
        return None
    if false_at < 0:
        return True
    if true_at < 0:
        return False
    return true_at < false_at

# Try the classifier on a direct request.
is_instruction("Can you show me my latest emails?")

In [None]:
def is_instruction_2(text, completion_fn=None):
    """Ask the LLM whether ``text`` is a request explicitly addressed to "Jarvis".

    The prompt tells the model that a call for assistance must explicitly
    address it by the name 'Jarvis'.

    Args:
        text: Transcript to classify. It is embedded verbatim, wrapped in
            single quotes, in the prompt.
        completion_fn: Optional callable that takes the prompt string and
            returns a mapping shaped like ``{"choices": [{"text": ...}]}``.
            When omitted, the notebook-level ``llm`` object is used.

    Returns:
        ``True`` or ``False`` according to whichever of the literal words
        "True" / "False" appears first in the reply, or ``None`` when the
        reply contains neither word.
    """
    # The first sentence now ends with a period; it used to run straight into
    # "Determine if ...", which produced a garbled instruction.
    llm_input = (
        "[INST]You are Jarvis, an AI that's designed to recognize when a user is calling you for assistance. A call for assistance must include an explicit call to you, your name is 'Jarvis'. "
        "Determine if the following transcript contains a direct instruction or request for your help:\n\n"
        f"'{text}'"
        "\n\nReply with 'True' if there's a direct instruction or request, and 'False' if not. Please be exact in your response.[/INST]"
    )

    if completion_fn is None:
        completion_fn = llm  # notebook-level model instance

    reply = completion_fn(llm_input)["choices"][0]["text"]

    # The reply may contain an explanation after the verdict (for example
    # "False. It is not True that ..."), so a plain `"True" in reply` test can
    # pick up the wrong word. Use whichever verdict word occurs first.
    true_at = reply.find("True")
    false_at = reply.find("False")
    if true_at < 0 and false_at < 0:
        return None
    if false_at < 0:
        return True
    if true_at < 0:
        return False
    return true_at < false_at

In [None]:
# Check a greeting that addresses the assistant by name.
is_instruction_2("Hello Jarvis, how are you?")

In [None]:
# Check a plain statement that contains no request.
is_instruction("I like trains.")

In [None]:
# Check a truncated fragment of ordinary conversation.
is_instruction("Yeah so I was on the phone with Phillis and")

Okay, good: so far, Mistral-Instruct-Q5_K_M seems to work well for this! Step 1 is complete. Now we need some speech recognition.

In [None]:
from faster_whisper import WhisperModel
import os

# Environment switches for the native libraries used by faster-whisper:
# CT2_VERBOSE=1 requests CTranslate2 log output, and KMP_DUPLICATE_LIB_OK=TRUE
# is set alongside it (NOTE(review): presumably to tolerate a duplicate OpenMP
# runtime in this environment -- confirm).
os.environ.update({"CT2_VERBOSE": "1", "KMP_DUPLICATE_LIB_OK": "TRUE"})

# Load the "medium.en" Whisper model on the CPU with compute_type="int8".
model_size = "medium.en"
model = WhisperModel(model_size, device="cpu", compute_type="int8")

In [None]:
# Transcribe the first sample clip.
# NOTE(review): faster-whisper documents `segments` as a lazy generator, so the list() call is
# presumably where the decoding work actually happens -- confirm.
segments, info = model.transcribe("mp3s/1.mp3")
transcribed_segments = list(segments)
# Observed timings for 1.mp3:
# 10sec for 1.mp3 in float32
# 6sec in int8

In [None]:
# Transcribe the second sample clip; this rebinds `segments`, `info` and `transcribed_segments`.
# NOTE(review): faster-whisper documents `segments` as a lazy generator, so the list() call is
# presumably where the decoding work actually happens -- confirm.
segments, info = model.transcribe("mp3s/2.mp3")
transcribed_segments = list(segments)
# Observed timings for 2.mp3:
# 10sec for 2.mp3
# 6.7sec in int8

In [None]:
# Concatenate the text of every transcribed segment, separated by single
# spaces, then display the result as the cell output.
transcribed_text = " ".join(seg.text for seg in transcribed_segments)
transcribed_text

In [None]:
# Run the "Jarvis" classifier on the transcript built in the previous cell.
is_instruction_2(transcribed_text)
# Observed timing:
# 19sec for 2.mp3

Seems to work pretty well! Now we need to automate the audio collection.

In [None]:
import sounddevice as sd
import numpy as np
from pydub import AudioSegment

def record_audio(filename, duration, samplerate=44100):
    """Record mono 16-bit audio with sounddevice and export it as an MP3.

    Args:
        filename: Destination path passed to ``AudioSegment.export``.
        duration: Recording length in seconds.
        samplerate: Sampling rate in Hz (44100 by default).
    """
    print("Recording...")
    frame_count = int(samplerate * duration)
    samples = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype='int16')
    sd.wait()  # block until the capture has completed
    print("Recording finished.")

    # Bytes per sample of the recorded array; printed and reused below as the
    # sample width.
    bytes_per_sample = samples.dtype.itemsize
    print(bytes_per_sample)

    # Wrap the raw sample bytes in an AudioSegment so pydub can do the export.
    clip = AudioSegment(
        samples.tobytes(),
        frame_rate=samplerate,
        sample_width=bytes_per_sample,
        channels=1,
    )
    clip.export(filename, format="mp3", bitrate="128k")
    print(f"File saved as {filename}")

# Usage example:
record_audio("output.mp3", 2)  # Records for 2 seconds


In [None]:
import pygame.mixer
import time

def play_audio(filename):
    """Play the audio file at ``filename`` and block until playback has finished."""
    mixer = pygame.mixer
    mixer.init()  # set up the mixer module before loading anything
    mixer.music.load(filename)

    print(f"Playing {filename}...")
    mixer.music.play()

    # Poll every 100 ms so the call does not return while audio is still playing.
    while True:
        if not mixer.music.get_busy():
            break
        time.sleep(0.1)

    print("Playback finished.")

# Usage example: play back the file written by the recording cell.
play_audio("output.mp3")


In [None]:
from faster_whisper import WhisperModel
import os

# Add the CT2_VERBOSE=1 flag to the environment variables, together with
# KMP_DUPLICATE_LIB_OK=TRUE.
os.environ["CT2_VERBOSE"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

model_size = "medium.en"

whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

# Transcribe with `whisper`, the instance created in this cell. Calling
# `model.transcribe` here only works if an earlier cell happened to leave a
# `model` variable behind, and fails with NameError on a fresh kernel.
segments, info = whisper.transcribe("output.mp3")
transcribed_segments = list(segments)
# Join the per-segment texts with single spaces and display the transcript.
transcribed_text = " ".join([segment.text for segment in transcribed_segments])
transcribed_text

Alright, cool! Now all together!

In [42]:
from faster_whisper import WhisperModel
import os
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import pygame.mixer
import time


# Set CT2_VERBOSE=1 and KMP_DUPLICATE_LIB_OK=TRUE in the process environment,
# then load the "medium.en" Whisper model on the CPU with compute_type="int8".
os.environ.update({"CT2_VERBOSE": "1", "KMP_DUPLICATE_LIB_OK": "TRUE"})
model_size = "medium.en"
whisper = WhisperModel(
    model_size,
    device="cpu",
    compute_type="int8",
)

def record_audio(filename, duration, samplerate=44100):
    """Record mono 16-bit audio with sounddevice and export it as an MP3.

    Args:
        filename: Destination path passed to ``AudioSegment.export``.
        duration: Recording length in seconds.
        samplerate: Sampling rate in Hz (44100 by default).
    """
    print("Recording...")
    frame_count = int(samplerate * duration)
    samples = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype='int16')
    sd.wait()  # block until the capture has completed
    print("Recording finished.")

    # Bytes per sample of the recorded array; printed and reused below as the
    # sample width.
    bytes_per_sample = samples.dtype.itemsize
    print(bytes_per_sample)

    # Wrap the raw sample bytes in an AudioSegment so pydub can do the export.
    clip = AudioSegment(
        samples.tobytes(),
        frame_rate=samplerate,
        sample_width=bytes_per_sample,
        channels=1,
    )
    clip.export(filename, format="mp3", bitrate="128k")
    print(f"File saved as {filename}")


def play_audio(filename):
    """Play the audio file at ``filename`` and block until playback has finished."""
    mixer = pygame.mixer
    mixer.init()  # set up the mixer module before loading anything
    mixer.music.load(filename)

    print(f"Playing {filename}...")
    mixer.music.play()

    # Poll every 100 ms so the call does not return while audio is still playing.
    while True:
        if not mixer.music.get_busy():
            break
        time.sleep(0.1)

    print("Playback finished.")


def transcribe_recording(filename):
    """Transcribe the audio file at ``filename`` and return the text as one string.

    Args:
        filename: Path of the audio file handed to ``whisper.transcribe``.

    Returns:
        The text of every transcribed segment, joined with single spaces.
    """
    # Use `whisper`, the model instance this cell creates. The name `model` is
    # only bound by an earlier cell, so relying on it raises NameError when
    # this cell is run on a fresh kernel.
    segments, _info = whisper.transcribe(filename)
    return " ".join(segment.text for segment in segments)

def record_and_transcribe(filename, duration):
    """Record ``duration`` seconds of audio into ``filename`` and return its transcript.

    Args:
        filename: Path the recording is written to and then transcribed from.
        duration: Recording length in seconds.

    Returns:
        The string produced by ``transcribe_recording(filename)``.
    """
    print(f"Recording for {duration} seconds...")
    record_audio(filename, duration)

    print("Recording finished. Now transcribing...")
    transcript = transcribe_recording(filename)
    return transcript

[2023-10-11 14:00:58.465] [ctranslate2] [thread 1096624] [info] Loaded model /Users/axelpeytavin/.cache/huggingface/hub/models--guillaumekln--faster-whisper-medium.en/snapshots/83a3b718775154682e5f775bc5d5fc961d2350ce on device cpu:0
[2023-10-11 14:00:58.465] [ctranslate2] [thread 1096624] [info]  - Binary version: 6
[2023-10-11 14:00:58.465] [ctranslate2] [thread 1096624] [info]  - Model specification revision: 3
[2023-10-11 14:00:58.465] [ctranslate2] [thread 1096624] [info]  - Selected compute type: int8_float32


In [43]:
# End-to-end check: record 3 seconds of audio into temp.mp3 and transcribe it.
record_and_transcribe("temp.mp3", 3)

# 9.7sec (NOTE(review): presumably the wall-clock time of this cell, recording included -- confirm)

Recording for 3 seconds...
Recording...
Recording finished.
2
File saved as temp.mp3
Recording finished. Now transcribing...


' Hey, how is it going?'