In [None]:
from llama_cpp import Llama

In [None]:
# llm = Llama(model_path="../llama.cpp/models/Mistral-7B-v0.1/mistral-7b-v0.1.q5-k-m.gguf", n_threads=4, verbose=True)
llm = Llama(model_path="models/mistral-7b-instruct-v0.1.gguf", n_threads=4, verbose=True)

In [None]:
output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=[], echo=True)

In [None]:
output

In [None]:
import os

# Run a command in the terminal
os.system(f"say {'Hey, I am a llama. I am a language model that can generate text.'}")

vocal_output = "What can I do for you?"

os.system(f"say {vocal_output}")

In [None]:
print("hey")

Ideas:
We want to talk to this AI, right? We don't want, however, to have to wait for a whopping 30sec every time we need to say something.
The first thing is to know when we have some kind of interaction expected by the user. We could potentially only feed the transcribed text to Mistral, or hopefully a much lighter model, to see if it's a question or a command.
If it is, then we feed it to a larger model that is going to think. And then, if an action is required, we're going to transfer to a model that is going to do the action.

Okay so, here's what we can do. Every 5 seconds, we run the fastest whisper wrapper we have, over the past 20-25 seconds that have been asked by the user, and then we make the text go through an small LLM to see if it's a question or a command. Say... Mistral quantized? Or what other, smaller model could we use? Let's check the leaderboard.

In [None]:
from llama_cpp import Llama

llm = Llama(model_path="models/mistral-7b-instruct-v0.1.gguf", n_threads=6, verbose=False)

In [None]:
def is_instruction(text):
    llm_input = "[INST]You are an AI that's designed to recognize when a user is calling you for assistance. If they don't ask you, it's NOT a request or instruction. " \
        "Determine if the following transcript contains a direct instruction or request for your help:\n\n" \
        f"'{text}'" \
        "\n\nReply with 'True' if there's a direct instruction or request, and 'False' if not. Please be exact in your response.[/INST]"
        
    # print(llm_input)
    output = llm(llm_input)
    
    return True if "True" in output["choices"][0]["text"] else (False if "False" in output["choices"][0]["text"] else None)

is_instruction("Can you show me my latest emails?")

In [None]:
def is_instruction_2(text):
    llm_input = "[INST]You are Jarvis, an AI that's designed to recognize when a user is calling you for assistance. A call for assistance must include an explicit call to you, your name is 'Jarvis' " \
        "Determine if the following transcript contains a direct instruction or request for your help:\n\n" \
        f"'{text}'" \
        "\n\nReply with 'True' if there's a direct instruction or request, and 'False' if not. Please be exact in your response.[/INST]"
        
    # print(llm_input)
    output = llm(llm_input)
    
    return True if "True" in output["choices"][0]["text"] else (False if "False" in output["choices"][0]["text"] else None)

In [None]:
is_instruction_2("Hello Jarvis, how are you?")

In [None]:
is_instruction("I like trains.")

In [None]:
is_instruction("Yeah so I was on the phone with Phillis and")

Okay good so, so far, Mistral-Instruct-Q5_K_M seems to work well for this! Step 1 completed. Now, we need some speech recognition.

In [None]:
from faster_whisper import WhisperModel
import os

# Add the CT2_VERBOSE=1 flag to the environment variables
os.environ["CT2_VERBOSE"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

model_size = "medium.en"

model = WhisperModel(model_size, device="cpu", compute_type="int8")

In [None]:
segments, info = model.transcribe("mp3s/1.mp3")
transcribed_segments = list(segments)
# 10sec for 1.mp3 in float32
# 6sec in int8

In [None]:
segments, info = model.transcribe("mp3s/2.mp3")
transcribed_segments = list(segments)
# 10sec for 2.mp3
# 6.7sec in int8

In [None]:
transcribed_text = " ".join([segment.text for segment in transcribed_segments])
transcribed_text

In [None]:
is_instruction_2(transcribed_text)
# 19sec for 2.mp3

Seems to work pretty fine! Now we need to automate the audio collection.

In [None]:
import sounddevice as sd
import numpy as np
from pydub import AudioSegment

def record_audio(filename, duration, samplerate=44100):
    print("Recording...")
    myrecording = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype='int16')
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")
    
    # Convert recording to AudioSegment for easy export
    
    print(myrecording.dtype.itemsize)
    
    audio = AudioSegment(
        myrecording.tobytes(),
        frame_rate=samplerate,
        sample_width=myrecording.dtype.itemsize,
        channels=1
    )
    
    audio.export(filename, format="mp3", bitrate="128k")
    print(f"File saved as {filename}")

# Usage example:
record_audio("output.mp3", 2)  # Records for 10 seconds


In [None]:
import pygame.mixer
import time

def play_audio(filename):
    # Initialize the mixer module
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    
    print(f"Playing {filename}...")
    pygame.mixer.music.play()

    # This will keep the program running while the audio plays
    while pygame.mixer.music.get_busy():
        time.sleep(0.1)

    print("Playback finished.")

# Usage example:
play_audio("output.mp3")


In [None]:
from faster_whisper import WhisperModel
import os

# Add the CT2_VERBOSE=1 flag to the environment variables
os.environ["CT2_VERBOSE"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

model_size = "medium.en"

whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

segments, info = model.transcribe("output.mp3")
transcribed_segments = list(segments)
transcribed_text = " ".join([segment.text for segment in transcribed_segments])
transcribed_text

Alright, cool! Now all together!

In [1]:
from faster_whisper import WhisperModel
import os
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import pygame.mixer
import time
from llama_cpp import Llama

def init_models():
    llm = Llama(model_path="models/mistral-7b-instruct-v0.1.gguf", n_threads=6, verbose=False)

    # Add the CT2_VERBOSE=1 flag to the environment variables
    os.environ["CT2_VERBOSE"] = "1"
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
    model_size = "medium.en"
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
    
    return llm, whisper


llm, whisper = init_models()


def record_audio(filename, duration, samplerate=44100):
    print("Recording...")
    myrecording = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype='int16')
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")
    
    # Convert recording to AudioSegment for easy export
    
    print(myrecording.dtype.itemsize)
    
    audio = AudioSegment(
        myrecording.tobytes(),
        frame_rate=samplerate,
        sample_width=myrecording.dtype.itemsize,
        channels=1
    )
    
    audio.export(filename, format="mp3", bitrate="128k")
    print(f"File saved as {filename}")


def play_audio(filename):
    # Initialize the mixer module
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    
    print(f"Playing {filename}...")
    pygame.mixer.music.play()

    # This will keep the program running while the audio plays
    while pygame.mixer.music.get_busy():
        time.sleep(0.1)

    print("Playback finished.")


def transcribe_recording(filename):
    segments, info = whisper.transcribe(filename)
    transcribed_segments = list(segments)
    transcribed_text = " ".join([segment.text for segment in transcribed_segments])
    return transcribed_text

def record_and_transcribe(filename, duration):
    print(f"Recording for {duration} seconds...")
    record_audio(filename, duration)
    print("Recording finished. Now transcribing...")
    return transcribe_recording(filename)

def query_llm(text):
    llm_input = f"[INST]You are my AI partner Jarvis. Here is my prompt for you: {text}[/INST]"
        
    # print(llm_input)
    output = llm(llm_input)
    
    return output["choices"][0]["text"]

In [None]:
import os

def talk_to_llm(duration=3):
    text = record_and_transcribe("temp.mp3", duration)

    print("You said:", text)
    response = query_llm(text)
    
    response = response.replace('\'', '"')
    
    print("Jarvis said:", response)
    os.system(f"say '{response}'")
    
# Total length: 46sec for "Hey Jarvis how's it going?"

In [None]:
talk_to_llm(10)

# This is WILD

In [None]:
transcribe_recording

In [None]:
import os
import threading
import time

def process_audio(filename):
    retry_count = 0
    while retry_count < 5:  # Max 5 retries
        try:
            transcription = transcribe_recording(filename)
            print("You said:", transcription)
            if is_instruction(transcription):
                response = query_llm(transcription)
                response = response.replace('\'', '"')
                os.system(f"say '{response}'")
            break
        except ValueError as e:
            print(e)
            # File not finished recording yet? Retry after 0.1 seconds.
            print("File not finished recording yet? Retrying...")
            time.sleep(0.1)
            retry_count += 1

def talk_to_llm_continuous(duration=3):
    while True:
        f_name = f"{time.strftime('%Y-%m-%d %H:%M:%S')}.mp3"
        record_file = record_audio(f_name, duration)
        threading.Thread(target=process_audio, args=(record_file,)).start()
        # Consider adding a slight sleep if you notice gaps or overlaps.

talk_to_llm_continuous()


In [3]:
import threading
import time
from pathlib import Path

class AudioProcessor:
    def __init__(self, duration=30):
        self.duration = duration
        self.chunk_id = 0
        self.run_id = time.strftime('%Y-%m-%d_%H:%M:%S')
        self.data_folder = Path(f"data/run_{self.run_id}")
        self.txt_file = f"transcriptions_{self.run_id}.txt"
        self.running = True

    def stop(self):
        self.running = False

    def record_audio_chunked(self):
        while self.running:
            f_name = str(self.data_folder / f"chunk_{self.chunk_id}.mp3")
            record_audio(f_name, self.duration)
            self.chunk_id += 1
            time.sleep(self.duration)
        print("Recording stopped.")

    def transcribe_audio(self):
        processed_chunks = 0
        while self.running:
            if processed_chunks < self.chunk_id:
                f_name = str(self.data_folder / f"chunk_{processed_chunks}.mp3")
                transcription = transcribe_recording(f_name)
                print(f"Writing transcription '{transcription}' for chunk {processed_chunks} to file...")
                with open(self.txt_file, 'a') as f:
                    f.write(f"<chunk n={processed_chunks} processed=0>\n{transcription}\n</chunk>\n")
                processed_chunks += 1
            time.sleep(1)
        print("Transcription stopped.")

    def process_transcriptions(self):
        while self.running:
            if not Path(self.txt_file).exists():
                time.sleep(1)
                continue
            
            with open(self.txt_file, 'r') as f:
                data = f.read()
                print("Currently in transcripted file: ", data)
                # Here you can add your LLM processing function
                # e.g., response = query_llm(data)
            time.sleep(1)
        print("Processing stopped.")

    def run(self):
        self.data_folder.mkdir(parents=True, exist_ok=True)
        try:
            threading.Thread(target=self.transcribe_audio).start()
            threading.Thread(target=self.process_transcriptions).start()
            self.record_audio_chunked()
        except KeyboardInterrupt:
            print("Received KeyboardInterrupt. Stopping threads...")
            self.stop()

processor = AudioProcessor(duration=10)
processor.run()


Recording...
Recording finished.
2
File saved as data/run_2023-10-12_15:21:59/chunk_0.mp3


Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/ng/64vj_sh13fn7kr98h1jq07h40000gn/T/ipykernel_6812/2226987913.py", line 30, in transcribe_audio
  File "/var/folders/ng/64vj_sh13fn7kr98h1jq07h40000gn/T/ipykernel_6812/2876227357.py", line 62, in transcribe_recording
  File "/Users/axelpeytavin/Documents/Projects/self-improving-ai/venv/lib/python3.9/site-packages/faster_whisper/transcribe.py", line 258, in transcribe
    audio = decode_audio(audio, sampling_rate=sampling_rate)
  File "/Users/axelpeytavin/Documents/Projects/self-improving-ai/venv/lib/python3.9/site-packages/faster_whisper/audio.py", line 46, in decode_

Okay so we have a first pipeline working decently well. Let's see briefly if we can make it perform as well, but faster, using...

# Phi 1.5 for Instruction recognition

In [None]:
# Demo code

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_default_device("cpu")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)


In [None]:
inputs = tokenizer('''```python
def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=20)
text = tokenizer.batch_decode(outputs)[0]

# 20 sec for a small generation. Too long. We need it in llama.cpp in gguf format.

In [None]:
print(text)

# Full pipeline for permanent running

In [1]:
from faster_whisper import WhisperModel
import os
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import pygame.mixer
import time
from llama_cpp import Llama

def init_models():
    llm = Llama(model_path="models/mistral-7b-instruct-v0.1.gguf", n_threads=6, verbose=False)

    # Add the CT2_VERBOSE=1 flag to the environment variables
    os.environ["CT2_VERBOSE"] = "1"
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
    model_size = "medium.en"
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
    
    return llm, whisper


llm, whisper = init_models()


def record_audio(filename, duration, samplerate=44100):
    print("Recording...")
    myrecording = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype='int16')
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")
    
    # Convert recording to AudioSegment for easy export
    
    print(myrecording.dtype.itemsize)
    
    audio = AudioSegment(
        myrecording.tobytes(),
        frame_rate=samplerate,
        sample_width=myrecording.dtype.itemsize,
        channels=1
    )
    
    audio.export(filename, format="mp3", bitrate="128k")
    print(f"File saved as {filename}")


def play_audio(filename):
    # Initialize the mixer module
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    
    print(f"Playing {filename}...")
    pygame.mixer.music.play()

    # This will keep the program running while the audio plays
    while pygame.mixer.music.get_busy():
        time.sleep(0.1)

    print("Playback finished.")


def transcribe_recording(filename):
    segments, info = whisper.transcribe(filename)
    transcribed_segments = list(segments)
    transcribed_text = " ".join([segment.text for segment in transcribed_segments])
    return transcribed_text

def record_and_transcribe(filename, duration):
    print(f"Recording for {duration} seconds...")
    record_audio(filename, duration)
    print("Recording finished. Now transcribing...")
    return transcribe_recording(filename)

def query_llm(text):
    llm_input = f"[INST]You are my AI partner Jarvis. Here is my prompt for you: {text}[/INST]"
        
    # print(llm_input)
    output = llm(llm_input)
    
    return output["choices"][0]["text"]

pygame 2.5.2 (SDL 2.28.3, Python 3.9.17)
Hello from the pygame community. https://www.pygame.org/contribute.html


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/mistral-7b-instruct-v0.1.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1

In [8]:
import threading
import time
from pathlib import Path

class AudioProcessor:
    def __init__(self, duration=30):
        self.duration = duration
        self.chunk_id = 0
        self.run_id = time.strftime('%Y-%m-%d_%H:%M:%S')
        self.data_folder = Path(f"data/run_{self.run_id}")
        self.txt_file = str(self.data_folder /  f"transcriptions_{self.run_id}.txt")
        self.running = True

    def stop(self):
        self.running = False

    def record_audio_chunked(self):
        while self.running:
            f_name = str(self.data_folder / f"chunk_{self.chunk_id}.mp3")
            record_audio(f_name, self.duration)
            self.chunk_id += 1
            # time.sleep(self.duration)
        print("Recording stopped.")

    def transcribe_audio(self):
        processed_chunks = 0
        while self.running:
            if processed_chunks < self.chunk_id:
                f_name = str(self.data_folder / f"chunk_{processed_chunks}.mp3")
                transcription = transcribe_recording(f_name)
                
                # Sometimes, the transcription is hallucinating and contains "Thank you for watching" or some alternative.
                if "Thank" in transcription and "watching" in transcription:
                    print("Tranqcription is hallucinatinating. Noting it...")
                    transcription = "[HALLUCINATION]"
                    
                print(f"Writing transcription '{transcription}' for chunk {processed_chunks} to file...")
                with open(self.txt_file, 'a') as f:
                    f.write(f"<chunk n={processed_chunks} processed=0>\n{transcription}\n</chunk>\n")
                processed_chunks += 1
            time.sleep(1)
        print("Transcription stopped.")

    def process_transcriptions(self):
        while self.running:
            if not Path(self.txt_file).exists():
                time.sleep(1)
                continue
            
            with open(self.txt_file, 'r') as f:
                data = f.read()
                print("Currently in transcripted file: ", data)
                # Here you can add your LLM processing function
                # e.g., response = query_llm(data)
            time.sleep(1)
        print("Processing stopped.")

    def run(self):
        self.data_folder.mkdir(parents=True, exist_ok=True)
        try:
            threading.Thread(target=self.transcribe_audio).start()
            threading.Thread(target=self.process_transcriptions).start()
            self.record_audio_chunked()
        except KeyboardInterrupt:
            print("Received KeyboardInterrupt. Stopping threads...")
            self.stop()

processor = AudioProcessor(duration=10)
processor.run()


Recording...
Recording finished.
2
File saved as data/run_2023-10-12_15:43:07/chunk_0.mp3
Recording...
Writing transcription ' all right let's do it again so this time I'm recording a 10 second chunk this  one is standalone tell me everything's okay' for chunk 0 to file...
Currently in transcripted file:  <chunk n=0 processed=0>
 all right let's do it again so this time I'm recording a 10 second chunk this  one is standalone tell me everything's okay
</chunk>

Currently in transcripted file:  <chunk n=0 processed=0>
 all right let's do it again so this time I'm recording a 10 second chunk this  one is standalone tell me everything's okay
</chunk>

Recording finished.
2
File saved as data/run_2023-10-12_15:43:07/chunk_1.mp3
Recording...
Currently in transcripted file:  <chunk n=0 processed=0>
 all right let's do it again so this time I'm recording a 10 second chunk this  one is standalone tell me everything's okay
</chunk>

Currently in transcripted file:  <chunk n=0 processed=0>
 all r

Processing stopped.
Tranqcription is hallucinatinating. Noting it...
Writing transcription '[HALLUCINATION]' for chunk 4 to file...
Transcription stopped.


In [6]:
transcribe_recording("filtered.mp3")

' Thanks for watching!'