In [1]:
# %pip install --upgrade pip setuptools wheel

# %pip install openai openai-whisper soundfile plotnine assemblyai langchain
## need `sudo apt install portaudio19-dev alsa-utils ffmpeg` for pyaudio/whisper
# We need extra steps outside of code to make audio work in WSL2.

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes. Presumably this is here so edits to the
# local `svl` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [3]:
# Filesystem layout for this notebook. All paths are plain strings derived
# from PROJECT_DIR, so changing that single value relocates everything.
# NOTE: this is an absolute, machine-specific (WSL) path; adjust it locally.
PROJECT_DIR = "/mnt/c/Users/allsw/projects/structured-voice-logging"
# Directory holding the test audio recordings handed to svl.Transcriber.
AUDIO_DIR = f"{PROJECT_DIR}/test_recordings"
# Directory holding the CSV log files (trailing slash kept as before).
# This must be an f-string: without the `f` prefix the value was the literal
# text "{PROJECT_DIR}/logs/" rather than a real path.
LOGS_DIR = f"{PROJECT_DIR}/logs/"

In [5]:
import os
# API credentials are taken from the environment; never paste real keys into
# the notebook. setdefault leaves a key that was exported before launching
# Jupyter untouched (plain assignment would overwrite it with ""), and still
# guarantees the variable exists, as "", when nothing was exported.
os.environ.setdefault("OPENAI_API_KEY", "")
os.environ.setdefault("ASSEMBLY_AI_API_KEY", "")

In [6]:
import json

import openai
import whisper
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import assemblyai

import svl

In [7]:
# Create the transcriber, pointing it at the test-recordings directory (AUDIO_DIR).
transcriber = svl.Transcriber(audio_dir=AUDIO_DIR)

In [11]:
# Look up a recording using the substring 'sleep1' (going by the method name,
# this matches against file names — confirm in svl), then transcribe that file.
audio_fpath = transcriber.get_file_substring_match('sleep1')
transcript = transcriber.transcribe(audio_fpath)

In [12]:
# Display only the 'text' field of the transcription result.
transcript['text']

'I woke up at 10:40 a.m. And I went to sleep around 245 a.'

In [31]:
# Debugging step: call the private upload helper directly. The returned value
# is what gets passed to _transcribe_upload afterwards.
url = transcriber._upload(audio_fpath)


In [40]:
# Debugging step: call the private transcription helper on the uploaded URL
# and keep the raw response for inspection.
response = transcriber._transcribe_upload(url)

In [41]:
# Inspect the raw response. The saved output shows 'status': 'queued' and
# 'text': None, i.e. this response carries no transcript text.
response

{'id': 'rhh7m2ssh0-e261-4ad7-a71c-94c56ee19fa8',
 'language_model': 'assemblyai_default',
 'acoustic_model': 'assemblyai_default',
 'language_code': 'en_us',
 'status': 'queued',
 'audio_url': 'https://cdn.assemblyai.com/upload/b2176603-809f-404f-b82d-5211d78aeca7',
 'text': None,
 'words': None,
 'utterances': None,
 'confidence': None,
 'audio_duration': None,
 'punctuate': True,
 'format_text': True,
 'dual_channel': None,
 'webhook_url': None,
 'webhook_status_code': None,
 'webhook_auth': False,
 'webhook_auth_header_name': None,
 'speed_boost': False,
 'auto_highlights_result': None,
 'auto_highlights': False,
 'audio_start_from': None,
 'audio_end_at': None,
 'word_boost': [],
 'boost_param': None,
 'filter_profanity': False,
 'redact_pii': False,
 'redact_pii_audio': False,
 'redact_pii_audio_quality': None,
 'redact_pii_policies': None,
 'redact_pii_sub': None,
 'speaker_labels': False,
 'content_safety': False,
 'iab_categories': False,
 'content_safety_labels': {},
 'iab_cat

In [33]:
# NOTE(review): the saved output here is a 'queued' response with 'text': None,
# whereas the earlier transcript['text'] cell shows transcribed text. The
# execution counts are out of order, so the saved outputs likely come from
# different kernel states — restart and run all cells to confirm.
transcript

{'id': 'rhh7pihe1y-a27d-4df8-803d-52568cf55a3d',
 'language_model': 'assemblyai_default',
 'acoustic_model': 'assemblyai_default',
 'language_code': 'en_us',
 'status': 'queued',
 'audio_url': 'https://cdn.assemblyai.com/upload/b2176603-809f-404f-b82d-5211d78aeca7',
 'text': None,
 'words': None,
 'utterances': None,
 'confidence': None,
 'audio_duration': None,
 'punctuate': True,
 'format_text': True,
 'dual_channel': None,
 'webhook_url': None,
 'webhook_status_code': None,
 'webhook_auth': False,
 'webhook_auth_header_name': None,
 'speed_boost': False,
 'auto_highlights_result': None,
 'auto_highlights': False,
 'audio_start_from': None,
 'audio_end_at': None,
 'word_boost': [],
 'boost_param': None,
 'filter_profanity': False,
 'redact_pii': False,
 'redact_pii_audio': False,
 'redact_pii_audio_quality': None,
 'redact_pii_policies': None,
 'redact_pii_sub': None,
 'speaker_labels': False,
 'content_safety': False,
 'iab_categories': False,
 'content_safety_labels': {},
 'iab_cat

In [9]:
# LLM client passed to both svl.LogFilesFinder and svl.TranscriptLogger.
# temperature=0 is presumably chosen to keep completions as repeatable as possible.
llm = OpenAI(temperature=0) # type: ignore

In [10]:
# Get the log files recommended for this transcript text. Judging by the saved
# output of `files`, this comes back as one comma-separated string of CSV file
# names rather than a list.
files = svl.LogFilesFinder(transcript['text'], LOGS_DIR, llm).recommended_files # type: ignore

In [11]:
# Show the recommended log files.
files

'wake_up_for_the_day.csv, hours_slept_last_night.csv'

In [71]:
# Build the logger from the transcript text, the logs directory, the
# recommended files and the shared LLM client.
transcript_logger = svl.TranscriptLogger(transcript_text=transcript['text'], logs_dir=LOGS_DIR, files=files, llm=llm)

In [72]:
# Print (rather than display) the prompt so its line breaks render as text.
# `_prompt` is a private attribute, read here only for inspection.
print(transcript_logger._prompt)


## Instructions ##
    Your objective is to write log entries for a user, based on an audio transcript.
    The user who recorded the audio is trying to log something, or multiple somethings. It is also possible the transcript is not trying to log anything.
    If it's trying to log something, write a log entry for each thing the user is trying to log.
    I will give you a list of files that correspond to topics that can be logged to. Log the right thing to the right file.
    
## Output Structure ##
    Output a JSON object where each key is a file name and each value is the log entry for that file.
    If the user is not trying to log anything, output an empty JSON object.
    
## Relevant files, and samples of their contents ##
    /mnt/c/Users/allsw/projects/svl/logs/wake_up_for_the_day.csv:

    Date,Time,Day,Amount,Notes
    Jul 11 2022,10:30:00,Mon,,\"\"
    Jul 08 2022,11:00:00,Fri,,\"\"
    Jul 06 2022,09:20:00,Wed,,\"\"

    /mnt/c/Users/allsw/projects/svl/logs/hours_slept_

In [73]:
# Get the logger's completion. The value printed afterwards is a dict mapping
# each log file name to the CSV row to add to it.
completion = transcript_logger.get_json_completion()

In [74]:
# Show the proposed log entries.
# NOTE(review): the saved output logs a wake-up time of 10:48:00, while the
# saved transcript text says 10:40 a.m. Either the outputs come from different
# runs or the LLM altered the time — verify after a clean re-run.
print(completion)

{'wake_up_for_the_day.csv': 'Jul 11 2022,10:48:00,Mon,,""', 'hours_slept_last_night.csv': 'Jul 11 2022,02:45:00,Mon,6.25,""'}


: 

# Notes
Maybe it's useful to have two separate prompt streams, each accumulating its own history and context: one that takes in the voice transcript and decides which commands to execute, and one that takes those commands and executes them on the logs. Call the first one CommandBuilder and the second one CommandExecutor. This could make it easier to keep the different kinds of context organized. The context for CommandBuilder is the user's attempts to log things, and the context for CommandExecutor is the backend's attempts to modify the logs based on those commands.

CommandExecutor is an interesting idea - previously I was thinking I'd define fixed functions myself to do these operations on the log file. And that's probably still the correct idea. But, like, you could also give it the user's logs in JSON and then tell it to make the addition. This scales poorly with the number of logs a user has, obviously. If we kept each log category in a separate file and only gave the contents of the relevant file as prompt context, that would be a lot more tractable.

And since logging something means adding one line to the bottom of a csv, you don't need to give the whole thing to the prompt! Just enough lines that it knows the format! Ahaaaaaa!!! Let's do this :D :D D:
