In [73]:
import webvtt
import pandas as pd

def vtt_to_dataframe(vtt_file) -> pd.DataFrame:
    """Load a WebVTT caption file into a two-column DataFrame.

    Args:
        vtt_file: Location of the ``.vtt`` file; handed straight to
            ``webvtt.read``.

    Returns:
        A DataFrame with one row per caption, in file order:

        * ``start_time`` -- ``caption.start`` exactly as webvtt reports it
          (no truncation or reformatting is applied here).
        * ``text`` -- the caption text with every run of whitespace,
          newlines included, collapsed to a single space.
    """
    cues = list(webvtt.read(vtt_file))

    # str.split() with no separator also splits on newlines, so a
    # split/join round-trip flattens multi-line captions and trims the ends.
    return pd.DataFrame({
        'start_time': [cue.start for cue in cues],
        'text': [' '.join(cue.text.split()) for cue in cues],
    })

# Transcript to load; `df` ends up with one row per caption.
vtt_file_path = 'audios/stream_20250715.vtt'
df = vtt_to_dataframe(vtt_file=vtt_file_path)

In [74]:
import pandas as pd
from nltk.util import ngrams

# Force the caption column to plain strings so it can be joined below.
df['text'] = df['text'].astype(str)
words = df['text']  # note: each element is a whole caption line, not a single word

# Concatenate every `chunk_size` consecutive captions into one
# space-separated passage; the final passage may be shorter.
chunk_size = 10
chunks = [
    ' '.join(words.iloc[start:start + chunk_size].tolist())
    for start in range(0, len(words), chunk_size)
]
chunks

["Hey guys! Pretend you know us, man. Now I have to put the Oh yeah, I don't even understand the question. But yeah, good morning. Big news today is NVIDIA, they'll be selling their chip in China, so the stock's up 5%. It'll be interesting to see, this is a Q3 thing, although they are on the weird calendar, right, so this would make their quarter. Spiders up 40 bips. I don't know why the webcam does that. Good morning, Jake. Yes, NVIDIA's up 5%, most actively traded by a margin.",
 "Good morning, Hawk. Open Door's up 15%, so microcap. ProKidney's up 12%, vacillating quite a bit there. Archer Aviation, Big Bear, a lot of these stocks are meme stocks, basically. AMD's up 5%, MP is up 11%, weird company. AT&T down 50 bips. Haven't seen this one most actively traded in a long time. I wonder if there's news. We have so much news in Godel, it's insane. Way more than Bloomberg, and way more than any other service. Nobody else is even close, Trading View or Thinkorswim or anything like that, y

In [76]:
import sqlite_utils
import llm
import uuid

# SQLite file that backs the stored embeddings.
db = sqlite_utils.Database("my-embeddings.db")

# Collection named "entries" inside that database.
# NOTE(review): "3-small" is presumably the alias of an OpenAI embedding
# model registered with `llm` -- confirm it resolves in this environment.
collection = llm.Collection(
    "entries",
    db,
    model_id="3-small",
)

In [None]:
import re

# Tag every chunk with the date embedded in the transcript file name
# (e.g. 'audios/stream_20250715.vtt' -> '20250715'). Deriving it from
# `vtt_file_path` keeps the metadata in step with the file that was actually
# parsed; the previous hand-typed date had already drifted from the file name.
date_match = re.search(r'\d{8}', vtt_file_path)
if date_match is None:
    raise ValueError(f"no YYYYMMDD date found in {vtt_file_path!r}")
stream_date = date_match.group(0)

# Store each passage under a fresh random ID; store=True also saves the
# passage text in the collection alongside its embedding.
for chunk in chunks:
    collection.embed(str(uuid.uuid4()), chunk,
                     metadata={"date": stream_date}, store=True)

In [79]:
# Ten stored passages closest to the query "shorts".
collection.similar("shorts", number=10)

[Entry(id='68900f50-174a-42f4-b77a-2359bc5369fa', score=0.42447084581314265, content='yeah yeah yeah clam shorts are better than ever yeah yeah yeah yeah yeah yeah', metadata={'date': '20250715'}),
 Entry(id='e8840101-4638-41ae-b1c1-8e68ae93b779', score=0.3778314390481106, content="So Yeah, I wish I could show you guys this data but it is proprietary So There you go focus So So So So Yeah, humor's probably a good short there's so many good shorts So", metadata={'date': '20250714'}),
 Entry(id='a03816a4-d1e3-4d14-92d1-0ae3ab5fe83f', score=0.31755426418430316, content="The biggest one is still short. I NMB. I think we'll just trade to like 50 cents at some point. Then it's IONQ, D-Wave, Rigetti, QBT, Joby. And I have three longs, Sarepta, Tarsus and Biohaven. So three longs. Six shorts. Inhale. Inhale. Inhale.", metadata={'date': '20250710'}),
 Entry(id='21c85f9f-f86e-4b5b-8181-3878c9bda9c4', score=0.3134374627319056, content="Hmm. Hmm. Hmm. Hmm. I would not short Nvidia here. I would no