In [5]:
import llm
import sqlite_utils
import chromadb

## Do Embeddings of All Sentences Using LLM lib

you should have already at the command line installed `llm`, and done 

```
llm install llm-gguf
llm gguf download-embed-model \
  'https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1/resolve/main/gguf/mxbai-embed-xsmall-v1-q8_0.gguf'
```

  per the post by Simon Willison here: https://simonwillison.net/2024/Nov/21/llm-gguf-embeddings/
  
  (Note I had to move to a remote VM due to a bug with the llama wrapper on my Mac.)

In [12]:
# llm embed -m gguf/mxbai-embed-xsmall-v1-q8_0 -c 'hello'
# -d my-embeddings.db

In [2]:
model = "gguf/mxbai-embed-xsmall-v1-q8_0"

In [3]:
files = !ls top100_sents_filtered/

In [4]:
files[0]

'10007_sents_filt.txt'

In [46]:
def get_metadata(file):
    id = file.split("_")[0]
    return {"bookid": id, "source_file": file}

def split_list(list, n=20):
    return [list[i:i+n] for i in range(0, len(list), n)]

In [25]:
#!rm top100_mxbai.db

In [3]:
embedding_model = llm.get_embedding_model(model)
#db = sqlite_utils.Database("top100_mxbai.db")

In [48]:
def read_sents(file, path="top100_sents_filtered/"):
    with open(path + file) as handle:
        lines = handle.readlines()
    return lines

In [99]:
vector = embedding_model.embed("hello")

llama_new_context_with_model: n_ctx_per_seq (512) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


In [132]:
len(vector)

384

In [186]:
collection = llm.Collection("sentences", db, model=embedding_model)

```
collection.embed_multi_with_metadata(
    [
        ("hound", "my happy hound", {"name": "Hound"}),
        ("cat", "my dissatisfied cat", {"name": "Cat"}),
    ],
    # This can also take the store=True argument:
    store=True,
)
```

In [35]:
def process_file(file):
    lines = read_sents(file)
    print(file, len(lines))
    batches = split_list(lines)
    base_metadata = get_metadata(file) # gets book id and source file
    line_counter = 1
    for i, batch in enumerate(batches):
        text_with_meta = []
        for line in batch:
            meta = base_metadata
            meta['line_num'] = line_counter
            id_label = meta['bookid'] + "_" + str(line_counter)
            text_with_meta.append((id_label, line, meta))
            line_counter += 1
        collection.embed_multi_with_metadata(text_with_meta, store=True)
        print("did batch", i)
    print("did lines", line_counter)
    

In [None]:
for file in files[1:]:
    process_file(file)

In [187]:
# this is very slow, move the data to a better db (for more complex queries easier)
for entry in collection.similar("I ate a snack.", number=5):
    print(entry.id, entry.score, entry.content, entry.metadata)

llama_new_context_with_model: n_ctx_per_seq (512) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


76_320 0.6128033444880707 All I could get to eat was berries and what was left over from breakfast.
 {'bookid': '76', 'source_file': '76_sents_filt.txt', 'line_num': 320}
1400_48 0.5813657630128987 On the present occasion, though I was hungry, I dared not eat my slice.
 {'bookid': '1400', 'source_file': '1400_sents_filt.txt', 'line_num': 60}
1400_289 0.5732558734145862 I was hungry, but before I had swallowed a morsel, he began a running sum that lasted all through the breakfast.
 {'bookid': '1400', 'source_file': '1400_sents_filt.txt', 'line_num': 300}
1260_255 0.5628181339842894 I devoured my bread and drank my coffee with relish; but I should have been glad of as much more—I was still hungry.
 {'bookid': '1260', 'source_file': '1260_sents_filt.txt', 'line_num': 260}
84_1064 0.5302388452178833 I awoke exhausted, and finding that it was already night, I crept forth from my hiding-place, and went in search of food.
 {'bookid': '84', 'source_file': '84_sents_filt.txt', 'line_num': 1080}

In [46]:
db.close()

## Get Embeddings from SQLITE and Export for Use in Chroma DB

In [2]:
import pandas as pd
import sqlite3

In [3]:
conn = sqlite3.connect('top100_mxbai.db')
c = conn.cursor()

In [None]:
df = pd.read_sql('SELECT * from embeddings', conn)
df.to_parquet('embedded.parquet', index = False)

In [6]:
df.head()

Unnamed: 0,collection_id,id,embedding,content,content_blob,content_hash,metadata,updated
0,1,10007_1,b'\x1a\xf9\x81\xbdl\xbd\xa7>\xb50e=\x1c\x9bx=|...,Upon a paper attached to the Narrative which f...,,b'\xe7h\x13\x07\x831[W\xd2\x1c53\xebf\xacc',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
1,1,10007_2,b'0j&\xbb\xee\xd8\x80>\x98\x99\xf7=\xfft\x00>\...,"This mysterious subject he treats, in that Ess...",,b'\x10\x8a\xb9dI\x90\xaf\x1b)\xd4\x94\xfb\x9b\...,"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
2,1,10007_3,b'\\\x06\x12\xbf\xbfba\xbe\xb1\x01\x14\xbeJr\x...,It will form but one volume of the series of t...,,b'?i(\xffw\xcfxG$\xd8H\xe1\xdf\xdd\xdf\xda',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
3,1,10007_4,b'\x80\xab\xa0\xbe\x9a\xafc>\xce\xa6q\xbd\xdeD...,"I was anxious on discovering this paper, to re...",,b'Bb\x91D\x81\x94\xcc\x9f.\xd6\x12\x1a\xff_\xb...,"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
4,1,10007_5,b'3\x8e\xf5<P\x85\x11>\xbd\xccx>\xea8>>4F\xba<...,"Much to my regret, however, I found that she h...",,b'\x05RB\x18\xfdq\xa1#_\xb3\xa6\xd6@/y>',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170


In [4]:
df = pd.read_parquet("embedded.parquet")

In [5]:
df.head()

Unnamed: 0,collection_id,id,embedding,content,content_blob,content_hash,metadata,updated
0,1,10007_1,b'\x1a\xf9\x81\xbdl\xbd\xa7>\xb50e=\x1c\x9bx=|...,Upon a paper attached to the Narrative which f...,,b'\xe7h\x13\x07\x831[W\xd2\x1c53\xebf\xacc',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
1,1,10007_2,b'0j&\xbb\xee\xd8\x80>\x98\x99\xf7=\xfft\x00>\...,"This mysterious subject he treats, in that Ess...",,b'\x10\x8a\xb9dI\x90\xaf\x1b)\xd4\x94\xfb\x9b\...,"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
2,1,10007_3,b'\\\x06\x12\xbf\xbfba\xbe\xb1\x01\x14\xbeJr\x...,It will form but one volume of the series of t...,,b'?i(\xffw\xcfxG$\xd8H\xe1\xdf\xdd\xdf\xda',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
3,1,10007_4,b'\x80\xab\xa0\xbe\x9a\xafc>\xce\xa6q\xbd\xdeD...,"I was anxious on discovering this paper, to re...",,b'Bb\x91D\x81\x94\xcc\x9f.\xd6\x12\x1a\xff_\xb...,"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170
4,1,10007_5,b'3\x8e\xf5<P\x85\x11>\xbd\xccx>\xea8>>4F\xba<...,"Much to my regret, however, I found that she h...",,b'\x05RB\x18\xfdq\xa1#_\xb3\xa6\xd6@/y>',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170


In [6]:
embed_test = df.iloc[0]['embedding']

In [7]:
import numpy as np

In [8]:
float_array = np.frombuffer(embed_test, dtype=np.float32)

In [14]:
df['vector'] = df['embedding'].apply(lambda x: np.frombuffer(x, dtype=np.float32))

In [15]:
df.head()

Unnamed: 0,collection_id,id,embedding,content,content_blob,content_hash,metadata,updated,vector
0,1,10007_1,b'\x1a\xf9\x81\xbdl\xbd\xa7>\xb50e=\x1c\x9bx=|...,Upon a paper attached to the Narrative which f...,,b'\xe7h\x13\x07\x831[W\xd2\x1c53\xebf\xacc',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170,"[-0.063463405, 0.32761705, 0.055954654, 0.0606..."
1,1,10007_2,b'0j&\xbb\xee\xd8\x80>\x98\x99\xf7=\xfft\x00>\...,"This mysterious subject he treats, in that Ess...",,b'\x10\x8a\xb9dI\x90\xaf\x1b)\xd4\x94\xfb\x9b\...,"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170,"[-0.0025392883, 0.25165504, 0.120898426, 0.125..."
2,1,10007_3,b'\\\x06\x12\xbf\xbfba\xbe\xb1\x01\x14\xbeJr\x...,It will form but one volume of the series of t...,,b'?i(\xffw\xcfxG$\xd8H\xe1\xdf\xdd\xdf\xda',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170,"[-0.57040954, -0.22010325, -0.1445377, 0.08371..."
3,1,10007_4,b'\x80\xab\xa0\xbe\x9a\xafc>\xce\xa6q\xbd\xdeD...,"I was anxious on discovering this paper, to re...",,b'Bb\x91D\x81\x94\xcc\x9f.\xd6\x12\x1a\xff_\xb...,"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170,"[-0.31380844, 0.22234955, -0.058996968, -0.013..."
4,1,10007_5,b'3\x8e\xf5<P\x85\x11>\xbd\xccx>\xea8>>4F\xba<...,"Much to my regret, however, I found that she h...",,b'\x05RB\x18\xfdq\xa1#_\xb3\xa6\xd6@/y>',"{""bookid"": ""10007"", ""source_file"": ""10007_sent...",1732551170,"[0.029975032, 0.14211011, 0.24296851, 0.185763..."


In [16]:
newdf = df[['id', 'vector', 'metadata', 'content']]

In [17]:
newdf.head()

Unnamed: 0,id,vector,metadata,content
0,10007_1,"[-0.063463405, 0.32761705, 0.055954654, 0.0606...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...",Upon a paper attached to the Narrative which f...
1,10007_2,"[-0.0025392883, 0.25165504, 0.120898426, 0.125...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...","This mysterious subject he treats, in that Ess..."
2,10007_3,"[-0.57040954, -0.22010325, -0.1445377, 0.08371...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...",It will form but one volume of the series of t...
3,10007_4,"[-0.31380844, 0.22234955, -0.058996968, -0.013...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...","I was anxious on discovering this paper, to re..."
4,10007_5,"[0.029975032, 0.14211011, 0.24296851, 0.185763...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...","Much to my regret, however, I found that she h..."


In [18]:
import json
def get_meta_source(row, which='source_file'):
    return json.loads(row['metadata'])[which]

In [19]:
import json
def get_meta_book(row, which='bookid'):
    return json.loads(row['metadata'])[which]

In [20]:
newdf['source'] = newdf.apply(get_meta_source, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['source'] = newdf.apply(get_meta_source, axis=1)


In [21]:
newdf['bookid'] = newdf.apply(get_meta_book, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['bookid'] = newdf.apply(get_meta_book, axis=1)


In [22]:
newdf['line_num'] = newdf['id'].apply(lambda x: int(x.split("_")[1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['line_num'] = newdf['id'].apply(lambda x: int(x.split("_")[1]))


In [23]:
newdf.head()

Unnamed: 0,id,vector,metadata,content,source,bookid,line_num
0,10007_1,"[-0.063463405, 0.32761705, 0.055954654, 0.0606...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...",Upon a paper attached to the Narrative which f...,10007_sents_filt.txt,10007,1
1,10007_2,"[-0.0025392883, 0.25165504, 0.120898426, 0.125...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...","This mysterious subject he treats, in that Ess...",10007_sents_filt.txt,10007,2
2,10007_3,"[-0.57040954, -0.22010325, -0.1445377, 0.08371...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...",It will form but one volume of the series of t...,10007_sents_filt.txt,10007,3
3,10007_4,"[-0.31380844, 0.22234955, -0.058996968, -0.013...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...","I was anxious on discovering this paper, to re...",10007_sents_filt.txt,10007,4
4,10007_5,"[0.029975032, 0.14211011, 0.24296851, 0.185763...","{""bookid"": ""10007"", ""source_file"": ""10007_sent...","Much to my regret, however, I found that she h...",10007_sents_filt.txt,10007,5


In [24]:
newdf = newdf.drop('metadata', axis=1)

## Read in Latest Data File with Emdbeds

In [25]:
newdf = newdf[~newdf['source'].str.contains("30254")] # porn

In [26]:
newdf = newdf[~newdf['source'].str.contains("37106")] # dupe

In [27]:
newdf.to_json("embedded_data.json", orient="records", index=None)

In [28]:
import pandas as pd
newdf = pd.read_json("embedded_data.json", orient="records")

In [29]:
newdf.columns

Index(['id', 'vector', 'content', 'source', 'bookid', 'line_num'], dtype='object')

In [31]:
def make_id(row):
    return str(row['bookid']) + "_" + str(row['line_num'])

In [32]:
newdf['id'] = newdf.apply(make_id, axis=1)

In [33]:
len(newdf)

170228

In [34]:
newdf.head()

Unnamed: 0,id,vector,content,source,bookid,line_num
0,10007_1,"[-0.0634634048, 0.3276170492, 0.05595465380000...",Upon a paper attached to the Narrative which f...,10007_sents_filt.txt,10007,1
1,10007_2,"[-0.0025392883, 0.2516550422, 0.1208984256, 0....","This mysterious subject he treats, in that Ess...",10007_sents_filt.txt,10007,2
2,10007_3,"[-0.5704095364, -0.220103249, -0.1445377022, 0...",It will form but one volume of the series of t...,10007_sents_filt.txt,10007,3
3,10007_4,"[-0.3138084412, 0.2223495543, -0.0589969680000...","I was anxious on discovering this paper, to re...",10007_sents_filt.txt,10007,4
4,10007_5,"[0.0299750324, 0.1421101093, 0.242968514600000...","Much to my regret, however, I found that she h...",10007_sents_filt.txt,10007,5


In [35]:
newrows = newdf.to_dict(orient="records")

In [36]:
newrows[0].keys()

dict_keys(['id', 'vector', 'content', 'source', 'bookid', 'line_num'])

In [37]:
from itertools import islice
def batched(iterable, n):
    iterator = iter(iterable)
    while batch := list(islice(iterator, n)):
        yield batch

def removekeys(row, keys=['id', 'vector']):
    for key in keys:
        del row[key]
    return row

def make_chroma_recs(rows):
    ids = [row['id'] for row in rows]
    embeddings = [row['vector'] for row in rows]
    documents = [row['content'] for row in rows]
    copy = rows.copy()
    metadatas = [removekeys(x) for x in copy]
    return ids, embeddings, documents, metadatas
    
    

In [None]:
#upset_batches(newrows)

## ChromaDB

In [None]:
!pip install chromadb

In [38]:
import chromadb

In [6]:
client = chromadb.PersistentClient(path="chroma_db/")

In [40]:
collection = client.create_collection(name="top100", metadata={"hnsw:space": "cosine"})

In [20]:
#client.delete_collection("top100")

```
collection.add(
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    ids=["id1", "id2", "id3", ...]
)
```

In [41]:
def chroma_batches(rows):
    for batch in batched(rows, 100):
        ids, embeddings, documents, metadatas = make_chroma_recs(batch)
        collection.add(
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
            documents=documents)
    print('done')
    return 

In [42]:
chroma_batches(newrows)

done


In [8]:
def search_chroma(string, keyword=None, n=10):
    vector = embedding_model.embed(string)
    if keyword:
        res = collection.query(
            query_embeddings=[vector],
            n_results=n,
            #where={"metadata_field": "is_equal_to_this"},
            where_document={"$contains":keyword}
        )
    else:
        res = collection.query(
            query_embeddings=[vector],
            n_results=n,
            #where={"metadata_field": "is_equal_to_this"},
            #where_document={"$contains":keyword}
        )
    return res

In [7]:
collection = client.get_collection("top100")

In [9]:
search_chroma("looked out the window")

llama_new_context_with_model: n_ctx_per_seq (512) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


ValueError: Failed to create llama_context