In [1]:
import pandas as pd
from glob import glob
import lancedb


In [2]:
# Location of the LanceDB database, given relative to the notebook's working directory.
uri = "../../db/lancedb-test"
# Connection handle used by the later cells to create and query tables.
db = lancedb.connect(uri)

In [3]:
from lancedb.pydantic import Vector, LanceModel

# CSVs

In [5]:
# Paths of all crawled CSV files matching the pattern below (relative to the working directory).
csvs = glob("../../db/crawled/pl/*.csv")

In [6]:
from rich.progress import track

In [7]:
from rich import print

In [8]:
from os.path import basename

In [9]:
from lancedb.embeddings import EmbeddingFunctionRegistry

In [10]:
# Fetch the shared embedding-function registry and build the "sentence-transformers"
# embedding function backed by the all-distilroberta-v1 model.
# NOTE(review): device="cuda" is hard-coded — presumably this fails on a machine without a GPU; confirm.
registry = EmbeddingFunctionRegistry.get_instance()
func = registry.get("sentence-transformers").create(name="all-distilroberta-v1", device="cuda")

In [11]:
class Document(LanceModel):
    """Table schema: a text passage, its embedding vector, and where the passage came from."""

    # Text of the passage; declared as the source field of the embedding function `func`.
    document: str = func.SourceField()
    # Vector column sized to `func.ndims()`; declared as the vector field of `func`.
    # Presumably filled in automatically from `document` on insert — confirm against the LanceDB docs.
    embedding: Vector(func.ndims()) = func.VectorField()
    # Origin of the passage (a URL or a file-derived path in this notebook).
    source: str

In [12]:
# Create the "test" table from the Document schema.
# mode="overwrite" replaces any table left over from a previous run, so this cell
# (and the ingestion cells that follow) can be re-run from a fresh kernel without
# raising "table already exists" or accumulating duplicate rows.
table = db.create_table("test", schema=Document, mode="overwrite")
# Display the (still empty) table to confirm the column layout.
table.to_pandas()

Unnamed: 0,document,embedding,source


In [13]:
# Build one record per CSV row: the section text to embed plus the URL it was crawled from.
data = []
for csv_path in track(csvs):
    df = pd.read_csv(csv_path)
    if df.empty:
        # Header-only file: nothing to ingest.
        continue
    urls, documents = df["URL"].tolist(), df["Section Content"].tolist()
    data.extend(
        {"document": document, "source": url}
        for url, document in zip(urls, documents)
    )

Output()

In [14]:
# Append the records collected from the CSV files to the table.
# Presumably the embedding column is computed from `document` at insert time — confirm.
table.add(data)

In [15]:
# Question text to look up in the table.
query = "Learning rate scheduler"

# query = "NeurIPS 2023 LLM Efficiency Challenge Quickstart Guide"

In [33]:
# Search the "embedding" column with the query text and keep only the single closest hit.
search_builder = table.search(query, vector_column_name="embedding")
actual = search_builder.limit(1).to_list()

In [36]:
# Show which fields a search hit (a dict) exposes.
actual[0].keys()

dict_keys(['document', 'embedding', 'source', '_distance'])

In [18]:
# print(result["documents"][0][0])

In [19]:
def read_md(file: str) -> str:
    """Return the full text content of the file at ``file``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args:
        file: Path of the file to read.

    Returns:
        The entire file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(file, "r", encoding="utf-8") as fr:
        return fr.read()

# Gather the tutorial markdown files and turn each one into a record whose
# source is the file name prefixed with 'lit-gpt/tutorials/'.
files = glob("../../db/crawled/tutorials/*.md")
data = [
    {"document": read_md(md_path), "source": 'lit-gpt/tutorials/' + basename(md_path)}
    for md_path in track(files)
]

Output()

In [20]:
# Append the tutorial records built from the markdown files to the same table.
table.add(data)

In [21]:
# Number of rows in the table. Asking the table directly avoids loading every
# document and embedding vector into a DataFrame just to count them.
len(table)

1031

In [22]:
from llama_cpp import Llama

In [19]:
# !wget https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf

In [31]:
# Load the Q4_K_M-quantized Mistral-7B GGUF model file from the working directory.
llm = Llama(
    model_path="mistral-7b-v0.1.Q4_K_M.gguf",
    main_gpu=1,
    n_ctx=1028,
    verbose=False,
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from mistral-7b-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]
llama_mo

In [40]:
# Text of the top search hit, truncated to its first 1024 characters so the
# prompt stays small relative to the model's context window.
# (The previous version read from an undefined `result` variable and sliced
# [1024:], which discarded the start of the passage instead of truncating it.)
context = actual[0]["document"][:1024]
prompt = f"""Answer the given question based on the context. If you don't know the answer then respond with I don't know.
Context: {context}
----
Q: {query}
A:"""


In [41]:
# Show the fully rendered prompt before sending it to the model.
print(prompt)

In [42]:
# Run the completion: the prompt is not echoed back, generation stops at the
# first blank line, and a low temperature is used.
# (A missing comma between `stop` and `temperature` previously made this cell a SyntaxError.)
output = llm(prompt, echo=False, stop=["\n\n"], temperature=0.1)

In [80]:
# Ad-hoc probe: ask the model to reformat a question; the returned completion
# dict is displayed as the cell output.
rephrase_prompt = "Format the question and do not deviate too much - How do I read the results of the Learning Rate finder when using it like a callback? "
llm(rephrase_prompt, echo=False, stop=["\n\n"], temperature=0.1)

{'id': 'cmpl-4f222950-afcf-4a60-bd1e-e9beeecb0a5b',
 'object': 'text_completion',
 'created': 1699117352,
 'model': 'mistral-7b-v0.1.Q4_K_M.gguf',
 'choices': [{'text': ' I am trying to use it as a callback in my training loop.',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 33, 'completion_tokens': 16, 'total_tokens': 49}}

In [43]:
# Print only the generated text of the first completion choice.
print(output["choices"][0]["text"])