In [2]:
import pandas as pd
import numpy as np
import pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoTokenizer, BitsAndBytesConfig
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text
from models.llama import llama
from utils.response_generator import get_response, extract_keyword


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'

# Read in Data

In [None]:
# Parent of the current working directory, resolved to an absolute path and
# kept as a plain string because later cells build file paths by concatenation.
pwd = str(pathlib.Path.cwd().parent.resolve())

In [None]:
# Read the booklet data through the project helper, pointing it at the
# booklets folder under `pwd`. Later cells rely on a 'text' column in the result.
df_booklet = read_booklets(f"{pwd}/data/data/booklets/")

Since we need to create embeddings of the text, it is important to know how long each text is, because length can affect tokenization for some models (long inputs may end up being truncated).

In [None]:
# Record the character count of every text entry; the maximum shows whether
# entries are long enough that some embedding models may truncate them.
# `.str.len()` is vectorized and yields NaN for missing text instead of raising
# TypeError the way a Python-level `len()` loop would.
df_booklet['textLength'] = df_booklet['text'].str.len()
print("Maximum Character length is: ", df_booklet['textLength'].max())

These paragraphs are long, so we might need to consider splitting the text into sentences to make them shorter.

It seems like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrieve all relevant text from the booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Try simple model
I will first use all-mpnet-base-v2 as the sentence embedder, and then I will use Llama as the LLM.

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: `pip install llama-cpp-python==0.1.78`


1. Embed all sentences

In [None]:
# Build the project Embedder used to embed both the booklet text and the query.
# NOTE(review): the notes above name all-mpnet-base-v2 as the sentence embedder,
# but "bert-base-uncased" is passed here — confirm which model was used to
# build the saved index, since query and index embeddings must come from the same model.
embedding_model = Embedder("bert-base-uncased")

In [None]:
# If you have not created the embeddings yet, uncomment the line below
# booklet_embeddings = embedding_model.embed(df_booklet['text'].values)

2. Create the FAISS index for search.

In [None]:
# If you have not created the index yet, uncomment the line below
# fastIndex = create_faise_index(booklet_embeddings)

In [None]:
# Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index.faiss")
# df_booklet.to_csv(pwd + "/data/data/resources/booklet.csv")


In [None]:
# Read in index
# Load the previously saved FAISS index from the resources folder under `pwd`.
fastIndex = faiss.read_index(f"{pwd}/data/data/resources/paragraph_index.faiss")

3. Search embeddings and get response

In [None]:
# Instantiate the project's llama wrapper with the downloaded model file,
# which is expected to sit directly under `pwd`.
llm_model = llama(f"{pwd}/llama-2-7b-chat.ggmlv3.q4_1.bin")

In [None]:
# Example question used as the search phrase for the rest of the notebook.
text = (
    "Which key information should be recorded during a register review about suspected cases, "
    "and how is this information utilized in the subsequent investigation activities?"
)

In [None]:
# Search the index for booklet entries matching the question.
# k=5 presumably caps the number of matches returned — confirm in utils.utils.search_content.
df_search_results = search_content(
    query=text,
    df_sentances=df_booklet,
    index=fastIndex,
    embedder=embedding_model,
    k=5,
)

In [None]:
# Ask the LLM to answer the question, handing it the matched booklet entries.
response = get_response(
    text=text,
    llm=llm_model,
    df_matches=df_search_results,
)

4. Find keywords

In [None]:
# Attach keywords extracted from the answer text to the response itself.
# top_n=6 presumably limits how many keywords are kept — confirm in extract_keyword.
response["keywords"] = extract_keyword(response["text"], top_n=6)