In [1]:
import sys
sys.path.append('../utils/')
sys.path.append('../models/')

In [2]:
import pandas as pd
import numpy as np
import pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoTokenizer, BitsAndBytesConfig
import torch

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from embeddings import Embedder
from preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils import search_content, read_booklets, retrieve_booklet_text, rename_booklets
from llama import llama
from response_generator import get_response, extract_keyword

# Read in Data

In [4]:
pwd  = str(pathlib.Path().cwd().parent.resolve())

In [5]:
rename_booklets((pwd + "/data/data/booklets/"))

In [6]:
df_booklet = read_booklets((pwd + "/data/data/booklets/"))

In [7]:
df_booklet.head(3)

Unnamed: 0,index,text,book
0,2,THIRD EDITION,booklet6
1,3,BOOKLET SIX: SECTION 11,booklet6
2,4,￼,booklet6


Since we need to create embeddings of the text, it is important to know how long text is since it can influence the tokenization for some models (can end up truncating text)

In [8]:
df_booklet['textLength'] = [len(text) for text in df_booklet['text']]
print("Maximum Character lenght is: ", df_booklet['textLength'].max())

Maximum Character lenght is:  18747


There paragraphs are long, so we might need to consider spliting text on sentences to make them shorter.

It seem like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrive all relevant text from booklet
- format search phrase and into prompt for LLM
- Send promt to LLM and return answer

# Try simple model
I will first use all-mpnet-base-v2  as the sentance embedder and then I will use Llama as the LLM .

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: pip install llama-cpp-python==0.1.78


1. Embed all sentances

In [9]:
embedding_model = Embedder("bert-base-uncased")

In [10]:
# If you have not create embeddings uncomment below
booklet_embeddings = embedding_model.embed(df_booklet['text'].values)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [05:22<00:00,  1.97s/it]


2. Creat faiss index for search.

In [11]:
# If you have not create the index uncomment below
fastIndex = create_faise_index(booklet_embeddings)

In [12]:
# Save the index
faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index.faiss")
df_booklet.to_csv(pwd + "/data/data/resources/booklet.csv")


In [13]:
# Read in index
fastIndex = faiss.read_index( pwd + "/data/data/resources/paragraph_index.faiss")

3. Search embeddings and get response

In [14]:
llm_model = llama(pwd + "/llama-2-7b-chat.ggmlv3.q4_1.bin")

llama.cpp: loading model from /Users/brendentaylor/Documents/zindi_llm/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+  256.00 MB per state)
llama_new_context_with_model: 

In [15]:
text = "Which key information should be recorded during a register review about suspected cases, and how is this information utilized in the subsequent investigation activities?"

In [16]:
df_search_results = search_content(query=text, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.27it/s]


In [17]:
%%time
response = get_response(text=text, llm=llm_model, df_matches=df_search_results)


llama_print_timings:        load time = 21748.47 ms
llama_print_timings:      sample time =    80.45 ms /   121 runs   (    0.66 ms per token,  1504.00 tokens per second)
llama_print_timings: prompt eval time = 21748.42 ms /   391 tokens (   55.62 ms per token,    17.98 tokens per second)
llama_print_timings:        eval time = 155959.42 ms /   120 runs   ( 1299.66 ms per token,     0.77 tokens per second)
llama_print_timings:       total time = 177950.85 ms


4. Find keywords

In [20]:
response

{'answer': 'The key information that should be recorded during a register review about suspected cases includes:\n\n1. Date of onset of symptoms or presentation of disease\n2. Age, sex, and other demographic characteristics of the patient\n3. Location of the case (e.g., hospital, clinic, community)\n4. Nature of the disease or condition (diagnosis or suspicion)\n5. Source of infection or exposure (if known or suspected)\n6. Contact information for the patient (name, address, phone number)\n7. Out',
 'book': 'TG Booklet 2',
 'Paragraph': '743-881'}

In [18]:
response["keywords"] = extract_keyword(response['text'], top_n=6)

KeyError: 'text'