In [19]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text
from models.llama import llama
from utils.response_generator import get_response, extract_keyword
from rouge_score import rouge_scorer
from tqdm import tqdm

In [4]:
# macOS workaround: this environment variable has to be set to prevent issues when running transformers.
# `os` is imported here because the notebook's import cell does not bring it into scope,
# which would make this cell fail with a NameError on a fresh kernel.
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Read in Data

In [5]:
# Absolute path of the parent of the current working directory, kept as a plain string
# because the cells below build file paths from it by string concatenation.
project_root = pathlib.Path.cwd().parent.resolve()
pwd = str(project_root)

In [6]:
# Load the booklets through the project helper from the booklets folder under `pwd`.
booklet_dir = pwd + "/data/data/booklets/"
df_booklet = read_booklets(booklet_dir)

Since we need to create embeddings of the text, it is important to know how long each text is, because the length can influence tokenization for some models (long text can end up being truncated).

In [7]:
# Store the character count of every booklet text entry and report the longest one.
df_booklet['textLength'] = df_booklet['text'].map(len)
longest_text = df_booklet['textLength'].max()
print("Maximum Character lenght is: ", longest_text)

Maximum Character lenght is:  18747


The paragraphs are long, so we might need to consider splitting the text into sentences to make them shorter.

It seems like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrieve all relevant text from booklet
- format search phrase and retrieved text into a prompt for the LLM
- send prompt to LLM and return answer

# Try simple model
I will first use multi-qa-mpnet-base-dot-v1 as the sentence embedder and then I will use Llama as the LLM.

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: pip install llama-cpp-python==0.1.78


1. Embed all sentences

In [8]:
# Checkpoint name handed to the project's Embedder wrapper; the resulting object is
# later passed to search_content as the `embedder`.
EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
embedding_model = Embedder(EMBEDDING_MODEL_NAME)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [9]:
# If you have not created the embeddings yet, uncomment the line below
# booklet_embeddings = embedding_model.embed(df_booklet['text'].values)

In [10]:
# np.save(file=(pwd + "/data/data/resources/embeddings" ), arr=booklet_embeddings)

2. Create a FAISS index for search.

In [11]:
# If you have not created the index yet, uncomment the line below
# fastIndex = create_faise_index(booklet_embeddings)

In [12]:
# Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index.faiss")
# df_booklet.to_csv(pwd + "/data/data/resources/booklet.csv")


In [13]:
# Read in index
# Load the previously saved FAISS index from the resources folder.
index_path = pwd + "/data/data/resources/paragraph_index.faiss"
fastIndex = faiss.read_index(index_path)

3. Search embeddings and get response

In [14]:
# Build the LLM through the project's llama wrapper from the model file located in `pwd`.
llama_weights_path = pwd + "/llama-2-7b-chat.ggmlv3.q4_1.bin"
llm_model = llama(llama_weights_path)

llama.cpp: loading model from /Users/brendentaylor/Documents/zindi_llm/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+  256.00 MB per state)
llama_new_context_with_model: 

# Get performance

Run the cells below if you want to evaluate on the train data.

In [16]:
# Read the training questions into a DataFrame.
train_csv_path = pwd + "/data/data/Train.csv"
df_train = pd.read_csv(train_csv_path)

In [17]:
# ROUGE scorer restricted to the rouge1 metric, with stemming switched on.
rouge_metrics = ['rouge1']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

In [20]:
%%time
rouge_scores = []
for question in tqdm(df_train.head(1)['Question Text'].values.tolist()):
    print("start")
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=6)
    scores = scorer.score(response['answer'], question)
    rouge_scores.append(scores)


  0%|          | 0/1 [00:00<?, ?it/s]

start


llama_tokenize_with_model: too many tokens

llama_print_timings:        load time = 16045.56 ms
llama_print_timings:      sample time =   101.80 ms /   128 runs   (    0.80 ms per token,  1257.42 tokens per second)
llama_print_timings: prompt eval time = 16045.52 ms /    33 tokens (  486.23 ms per token,     2.06 tokens per second)
llama_print_timings:        eval time = 1234603.11 ms /   127 runs   ( 9721.28 ms per token,     0.10 tokens per second)
llama_print_timings:       total time = 1251213.91 ms
100%|██████████| 1/1 [20:51<00:00, 1251.52s/it]

CPU times: user 5min 38s, sys: 4min 47s, total: 10min 26s
Wall time: 20min 51s





In [21]:
# Show the ROUGE results collected by the evaluation loop.
rouge_scores

[{'rouge1': Score(precision=0.5, recall=0.11688311688311688, fmeasure=0.1894736842105263)}]

# Make Predictions

In [15]:
# Read the test questions into a DataFrame.
test_csv_path = pwd + "/data/data/Test.csv"
df_test = pd.read_csv(test_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/brendentaylor/Documents/zindi_llm/data/data/Test.csv'

In [18]:
# Check the (rows, columns) size of the test set.
df_test.shape

(499, 2)

In [20]:
%%time
responses = []
i = 0
for question in df_test['Question Text'].values.tolist():
    print("question: ", i)
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=5)
    responses.append(response)
    i += 1

df_responses = pd.DataFrame(responses)
df_responses['ID'] = df_test['ID']
df_responses['Question'] = df_test['Question Text']
df_responses[["Question", "answer", "book", "Paragraph", "keywords", "ID"]].to_csv(pwd + "/data/data/answers_test.csv")

In [25]:
# Print each question followed by its generated answer, separated by a line of asterisks.
for question_text, answer_text in zip(df_responses['Question'], df_responses['answer']):
    print(question_text)
    print(answer_text)
    print("*******************")

What is the definition of "unusual event"
Based on the information provided, an "unusual event" can be defined as any event or occurrence that is unusual for a specific community or during a certain time of year. This can include:  1. A cluster of deaths from an unknown cause in the same household or adjacent households. 2. Any unusual illness or sudden death in the community within a specific time period (e.g. one week). 3. An unusual death or number of deaths among animals, such as livestock, birds, or rodent species, or an unusually
*******************
What is Community Based Surveillance (CBS)?
*******************
What kind of training should members of VHC receive?
1. What should VHCs receive in terms of training? A) Basic first aid training only B) Training on how to recognize certain diseases or health conditions for reporting suspect cases C) Surveillance and response skills training D) Improved interpersonal skills training using interactive techniques, adult learning methods 