In [1]:
import os
import pathlib

import pandas as pd
import numpy as np
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text
from models.llama import llama
from utils.response_generator import get_response, extract_keyword
from rouge_score import rouge_scorer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# macOS workaround: set KMP_DUPLICATE_LIB_OK so the process does not abort when running transformers.
# NOTE(review): this runs after torch/faiss have already been imported; if the error still
# appears, move this assignment ahead of those imports.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Read in Data

In [3]:
# Project root = parent of the directory the notebook is started from, kept as a plain string.
pwd = str(pathlib.Path.cwd().parent.resolve())

In [4]:
# Load the booklet text from the booklets folder under the project root.
df_booklet = read_booklets(f"{pwd}/data/data/booklets/")

Since we need to create embeddings of the text, it is important to know how long each text is, because length can affect tokenization for some models (long text can end up being truncated).

In [5]:
# Store the character count of every booklet text and report the largest one.
df_booklet['textLength'] = df_booklet['text'].map(len)
print("Maximum Character lenght is: ", df_booklet['textLength'].max())

Maximum Character lenght is:  18747


These paragraphs are long, so we might need to consider splitting the text into sentences to make the pieces shorter.

It seems like the following steps will have to be taken:

- embed the booklet
- embed the search phrase
- use the search phrase embedding to search for relevant text in the booklet
- retrieve all relevant text from the booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Try simple model
I will first use multi-qa-mpnet-base-dot-v1 as the sentence embedder and then I will use Llama as the LLM.

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: `pip install llama-cpp-python==0.1.78`


1. Embed all sentences

In [6]:
# Build the project Embedder for the multi-qa-mpnet-base-dot-v1 checkpoint; this object is
# later handed to search_content as the `embedder` for each query.
embedding_model = Embedder("sentence-transformers/multi-qa-mpnet-base-dot-v1")

In [7]:
# If you have not created the embeddings yet, uncomment below
# booklet_embeddings = embedding_model.embed(df_booklet['text'].values)

In [8]:
#  np.save(file=(pwd + "/data/data/resources/embeddings" ), arr=booklet_embeddings)

2. Create a FAISS index for search.

In [9]:
# If you have not created the index yet, uncomment below
# fastIndex = create_faise_index(booklet_embeddings)

In [10]:
# Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index.faiss")
# df_booklet.to_csv(pwd + "/data/data/resources/booklet.csv")


In [11]:
# Load the previously saved FAISS index from the resources folder.
fastIndex = faiss.read_index(f"{pwd}/data/data/resources/paragraph_index.faiss")

3. Search embeddings and get response

In [12]:
# Instantiate the Llama wrapper from the quantised model file stored in the project root.
llm_model = llama(f"{pwd}/llama-2-7b-chat.ggmlv3.q4_1.bin")

llama.cpp: loading model from /Users/altasaunders/Alta_projects/zindi_llm/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+  256.00 MB per state)
llama_new_context_with_mode

# Get performance

Uncomment if you want to test on the train data

In [13]:
# df_train = pd.read_csv(pwd +  "/data/data/Train.csv")

In [14]:
# scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

In [15]:
# %%time
# rouge_scores = []
# for question in df_train.head(1)['Question Text'].values.tolist():
#     print("start")
#     df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
#     response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
#     response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=6)
#     scores = scorer.score(response['answer'], question)
#     rouge_scores.append(scores)


In [16]:
# rouge_scores

# Make Predictions

In [17]:
# Read the test questions that need answers.
df_test = pd.read_csv(f"{pwd}/data/data/Test.csv")

In [18]:
# Sanity check: (rows, columns) of the test set.
df_test.shape

(499, 2)

In [21]:
# %%time
# For every test question: search the index for matching booklet text (k=5), ask the LLM
# for an answer based on those matches, and attach keywords extracted from the matched text.
responses = []
for question in df_test['Question Text'].tolist():
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    # NOTE(review): str() of a NumPy array is its printed repr (brackets/quotes included);
    # confirm that this is the input extract_keyword is meant to receive.
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=5)
    responses.append(response)

df_responses = pd.DataFrame(responses)
# Attach ID and question by position (.values) rather than by index alignment: the rows were
# produced in df_test order, and alignment would silently insert NaN if df_test ever carried
# a non-default index (e.g. after filtering or sampling).
df_responses['ID'] = df_test['ID'].values
df_responses['Question'] = df_test['Question Text'].values
df_responses[["Question", "question_answer", "reference_document", "paragraph(s)_number", "keywords", "ID"]].to_csv(pwd + "/data/data/answers.csv")


llama_print_timings:        load time = 56059.16 ms
llama_print_timings:      sample time =   101.72 ms /   117 runs   (    0.87 ms per token,  1150.24 tokens per second)
llama_print_timings: prompt eval time = 56059.09 ms /   395 tokens (  141.92 ms per token,     7.05 tokens per second)
llama_print_timings:        eval time = 87869.04 ms /   116 runs   (  757.49 ms per token,     1.32 tokens per second)
llama_print_timings:       total time = 144283.03 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 56059.16 ms
llama_print_timings:      sample time =   111.77 ms /   128 runs   (    0.87 ms per token,  1145.22 tokens per second)
llama_print_timings: prompt eval time = 34200.52 ms /   272 tokens (  125.74 ms per token,     7.95 tokens per second)
llama_print_timings:        eval time = 97837.84 ms /   127 runs   (  770.38 ms per token,     1.30 tokens per second)
llama_print_timings:       total time = 132433.96 ms
Llama.generate: prefix-match hit

llama_p

# Create Submission

In [26]:
# Relabel the six columns of df_responses in place.
# NOTE(review): this assignment is positional, so it assumes the frame already has exactly
# six columns in this order; a different order would be mislabelled without any error.
df_responses.columns = ['question_answer', 'reference_document', 'paragraph(s)_number', 'keywords', 'ID', 'Question']

In [27]:
# Preview the first rows to check the answers and column labels.
df_responses.head()

Unnamed: 0,question_answer,reference_document,paragraph(s)_number,keywords,ID,Question
0,1. Unexplained deaths - The sudden and unexpla...,TG Booklet 2,133-514,"[unusual, community, time, sick, unusual death]",Q4,"What is the definition of ""unusual event"""
1,1. What is Community Based Surveillance (CBS)?...,TG Booklet 1,224-438,"[Steps for establishing, formal surveillance s...",Q5,What is Community Based Surveillance (CBS)?
2,1. Clinical skills such as recognizing signs a...,TG Booklet 1,456-468,"[reporting suspect cases, health staff, mentor...",Q9,What kind of training should members of VHC re...
3,"Based on the information provided, here is how...",TG Booklet 2,135-398,"[Indicator-Based Surveillance, Surveillance, I...",Q10,What is indicator based surveillance (IBS)?
4,1. Community Based Surveillance (CBS) CBS is a...,TG Booklet 1,81-435,"[Case-based surveillance involves, Case-based ...",Q13,What is Case based surveillance?


In [37]:
# Reshape wide -> long: one row per (ID, answer field) pair.
df_submission = df_responses.melt(
    id_vars=['ID'],
    value_vars=['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"],
)

In [39]:
# Build the final submission table: the row key is "<ID>_<field name>", the melted value
# column becomes "Target", and the key is used as the index.
df_submission = (
    df_submission
    .assign(ID=df_submission['ID'] + '_' + df_submission['variable'])
    .rename(columns={"value": "Target"})
    .loc[:, ['ID', "Target"]]
    .set_index("ID")
)

In [43]:
# Write the submission file (the ID index is written as the first column).
df_submission.to_csv(f"{pwd}/data/submissions/submission_v1.csv")