In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text
from models.llama import llama_cpp
from utils.response_generator import get_response, extract_keyword
from rouge_score import rouge_scorer
import re
from tqdm import tqdm
import os

In [2]:
# Silly Mac that forces me to change the environmental variable to prevent issues running transformers
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Read in Data

In [3]:
pwd  = str(pathlib.Path().cwd().parent.resolve())

In [4]:
df_booklet = read_booklets((pwd + "/data/data/booklets/"))

Since we need to create embeddings of the text, it is important to know how long text is since it can influence the tokenization for some models (can end up truncating text)

In [5]:
df_booklet['textLength'] = [len(text) for text in df_booklet['text']]
print("Maximum Character lenght is: ", df_booklet['textLength'].max())

Maximum Character lenght is:  18747


# Lets Clean some of the text

In [43]:
def clean_text(text):
    # Remove newline characters
    text = text.replace("\n", "")
    
    # Remove non-alphabetic characters and extra spaces
    text = re.sub(r'[^A-Za-z ]+', '', text)

    # Remove tabs
    text = re.sub(r'\\t', '', text)
    
    # Remove specific substring 'BOOKLET ONE'
    text = text.replace('BOOKLET ONE', '')
    
    # Check if the cleaned text is empty or contains only spaces
    if text.isupper():
        text = ''  # Return None for rows to be removed
    
    return text

In [44]:
df_booklet['cleanText'] = df_booklet['text'].apply(clean_text)

In [45]:
df_booklet.head()

Unnamed: 0,index,text,book,textLength,cleanText
1,3,BOOKLET SIX: SECTION 11,booklet6,23,
5,7,TECHNICAL GUIDELINES FOR INTEGRATED DISEASE SU...,booklet6,92,
7,9,BOOKLET SIX: SECTION 11,booklet6,23,
10,12,This booklet comprises the following sections ...,booklet6,193,This booklet comprises the following sections ...
11,13,Table of Contents,booklet6,17,Table of Contents


In [46]:
# Remove empty rows from dataframe
df_booklet = df_booklet[df_booklet['cleanText'].str.strip() != '']

# Remove rows where cleanText length is less than 15 chars
df_booklet = df_booklet[(df_booklet['cleanText'].str.len() >= 15)]

In [47]:
df_booklet.head()

Unnamed: 0,index,text,book,textLength,cleanText
10,12,This booklet comprises the following sections ...,booklet6,193,This booklet comprises the following sections ...
11,13,Table of Contents,booklet6,17,Table of Contents
13,15,ACKNOWLEDGMENTS\tvii,booklet6,19,ACKNOWLEDGMENTSvii
14,16,LIST OF ABBREVIATIONS\tix,booklet6,24,LIST OF ABBREVIATIONSix
16,18,Acute haemorrhagic fever syndrome\t4,booklet6,35,Acute haemorrhagic fever syndrome


In [11]:
df_booklet.to_csv(pwd + "/data/data/resources/booklet_clean.csv")

There paragraphs are long, so we might need to consider spliting text on sentences to make them shorter.

It seem like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrive all relevant text from booklet
- format search phrase and into prompt for LLM
- Send promt to LLM and return answer

# Try simple model
I will first use all-mpnet-base-v2  as the sentance embedder and then I will use Llama as the LLM .

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: pip install llama-cpp-python==0.1.78


1. Embed all sentances

In [21]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

TypeError: SentenceTransformer.__init__() got an unexpected keyword argument 'normalize_embeddings'

In [18]:
embeddings_file = pwd + "/data/data/resources/embeddings_clean.npy"
if not os.path.isfile(embeddings_file):
    booklet_embeddings = embedding_model.encode(df_booklet['cleanText'].values, show_progress_bar=True)
    np.save(file=(pwd + "/data/data/resources/embeddings_clean" ), arr=booklet_embeddings)

Batches:   0%|          | 0/161 [00:00<?, ?it/s]

In [12]:
# embedding_model = Embedder("sentence-transformers/multi-qa-mpnet-base-dot-v1")

In [13]:
# If you have not create embeddings uncomment below
# embeddings_file = pwd + "/data/data/resources/embeddings_clean.npy"
# if not os.path.isfile(embeddings_file):
#     booklet_embeddings = embedding_model.embed(df_booklet['cleanText'].values)
#     np.save(file=(pwd + "/data/data/resources/embeddings_clean" ), arr=booklet_embeddings)

2. Creat faiss index for search.

In [19]:
# If you have not create the index uncomment below
index_file = pwd + "/data/data/resources/paragraph_index_clean.faiss"
if not os.path.isfile(index_file):
    fastIndex = create_faise_index(booklet_embeddings)
    faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index_clean.faiss")

In [20]:
# Read in index
fastIndex = faiss.read_index( pwd + "/data/data/resources/paragraph_index_clean.faiss")

In [22]:
query = "What is the man eating?"
query_embedding = embedding_model.encode(query)

In [31]:
def search_content(query, df_sentances, index, embedder, k=5):
    """Function used to to returns relevant text based on query
    
    Arguments:
    ----------
    query: str
            query text

    df_sentances: pandas dataframe
                  data frame with text columns that match index

    index: faiss index
           index of text embeddings
    
    k: int
       top number of matches to return

    embedder: Embedder
              embedding model class
    
    Returns:
    --------
    pandas dataframe: dataframe with text from top matches
    """
    
#     query_vector = embedder.embed(query)
    query_vector = query
    query_vector = np.expand_dims(query_vector, axis=0)

    # We set k to limit the number of vectors we want to return
    matched_em, matched_indexes = index.search(query_vector, k)
    ids = matched_indexes[0][0:k]

    df = df_sentances.iloc[ids.tolist() ]

    return df

In [34]:
search_content(query=query_embedding, embedder=embedding_model, df_sentances=df_booklet, index=fastIndex, k=1)

Unnamed: 0,index,text,book,textLength,cleanText
402,404,Food,booklet1,4,Food


3. Search embeddings and get response

In [16]:
llm_model = llama_cpp(pwd + "/llama-2-7b-chat.ggmlv3.q4_1.bin", gpu=True)

llama.cpp: loading model from /Users/brendentaylor/Documents/zindi_llm/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+ 1024.00 MB per state)
llama_new_context_with_model:

# Get performance

Uncomment if you want to test on the train data

In [17]:
df_train = pd.read_csv(pwd +  "/data/data/Train.csv")

In [18]:
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

In [19]:
%%time
rouge_scores = []
for question in tqdm(df_train.head(1)['Question Text'].values.tolist()):
    print("start")
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=6)
    scores = scorer.score(response['answer'], question)
    rouge_scores.append(scores)


  0%|          | 0/1 [00:00<?, ?it/s]

start




100%|██████████| 1/1 [00:00<00:00,  7.51it/s]
Llama.generate: prefix-match hit


In [18]:
rouge_scores

# Make Predictions

In [19]:
df_test = pd.read_csv(pwd +  "/data/data/Test.csv")

In [27]:
df_booklet.iloc[2, 1]

'￼'

In [20]:
df_test.shape

(499, 2)

In [21]:
df_test[df_test['ID']=='Q1125']

Unnamed: 0,ID,Question Text
451,Q1125,What constitutes a complex emergency?


In [22]:
# %%time
responses = []
i = 0
for question in df_test['Question Text'].values.tolist():
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=5)
    responses.append(response)
    i += 1

df_responses = pd.DataFrame(responses)
df_responses['ID'] = df_test['ID']
df_responses['Question'] = df_test['Question Text']
# df_responses[["Question", "question_answer", "reference_document", "paragraph(s)_number", "keywords", "ID"]].to_csv(pwd + "/data/data/answers.csv")


llama_print_timings:        load time = 47539.25 ms
llama_print_timings:      sample time =   111.58 ms /   128 runs   (    0.87 ms per token,  1147.15 tokens per second)
llama_print_timings: prompt eval time = 47539.18 ms /   329 tokens (  144.50 ms per token,     6.92 tokens per second)
llama_print_timings:        eval time = 101783.86 ms /   127 runs   (  801.45 ms per token,     1.25 tokens per second)
llama_print_timings:       total time = 149723.88 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 47539.25 ms
llama_print_timings:      sample time =   114.34 ms /   128 runs   (    0.89 ms per token,  1119.45 tokens per second)
llama_print_timings: prompt eval time = 108047.17 ms /   619 tokens (  174.55 ms per token,     5.73 tokens per second)
llama_print_timings:        eval time = 116352.68 ms /   127 runs   (  916.16 ms per token,     1.09 tokens per second)
llama_print_timings:       total time = 224845.18 ms
Llama.generate: prefix-match hit

llam

# Create Submissoion

In [None]:
df_responses.columns = ['question_answer', 'reference_document', 'paragraph(s)_number', 'keywords', 'ID', 'Question']

In [None]:
df_responses["question_answer"] = ["Could not retrieve answer" if answer == "" else answer for answer in df_responses["question_answer"]]

In [None]:
df_submission = pd.melt(df_responses, id_vars=['ID'], value_vars=['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"])

In [None]:
df_submission['ID'] = df_submission['ID'] + '_' + df_submission['variable']
df_submission.columns = ["ID", "variable", "Target"]
df_submission = df_submission[['ID', "Target"]].set_index("ID")

In [None]:
df_submission.to_csv(pwd + "/data/submissions/submission_v2.csv")