In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text, clean_text
from models.llama import llama_cpp
from utils.response_generator import get_response, extract_keyword, get_paragraph_info
from rouge_score import rouge_scorer
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# macOS workaround: this environment variable has to be set to avoid problems
# when running transformers.
# NOTE(review): this runs after the library imports in the first cell -- confirm
# that is early enough for the setting to take effect.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Read in Data

In [3]:
# Project root as a string: the parent of the current working directory.
# Every data/model path below is built from it.
pwd = str(pathlib.Path.cwd().parent.resolve())

In [4]:
# Load the booklets via the project helper and the training CSV via pandas.
df_booklet = read_booklets(f"{pwd}/data/data/booklets/")
df_train = pd.read_csv(f"{pwd}/data/data/Train.csv")

# Let's clean some of the text

In [5]:
# Keep the raw text and store the output of clean_text for each entry alongside it.
df_booklet['cleanText'] = df_booklet['text'].map(clean_text)

In [6]:
# Word count per raw text entry, splitting on single spaces only.
df_booklet['numWords'] = df_booklet['text'].map(lambda passage: len(passage.split(" ")))

In [7]:
# Concatenate every raw booklet text entry into one space-separated string.
docs = " ".join(df_booklet['text'].tolist())

In [8]:
# Cut the combined booklet text into overlapping chunks for embedding/search.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)
all_splits = text_splitter.split_text(docs)

In [9]:
# Persist the chunks; this file is read back later as df_docs.
# The DataFrame index is written too (to_csv default).
pd.DataFrame({"text": all_splits}).to_csv(f"{pwd}/data/data/resources/docs.csv")

The paragraphs are long, so we might need to consider splitting the text into sentences to make them shorter.

It seems like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrieve all relevant text from the booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Try simple model
I will first use `multi-qa-mpnet-base-dot-v1` (the model loaded below) as the sentence embedder and then I will use Llama 2 as the LLM.

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: pip install llama-cpp-python==0.1.78




1. Embed all sentences

In [10]:
# Embedding model used for both the booklet text and the incoming questions.
EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
embedding_model = Embedder(EMBEDDING_MODEL_NAME)

In [11]:
# If you have not created the embeddings yet, uncomment the lines below
# doc_embeddings = embedding_model.embed(all_splits)
# np.save(file=(pwd + "/data/data/resources/doc_embeddings" ), arr=doc_embeddings)

In [12]:
# index train answers
# train_embeddings = embedding_model.embed(df_train['Question Answer'].values)
# np.save(file=(pwd + "/data/data/resources/embeddings_train" ), arr=train_embeddings)

In [13]:
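# If you have not created the embeddings yet, uncomment the lines below
# NOTE: `df_booklet_clean` is not defined in the cells shown here; define it (or use `df_booklet`) before uncommenting.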
# booklet_embeddings = embedding_model.embed(df_booklet_clean['cleanText'].values)
# np.save(file=(pwd + "/data/data/resources/embeddings_cleanv3" ), arr=booklet_embeddings)

2. Create a FAISS index for search.

In [14]:
# If you have not created the index yet, uncomment the lines below
# fastIndex = create_faise_index(doc_embeddings)

# # # Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/doc_index.faiss")

In [15]:
# If you have not created the index yet, uncomment the lines below
# create index for train
# fastIndex_train = create_faise_index(train_embeddings)

# # Save the index
# faiss.write_index(fastIndex_train, pwd + "/data/data/resources/train_index.faiss")

In [16]:
# If you have not created the index yet, uncomment the lines below
# fastIndex = create_faise_index(booklet_embeddings)

# # Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index_cleanv3.faiss")
# df_booklet_clean.to_csv(pwd + "/data/data/resources/bookletv3.csv")

In [17]:
# Load the three pre-built FAISS indexes from the resources folder:
# document chunks, training answers and booklet paragraphs.
fastIndex_docs = faiss.read_index(f"{pwd}/data/data/resources/doc_index.faiss")
fastIndex_train = faiss.read_index(f"{pwd}/data/data/resources/train_index.faiss")
fastIndex_book = faiss.read_index(f"{pwd}/data/data/resources/paragraph_index_cleanv3.faiss")

In [18]:
# Load the text tables that are searched with the FAISS indexes.
# Note: df_booklet is reassigned here and replaces the frame built earlier in the notebook.
df_docs = pd.read_csv(f"{pwd}/data/data/resources/docs.csv")
df_booklet = pd.read_csv(f"{pwd}/data/data/resources/bookletv3.csv")

3. Search embeddings and get response

In [19]:
# Load the quantised Llama 2 chat weights from the project root, with gpu=False.
llm_model = llama_cpp(f"{pwd}/llama-2-7b-chat.ggmlv3.q4_1.bin", gpu=False)

llama.cpp: loading model from /Users/altasaunders/Alta_projects/zindi_llm/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+  256.00 MB per state)
llama_new_context_with_mode

# Make Predictions

In [20]:
# Questions to answer for the submission.
df_test = pd.read_csv(f"{pwd}/data/data/Test.csv")

In [21]:
# Sanity check: (rows, columns) of the test set.
df_test.shape

(499, 2)

In [25]:
# Start the temp submission file with a header row only; the prediction loop
# appends one data row per question afterwards.
# NOTE(review): to_csv writes the index by default, so this header line gets a
# leading empty field that the appended rows (index=False) do not have. The
# read-back cell compensates by assigning seven column names, the last being 'None'.
df_submission = pd.DataFrame(
    columns=['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question']
)
df_submission.to_csv(f"{pwd}/data/submissions/submission_v4_temp.csv")

In [26]:
# Answer every test question and append the result to the temp CSV right away,
# one row per question (no header, no index).
SUBMISSION_COLUMNS = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question']
csv_file_path = pwd + "/data/submissions/submission_v4_temp.csv"  # loop-invariant, built once

for _, row in df_test.iterrows():
    question = row['Question Text']
    question_id = row['ID']  # deliberately not `id`, which would shadow the builtin

    # Chunks matching the question (k=3); they are passed to the LLM and also
    # used for keyword extraction below.
    df_book_results = search_content(query=question, df_sentances=df_docs,
                                     book_index=fastIndex_docs, embedder=embedding_model, k=3)

    response = get_response(text=question, llm=llm_model, df_book_matches=df_book_results)

    # Booklet / paragraph reference looked up via the paragraph-level index.
    book_info = get_paragraph_info(query=question, df_booklet=df_booklet,
                                   embedder=embedding_model, fastIndex=fastIndex_book)
    response.update(book_info)
    response["keywords"] = extract_keyword(str(df_book_results['text'].values), top_n=5)

    df_responses = pd.DataFrame([response])
    df_responses['ID'] = question_id
    df_responses['Question'] = question
    df_responses[SUBMISSION_COLUMNS].to_csv(csv_file_path, mode='a', header=False, index=False)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 64510.40 ms
llama_print_timings:      sample time =    67.87 ms /    78 runs   (    0.87 ms per token,  1149.31 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 59224.04 ms /    78 runs   (  759.28 ms per token,     1.32 tokens per second)
llama_print_timings:       total time = 59460.62 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 64510.40 ms
llama_print_timings:      sample time =   112.65 ms /   128 runs   (    0.88 ms per token,  1136.25 tokens per second)
llama_print_timings: prompt eval time = 49738.27 ms /   361 tokens (  137.78 ms per token,     7.26 tokens per second)
llama_print_timings:        eval time = 96344.98 ms /   127 runs   (  758.62 ms per token,     1.32 tokens per second)
llama_print_timings:       total time = 146480.30 ms
Llama.gen

In [56]:
# Read the rows appended by the prediction loop back into a frame.
temp_submission_path = f"{pwd}/data/submissions/submission_v4_temp.csv"
df_responses = pd.read_csv(temp_submission_path)

# Seven names are assigned positionally: the six real fields plus a trailing
# 'None' placeholder. The header line of the temp file was written with the
# index (one extra field) while the data rows were not, so the last parsed
# column comes out empty.
df_responses.columns = [
    'answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question', 'None',
]

There are a few instances where no prediction was made. We will replace these with a standard response built from the best-matching booklet text.

In [101]:
def fill_default_answer(row):
    """Return the row's answer, or a fallback built from the best-matching text.

    Args:
        row: Mapping or Series with an 'answer' entry and a 'Question' entry.

    Returns:
        ``row['answer']`` unchanged when it holds a usable value. When the
        answer is missing (NaN/None) or is a blank / whitespace-only string,
        the question is looked up with ``search_content`` (k=1) and the
        returned text, with non-ASCII characters removed, is wrapped in the
        standard sentence "The answer is in the following text: ...".

    Relies on the notebook globals ``df_docs``, ``fastIndex_docs`` and
    ``embedding_model``.
    """
    answer = row['answer']

    # A whitespace-only string is as useless as a missing value, so both fall
    # through to the default response.
    is_blank = isinstance(answer, str) and not answer.strip()
    if not (is_blank or pd.isna(answer)):
        return answer

    df_book_results = search_content(query=row['Question'], df_sentances=df_docs,
                                     book_index=fastIndex_docs, embedder=embedding_model, k=1)

    # Joining the matches directly equals the former split-into-words-and-rejoin.
    booklet_information = " ".join(df_book_results['text'].values.tolist())

    # Remove every run of non-ASCII characters.
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', booklet_information)

    return f"The answer is in the following text: {cleaned_text}"
        

In [102]:
# Row by row: keep the existing answer or substitute the default response.
df_responses['final_answer'] = df_responses.apply(func=fill_default_answer, axis="columns")

# Create Submission

In [107]:
# Inspect the responses together with the new final_answer column.
df_responses

Unnamed: 0,answer,book,paragraphs,keywords,ID,Question,None,final_answer
0,An unusual event refers to any situation that ...,booklet1,313-423,"['acute respiratory infection', 'acute respira...",Q4,"What is the definition of ""unusual event""",,An unusual event refers to any situation that ...
1,CBS is a community-based surveillance system t...,booklet1,20-469,"['Community Based Surveillance', 'subnational ...",Q5,What is Community Based Surveillance (CBS)?,,CBS is a community-based surveillance system t...
2,Training for VHCs should include the following...,booklet3,518-1687,"['high absenteeism', 'absenteeism at school', ...",Q9,What kind of training should members of VHC re...,,Training for VHCs should include the following...
3,Indicator-based surveillance (IBS) is a system...,booklet1,20-400,"['Indicator-based surveillance', 'Surveillance...",Q10,What is indicator based surveillance (IBS)?,,Indicator-based surveillance (IBS) is a system...
4,Case-based surveillance involves ongoing and r...,booklet1,20-345,"['Case-based', 'quarterly or annual', 'Case-ba...",Q13,What is Case based surveillance?,,Case-based surveillance involves ongoing and r...
...,...,...,...,...,...,...,...,...
494,The completeness of surveillance data should b...,booklet4,29-227,"['units submitted reports', 'reporting units s...",Q1229,Where should completeness be evaluated in the ...,,The completeness of surveillance data should b...
495,1. Completeness of recorded data in the patien...,booklet4,29-227,"['units submitted reports', 'reporting units s...",Q1230,Which dimensions of completeness are crucial i...,,1. Completeness of recorded data in the patien...
496,To monitor and improve the completeness of cas...,booklet1,223-345,"['case record unique', 'record unique identifi...",Q1236,How can the completeness of case reporting be ...,,To monitor and improve the completeness of cas...
497,Monitoring the timeliness and completeness of ...,booklet2,609-610,"['disease surveillance systems', 'disease surv...",Q1239,Where should completeness and timeliness of re...,,Monitoring the timeliness and completeness of ...


In [108]:
# Promote the filled-in answers to the 'answer' column, then drop the helper
# columns. Guarded and non-inplace so that re-running this cell is a no-op
# instead of a KeyError on the already-removed columns.
if 'final_answer' in df_responses.columns:
    df_responses['answer'] = df_responses['final_answer']
df_responses = df_responses.drop(columns=['None', 'final_answer'], errors='ignore')

In [110]:
# Rename to the submission field names by column name rather than by position,
# so a change in column order cannot silently mislabel the data and the cell
# can be re-run safely.
df_responses = df_responses.rename(columns={
    'answer': 'question_answer',
    'book': 'reference_document',
    'paragraphs': 'paragraph(s)_number',
})

# Fail loudly if any field needed for the submission is absent.
missing_columns = {
    'question_answer', 'reference_document', 'paragraph(s)_number', 'keywords', 'ID', 'Question',
} - set(df_responses.columns)
assert not missing_columns, f"missing submission columns: {sorted(missing_columns)}"

In [111]:
# Long format: one row per (question ID, submission field) pair.
df_submission = df_responses.melt(
    id_vars=['ID'],
    value_vars=['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"],
)

In [112]:
# Build the submission keys "<question ID>_<field name>", call the melted value
# column "Target", and keep only those two with ID as the index.
df_submission = (
    df_submission
    .assign(ID=lambda frame: frame['ID'] + '_' + frame['variable'])
    .rename(columns={'value': "Target"})
    [['ID', "Target"]]
    .set_index("ID")
)

In [114]:
# Write the final submission file; the ID index becomes the first CSV column.
df_submission.to_csv(f"{pwd}/data/submissions/submission_v4.csv")