In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text, clean_text
from models.llama import llama_cpp
from utils.response_generator import get_response, extract_keyword
from rouge_score import rouge_scorer
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Let pandas display up to 1000 rows before it truncates DataFrame output.
pd.options.display.max_rows = 1000


In [3]:
import os  # required for os.environ below; it is not among the imports in the first cell

# macOS workaround: this environment variable has to be set to prevent issues
# when running transformers.
# NOTE(review): torch and faiss are already imported in the first cell — confirm the
# variable still takes effect when set this late, or move it ahead of those imports.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Read in Data

In [4]:
# Base directory for every data path below: the resolved parent of the current
# working directory, kept as a plain string so paths can be built with `+`.
parent_of_cwd = pathlib.Path.cwd().parent
pwd = str(parent_of_cwd.resolve())

In [5]:
# Locations of the raw inputs, relative to `pwd`.
booklet_dir = pwd + "/data/data/booklets/"
train_csv_path = pwd + "/data/data/Train.csv"

# Booklets are loaded through the project helper; the training set is a plain CSV.
df_booklet = read_booklets(booklet_dir)
df_train = pd.read_csv(train_csv_path)

# Let's clean some of the text

In [6]:
# Run clean_text over every value of the `text` column and store the result
# alongside the original in a new `cleanText` column.
df_booklet['cleanText'] = df_booklet['text'].map(clean_text)

In [7]:
# Count the space-separated tokens in each raw `text` value.
# Splitting on a single " " means consecutive spaces produce empty tokens that are counted too.
df_booklet['numWords'] = df_booklet['text'].map(lambda raw_text: len(raw_text.split(" ")))

In [8]:
# Keep only the rows whose `numWords` is below 5.
# NOTE(review): as written this retains the very short entries and drops the longer
# ones — confirm whether `>= 5` was intended before rebuilding the embeddings/index.
has_few_words = df_booklet['numWords'].lt(5)
df_booklet_clean = df_booklet[has_few_words]

The paragraphs are long, so we might need to consider splitting the text into sentences to make them shorter.

It seems like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrieve all relevant text from booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Try simple model
I will first use a sentence-transformers model (currently `multi-qa-mpnet-base-dot-v1`) as the sentence embedder, and then a Llama-2-7B-Chat GGUF model (run through llama.cpp) as the LLM.

- Download: huggingface-cli download TheBloke/Llama-2-7b-Chat-GGUF llama-2-7b-chat.Q5_K_M.gguf --local-dir . --local-dir-use-symlinks False

- Then run: pip install llama-cpp-python==0.2.39


1. Embed all sentences

In [9]:
# Name of the sentence-embedding checkpoint handed to the project's Embedder wrapper.
embedding_model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
embedding_model = Embedder(embedding_model_name)

In [10]:
# If you have not created the embeddings yet, uncomment below
# booklet_embeddings = embedding_model.embed(df_booklet_clean['cleanText'].values)
# np.save(file=(pwd + "/data/data/resources/embeddings_cleanv3" ), arr=booklet_embeddings)

In [11]:
# index train answers
# train_embeddings = embedding_model.embed(df_train['Question Answer'].values)
# np.save(file=(pwd + "/data/data/resources/embeddings_train" ), arr=train_embeddings)

2. Create a FAISS index for search.

In [12]:
# If you have not created the index yet, uncomment below
# fastIndex = create_faise_index(booklet_embeddings)

# # Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index_cleanv3.faiss")
# df_booklet_clean.to_csv(pwd + "/data/data/resources/bookletv3.csv")

In [13]:
# If you have not created the index yet, uncomment below
# Create the index for the train answers
# fastIndex_train = create_faise_index(train_embeddings)

# # Save the index
# faiss.write_index(fastIndex_train, pwd + "/data/data/resources/train_index.faiss")

In [14]:
# Load the two previously saved FAISS indexes from the resources folder:
# one for the booklet text and one for the training data.
booklet_index_path = pwd + "/data/data/resources/paragraph_index_cleanv3.faiss"
train_index_path = pwd + "/data/data/resources/train_index.faiss"

fastIndex = faiss.read_index(booklet_index_path)
fastIndex_train = faiss.read_index(train_index_path)

3. Search embeddings and get response

In [15]:
# Load the quantised Llama-2 chat GGUF file located directly under `pwd`,
# passing gpu=True to the project's llama_cpp wrapper.
llm_weights_path = pwd + "/llama-2-7b-chat.Q5_K_M.gguf"
llm_model = llama_cpp(llm_weights_path, gpu=True)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/altasaunders/Alta_projects/zindi_llm/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 ll

# Get performance

Uncomment if you want to test on the train data

In [16]:
# df_train = pd.read_csv(pwd +  "/data/data/Train.csv")

In [17]:
# scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

In [18]:
# %%time
# rouge_scores = []
# for question in df_train.head(1)['Question Text'].values.tolist():
#     print("start")
#     df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
#     response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
#     response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=6)
#     scores = scorer.score(response['answer'], question)
#     rouge_scores.append(scores)


In [19]:
# rouge_scores

# Make Predictions

In [20]:
# Questions to answer for the submission.
test_csv_path = pwd +  "/data/data/Test.csv"
df_test = pd.read_csv(test_csv_path)

In [21]:
# Sanity check: display (rows, columns) of the test set.
df_test.shape

(499, 2)

In [44]:
# %%time
# For every test question: retrieve the closest booklet text and the closest
# training questions, ask the LLM for an answer, then attach keywords taken
# from the retrieved text.
# NOTE(review): the commented-out index-building cells embed `df_booklet_clean`, but
# `df_booklet` is what is passed to search_content here — confirm that index hits are
# mapped back to the right rows.
responses = []
for question in df_test['Question Text'].tolist():
    df_book_results, df_question_results = search_content(
        query=question,
        df_sentances=df_booklet,
        df_questions=df_train,
        book_index=fastIndex,
        question_index=fastIndex_train,
        embedder=embedding_model,
        k=5,
    )
    response = get_response(
        text=question,
        llm=llm_model,
        df_book_matches=df_book_results,
        df_question_matches=df_question_results,
    )

    # Build the keyword source as one space-separated string. Adding the two value
    # arrays element-wise (as before) glued each booklet text to an answer without a
    # separator, failed when the two result sets had different lengths, and passed
    # numpy's bracketed/quoted array repr to the keyword extractor.
    retrieved_texts = [*df_book_results['text'], *df_question_results['Question Answer']]
    keyword_source = " ".join(str(retrieved) for retrieved in retrieved_texts)
    response["keywords"] = extract_keyword(keyword_source, top_n=5)
    responses.append(response)

# One row per question, in the same order as df_test, so ID and question text
# line up with the default integer index of both frames.
df_responses = pd.DataFrame(responses)
df_responses['ID'] = df_test['ID']
df_responses['Question'] = df_test['Question Text']
# df_responses[["Question", "question_answer", "reference_document", "paragraph(s)_number", "keywords", "ID"]].to_csv(pwd + "/data/data/answers.csv")

Llama.generate: prefix-match hit

llama_print_timings:        load time =    3414.05 ms
llama_print_timings:      sample time =      32.15 ms /   118 runs   (    0.27 ms per token,  3670.52 tokens per second)
llama_print_timings: prompt eval time =     527.33 ms /    11 tokens (   47.94 ms per token,    20.86 tokens per second)
llama_print_timings:        eval time =   10666.92 ms /   117 runs   (   91.17 ms per token,    10.97 tokens per second)
llama_print_timings:       total time =   11878.56 ms /   128 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    3414.05 ms
llama_print_timings:      sample time =       0.33 ms /     1 runs   (    0.33 ms per token,  3067.48 tokens per second)
llama_print_timings: prompt eval time =   17275.79 ms /  1742 tokens (    9.92 ms per token,   100.83 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       to

# Create Submission

In [None]:
# Relabel the response columns by position with the names used from here on.
# NOTE(review): this relies on the current column order of df_responses — verify it
# against the keys returned by get_response.
submission_column_names = [
    'question_answer',
    'reference_document',
    'paragraph(s)_number',
    'keywords',
    'ID',
    'Question',
]
df_responses.columns = submission_column_names

In [None]:
# Substitute a fixed placeholder wherever the answer is exactly the empty string;
# every other answer is left untouched.
answers = df_responses["question_answer"]
df_responses["question_answer"] = answers.mask(answers == "", "Could not retrieve answer")

In [None]:
# Reshape to long format: one row per (ID, field) pair, with the field name in
# `variable` and its content in `value`.
target_fields = ['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"]
df_submission = df_responses.melt(id_vars=['ID'], value_vars=target_fields)

In [None]:
# Build the final submission table:
#   1. make each ID unique per field by appending "_<field name>",
#   2. relabel the three columns by position so the content column is "Target",
#   3. keep only ID/Target and use ID as the index.
df_submission = (
    df_submission
    .assign(ID=lambda long_frame: long_frame['ID'] + '_' + long_frame['variable'])
    .set_axis(["ID", "variable", "Target"], axis=1)
    .loc[:, ['ID', "Target"]]
    .set_index("ID")
)

In [None]:
# Write the submission (index = ID, single Target column) to the submissions folder.
submission_path = pwd + "/data/submissions/submission_v3.csv"
df_submission.to_csv(submission_path)