In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text
from models.llama import llama
from utils.response_generator import get_response, extract_keyword
from rouge_score import rouge_scorer



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# `os` is not part of the notebook's import cell, so import it here; without
# this the assignment below raises NameError on a fresh kernel.
import os

# Tell the Intel OpenMP runtime to tolerate a second copy of itself being
# loaded. NOTE(review): presumably needed because torch and faiss each ship
# their own OpenMP library -- confirm, and consider setting this before those
# imports so it is in place when the runtimes initialise.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Read in Data

In [3]:
# Absolute path of the directory one level above the current working
# directory, kept as a string because later cells build file paths from it
# by concatenation.
pwd = str(
    pathlib.Path.cwd()
    .parent
    .resolve()
)

In [4]:
# Load the booklets found under <pwd>/data/data/booklets/ via the project
# helper; later cells read a 'text' column from the result.
df_booklet = read_booklets(
    pwd + "/data/data/booklets/"
)

Since we need to create embeddings of the text, it is important to know how long each text is, because length can affect tokenization for some models (long text can end up being truncated).

In [5]:
# Store the character count of every booklet text next to the text itself,
# then report the largest count.
df_booklet['textLength'] = list(map(len, df_booklet['text']))
print(
    "Maximum Character lenght is: ",
    df_booklet['textLength'].max(),
)

Maximum Character lenght is:  18747


These paragraphs are long, so we might need to consider splitting the text into sentences to make them shorter.

It seems like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrieve all relevant text from booklet
- format search phrase and retrieved text into a prompt for the LLM
- send prompt to LLM and return answer

# Try simple model
I will first use multi-qa-mpnet-base-dot-v1 as the sentence embedder and then I will use Llama 2 (7B chat) as the LLM.

- Download: `wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin`
- Then run: pip install llama-cpp-python==0.1.78


1. Embed all sentences

In [6]:
# Build the project's Embedder wrapper around the chosen sentence-transformers
# checkpoint; the same object is handed to search_content() further down.
embedding_model = Embedder(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1"
)

In [7]:
# If you have not created the embeddings yet, uncomment the line below
# booklet_embeddings = embedding_model.embed(df_booklet['text'].values)

In [8]:
#  np.save(file=(pwd + "/data/data/resources/embeddings" ), arr=booklet_embeddings)

2. Create FAISS index for search.

In [9]:
# If you have not created the index yet, uncomment the line below
# fastIndex = create_faise_index(booklet_embeddings)

In [10]:
# Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index.faiss")
# df_booklet.to_csv(pwd + "/data/data/resources/booklet.csv")


In [11]:
# Read in index
# Load the previously saved FAISS index from the resources folder instead of
# rebuilding it from the embeddings.
fastIndex = faiss.read_index(
    pwd + "/data/data/resources/paragraph_index.faiss"
)

3. Search embeddings and get response

In [12]:
# Instantiate the project's llama wrapper from the quantised Llama-2 7B chat
# weights expected in the project root.
llm_model = llama(
    pwd + "/llama-2-7b-chat.ggmlv3.q4_1.bin"
)

llama.cpp: loading model from /Users/altasaunders/Alta_projects/zindi_llm/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+  256.00 MB per state)
llama_new_context_with_mode

In [None]:
# Example query for the single-question walkthrough. It is defined here
# because no earlier cell assigns `text`, which this cell and the
# get_response() cell after it both read. The wording is the first question
# of the test set.
text = 'What is the definition of "unusual event"'

# Retrieve the k=5 booklet entries that best match the query.
df_search_results = search_content(
    query=text,
    df_sentances=df_booklet,
    index=fastIndex,
    embedder=embedding_model,
    k=5,
)

In [None]:
# Ask the LLM to answer the query using the retrieved booklet matches.
response = get_response(
    text=text,
    llm=llm_model,
    df_matches=df_search_results,
)

4. Find keywords

In [None]:
# Hand the keyword extractor the matched passages as one plain string.
# str() on the NumPy array would pass the array's repr instead, i.e. text
# polluted with brackets, quote characters and escaped newlines.
response["keywords"] = extract_keyword(
    " ".join(df_search_results['text'].astype(str)),
    top_n=6,
)

# Get performance

In [None]:
# Training questions used for the ROUGE evaluation below.
df_train = pd.read_csv(
    pwd + "/data/data/Train.csv"
)

In [None]:
# (rows, columns) of the training set.
df_train.shape

In [None]:
# ROUGE-1 scorer with stemming enabled, used to grade generated answers.
scorer = rouge_scorer.RougeScorer(
    ['rouge1'],
    use_stemmer=True,
)

In [None]:
%%time
rouge_scores = []
for question in df_train.head(5)['Question Text'].values.tolist():
    print("start")
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=5)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=6)
    scores = scorer.score(response['answer'], question)
    rouge_scores.append(scores)


In [None]:
# Show the ROUGE results collected by the evaluation loop.
rouge_scores

# Make Predictions

In [13]:
# Test questions for which predictions are generated.
df_test = pd.read_csv(
    pwd + "/data/data/Test.csv"
)

In [14]:
# (rows, columns) of the test set.
df_test.shape

(499, 2)

In [28]:
%%time
responses = []
for question in df_test.head(1)['Question Text'].values.tolist():
    print("start")
    df_search_results = search_content(query=question, df_sentances=df_booklet, index=fastIndex, embedder=embedding_model, k=3)
    response = get_response(text=question, llm=llm_model, df_matches=df_search_results)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=5)
    responses.append(response)


start


Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit


In [None]:
# Display the most recently generated response dict.
response

{'answer': ' If so, how does this impact your analysis and conclusions?\n                    \n                    The dataset contains the number of times certain types of events occur in a given time period. For example, the data may include the number of accidents that occurred at a particular intersection, or the number of customer complaints received by a company over a certain period of time.\n                    \n                    To analyze the data, you will need to identify patterns and trends in the data, and use this information to draw conclusions about the underlying causes of the events. For example, if you are analyzing the number of accidents at',
 'book': 'TG Booklet 2',
 'Paragraph': '133-514',
 'keywords': ['unusual', 'community', 'time', 'sick', 'unusual death']}

In [17]:
# First test question, shown for comparison with the generated answer.
df_test['Question Text'].head(1)

0    What is the definition of "unusual event"
Name: Question Text, dtype: object