In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text, clean_text, reformat_abbreviations
from models.ollama import Ollama
from utils.response_generator import get_response, extract_keyword, get_paragraph_info
from rouge_score import rouge_scorer
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# macOS workaround: this environment variable has to be set to avoid problems
# when running transformers.
os.environ.update(KMP_DUPLICATE_LIB_OK="True")

# Read in Data

In [3]:
# Resolved path of the parent of the current working directory, kept as a
# plain string because the data paths below are built by string concatenation.
pwd = os.fspath(pathlib.Path.cwd().parent.resolve())

In [4]:
# Load the booklets from the booklets folder and the training set from Train.csv.
booklets_dir = f"{pwd}/data/data/booklets/"
df_booklet = read_booklets(booklets_dir)
df_train = pd.read_csv(f"{pwd}/data/data/Train.csv")

# Let's clean some of the text

In [5]:
# Run every raw booklet text through clean_text and keep the result in a
# separate 'cleanText' column, leaving the raw 'text' column untouched.
df_booklet = df_booklet.assign(
    cleanText=lambda frame: frame['text'].apply(clean_text)
)

# Clean data
We will reformat the abbreviation text. We will then also remove any other unicode characters.

In [6]:
# Index labels of the rows whose text, ignoring leading whitespace, starts
# with "AAR" (presumably the first entry of an abbreviation list).
starts_with_aar = df_booklet['text'].str.lstrip().str.startswith("AAR")
abbreviations = df_booklet.loc[starts_with_aar].index

In [7]:
# Expand every abbreviation row into one row per "ABBR : meaning" entry.
# The new rows are collected first and appended with a single concat instead
# of growing df_booklet inside the loop.
abrv_frames = []
for indx in abbreviations:
    # `abbreviations` holds index labels; using them with iloc assumes that
    # df_booklet still has its default 0..n-1 index at this point.
    source_row = df_booklet.iloc[indx]
    abrv_dict = reformat_abbreviations(source_row['text'])
    new_abrv_text = [(key + " : " + value) for key, value in abrv_dict.items()]
    abrv_df = pd.DataFrame({"text": new_abrv_text,
                            "cleanText": new_abrv_text})
    # Each entry inherits the paragraph index and book of the row it came from.
    abrv_df['index'] = source_row['index']
    abrv_df['book'] = source_row['book']
    # Align with df_booklet's columns (all of them except "level_0").
    abrv_frames.append(abrv_df[df_booklet.drop("level_0", axis=1).columns])

if abrv_frames:
    # ignore_index=True gives the appended rows fresh, unique labels. Without
    # it they are labelled 0..k-1 and the label-based
    # `df_booklet.drop(abbreviations)` that follows would also delete any new
    # abbreviation entry whose label happens to equal an abbreviation label.
    # NOTE: if that collision occurred before, df_booklet now keeps rows it
    # used to lose, so embeddings / FAISS indexes cached on disk from the old
    # output must be regenerated.
    df_booklet = pd.concat([df_booklet, *abrv_frames], ignore_index=True)
    

In [8]:
# Drop every row whose index label is listed in `abbreviations`.
df_booklet = df_booklet.drop(index=abbreviations)

In [9]:
# Keep only the rows whose cleaned text is not exactly a single space.
is_single_space = df_booklet['cleanText'].eq(" ")
df_booklet = df_booklet.loc[~is_single_space]

In [10]:
# Concatenate all cleaned texts, in row order, into one space-separated string.
clean_paragraphs = df_booklet['cleanText'].tolist()
docs = " ".join(clean_paragraphs)

In [11]:
# Split the combined text with a chunk size of 500 and an overlap of 20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=500,
)
all_splits = text_splitter.split_text(docs)

In [12]:
# Persist the chunks, one per row in a "text" column, so they can be reloaded
# from docs.csv further down.
docs_frame = pd.DataFrame(all_splits, columns=["text"])
docs_frame.to_csv(f"{pwd}/data/data/resources/docs.csv")

It seems like the following steps will have to be taken:

- embed the booklet
- embed the search phrase
- use the search phrase embedding to search for relevant text in the booklet
- retrieve all relevant text from the booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Try simple model


1. Embed all sentences

In [13]:
# Embedding model shared by the search and paragraph-lookup calls below.
EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
embedding_model = Embedder(EMBEDDING_MODEL_NAME)

In [14]:
# If you have not created the document embeddings yet, uncomment the lines below
# doc_embeddings = embedding_model.embed(all_splits)
# np.save(file=(pwd + "/data/data/resources/doc_embeddings" ), arr=doc_embeddings)

In [15]:
# If you have not created the booklet embeddings yet, uncomment the lines below
# booklet_embeddings = embedding_model.embed(df_booklet['cleanText'].values)
# np.save(file=(pwd + "/data/data/resources/embeddings_cleanv3" ), arr=booklet_embeddings)

2. Create a FAISS index for search.

In [16]:
# If you have not created the document index yet, uncomment the lines below
# fastIndex = create_faise_index(doc_embeddings)

# # # Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/doc_index.faiss")

In [17]:
# If you have not created the paragraph index yet, uncomment the lines below
# fastIndex = create_faise_index(booklet_embeddings)

# # Save the index
# faiss.write_index(fastIndex, pwd + "/data/data/resources/paragraph_index_cleanv3.faiss")

In [18]:
# Load the two prebuilt FAISS indexes from the resources folder.
resources_dir = pwd + "/data/data/resources"
fastIndex_docs = faiss.read_index(f"{resources_dir}/doc_index.faiss")
fastIndex_book = faiss.read_index(f"{resources_dir}/paragraph_index_cleanv3.faiss")

In [19]:
# Reload the text chunks that were written to docs.csv.
df_docs = pd.read_csv(f"{pwd}/data/data/resources/docs.csv")

3. Search embeddings and get response

# Ollama
To use Ollama, follow these installation instructions:

- Download and install Ollama onto the available supported platforms (including Windows Subsystem for Linux) (https://ollama.com/)
- Fetch an available LLM model via `ollama pull <model>`, e.g. `ollama pull llama2` (this notebook loads the `phi` model, so run `ollama pull phi`)

This will download the default tagged version of the model. Typically, the default tag points to the latest model with the smallest parameter size.

In [20]:
# Local Ollama wrapper configured for the "phi" model with the gpu flag enabled.
llm_model = Ollama(
    gpu=True,
    model="phi",
)

# Measure Performance

In [21]:
# Reload the training questions used for the evaluation below.
df_train = pd.read_csv(f"{pwd}/data/data/Train.csv")

In [23]:
# ROUGE-1 scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1'],
    use_stemmer=True,
)

In [35]:
# Evaluate on the last 30 training questions: retrieve the 5 best-matching
# chunks, generate an answer from them, and record the ROUGE-1 F-measure of
# that answer against the reference answer.
rouge_scores = []
for _, row in df_train.tail(30).iterrows():
    question = row['Question Text']
    df_search_results = search_content(
        query=question,
        df_sentances=df_docs,
        book_index=fastIndex_docs,
        embedder=embedding_model,
        k=5,
    )
    response = get_response(
        text=question,
        llm=llm_model,
        df_book_matches=df_search_results,
        text_column="text",
    )
    # RougeScorer.score takes (target, prediction): the reference answer is
    # the target and the generated answer is the prediction. The F-measure is
    # symmetric, so the reported number does not change, but precision and
    # recall are now attributed correctly.
    scores = scorer.score(
        target=row['Question Answer'],
        prediction=response['answer'],
    )
    rouge_scores.append(scores['rouge1'].fmeasure)

In [36]:
# Average of the collected ROUGE-1 scores (displayed as the cell output).
np.asarray(rouge_scores).mean()

0.3660295948089137

# Make Predictions

In [37]:
# Load the test questions that need predictions.
df_test = pd.read_csv(f"{pwd}/data/data/Test.csv")

In [38]:
# Start a fresh temp submission file containing only the header row; the
# prediction loop appends its rows to this file.
# Note: the frame is written with the default index column, so the header line
# starts with an extra empty field.
submission_columns = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question']
df_submission = pd.DataFrame(columns=submission_columns)
df_submission.to_csv(f"{pwd}/data/submissions/submission_v6_temp.csv")

In [40]:
# %%time
# Answer every test question and append each result to the temp CSV as soon as
# it is produced (presumably so partial progress survives an interrupted run).
output_columns = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question']
csv_file_path = pwd + "/data/submissions/submission_v6_temp.csv"

for _, row in df_test.iterrows():
    question = row['Question Text']
    # Named question_id rather than `id` so the builtin is not shadowed in the
    # kernel namespace.
    question_id = row['ID']

    # Retrieve the 5 best-matching chunks and let the LLM answer from them.
    df_search_results = search_content(
        query=question,
        df_sentances=df_docs,
        book_index=fastIndex_docs,
        embedder=embedding_model,
        k=5,
    )
    response = get_response(
        text=question,
        llm=llm_model,
        df_book_matches=df_search_results,
        text_column="text",
    )

    # Merge in the paragraph lookup result (expected to supply the 'book' and
    # 'paragraphs' fields selected below) and the keywords extracted from the
    # retrieved chunks.
    book_info = get_paragraph_info(
        query=question,
        df_booklet=df_booklet,
        embedder=embedding_model,
        fastIndex=fastIndex_book,
    )
    response.update(book_info)
    response["keywords"] = extract_keyword(str(df_search_results['text'].values), top_n=5)

    df_responses = pd.DataFrame([response])
    df_responses['ID'] = question_id
    df_responses['Question'] = question
    # Append without header or index: one data row per question.
    df_responses[output_columns].to_csv(csv_file_path, mode='a', header=False, index=False)

In [41]:
# Reload the accumulated predictions.
# The header row of the temp file was written together with the DataFrame
# index (an extra leading empty field) while the appended rows were written
# without it, so the file is read back with seven columns and the data sits
# one column to the left of its header. Relabelling positionally restores the
# correct names; the surplus trailing column is called 'None'.
temp_csv_columns = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question', 'None']
df_responses = pd.read_csv(f"{pwd}/data/submissions/submission_v6_temp.csv")
df_responses.columns = temp_csv_columns

# Create Submission

In [42]:
# Remove the surplus 'None' column.
df_responses = df_responses.drop(columns=['None'])

In [45]:
# Relabel the six columns, by position, with the names used in the submission.
submission_field_names = [
    'question_answer',
    'reference_document',
    'paragraph(s)_number',
    'keywords',
    'ID',
    'Question',
]
df_responses.columns = submission_field_names

In [46]:
# Reshape to long format: one row per (ID, output field) pair.
target_fields = ['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"]
df_submission = df_responses.melt(id_vars=['ID'], value_vars=target_fields)

In [47]:
# Build the final "<ID>_<field>" identifiers, relabel the three columns by
# position, and keep only the Target column indexed by ID.
suffixed_ids = df_submission['ID'] + '_' + df_submission['variable']
df_submission = df_submission.assign(ID=suffixed_ids)
df_submission.columns = ["ID", "variable", "Target"]
df_submission = df_submission.loc[:, ['ID', "Target"]].set_index("ID")

In [49]:
# Write the final submission; the ID index becomes the first CSV column.
submission_path = f"{pwd}/data/submissions/submission_v6.csv"
df_submission.to_csv(submission_path)