In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text, clean_text, reformat_abbreviations
from models.ollama import Ollama
from utils.response_generator import get_response, extract_keyword, get_paragraph_info, find_matching_paragraphs
from rouge_score import rouge_scorer
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import sentence_transformers
from utils.vector_store import create_chunks, embed_booklets
from utils.postprocessing import clean_book_info, format_keywords, first_letter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# macOS workaround: this environment variable has to be set to prevent issues when running transformers
os.environ.update(KMP_DUPLICATE_LIB_OK="True")

# Read in Data

In [3]:
# Absolute path (as a string) of the parent of the current working directory; all data paths are built from it
project_root = pathlib.Path.cwd().parent.resolve()
pwd = str(project_root)

In [4]:
# Load the booklets through the project helper and the training questions from CSV
booklets_dir = pwd + "/data/data/booklets/"
train_csv_path = pwd + "/data/data/Train.csv"
df_booklet = read_booklets(booklets_dir)
df_train = pd.read_csv(train_csv_path)

# Let's clean some of the text

In [5]:
# Run the project's clean_text helper over every raw text entry and keep the result in its own column
df_booklet['cleanText'] = df_booklet['text'].map(clean_text)

# Clean data
We will reformat the abbreviation text. We will then also remove any other Unicode characters.

In [6]:
# Index labels of the rows whose text (ignoring leading whitespace) starts with "AAR";
# presumably these are the abbreviation listings - verify against the booklets
starts_with_aar = df_booklet['text'].str.lstrip().str.startswith("AAR")
abbreviations = df_booklet.loc[starts_with_aar].index

In [7]:
# Expand every abbreviation listing into one row per "KEY : value" entry.
# The new rows are collected first and appended with a single concat, and they receive
# index labels that continue after the existing ones. If they kept their default labels
# (0..k-1), a label-based drop of the abbreviation rows would also delete every expanded
# entry that happens to share a label with one of those rows.
target_columns = df_booklet.drop("level_0", axis=1).columns
# assumes integer index labels, which the positional lookup with the same values below already relies on
next_label = df_booklet.index.max() + 1 if len(df_booklet) else 0
abrv_frames = []
for indx in abbreviations:
    source_row = df_booklet.iloc[indx]
    # reformat_abbreviations is a project helper; its result is consumed here as key -> value string pairs
    abrv_dict = reformat_abbreviations(source_row['text'])
    new_abrv_text = [(key + " : " + value) for key, value in abrv_dict.items()]
    abrv_df = pd.DataFrame({"text": new_abrv_text,
                            "cleanText": new_abrv_text})
    # every expanded entry inherits the 'index' and 'book' values of the row it came from
    abrv_df['index'] = source_row['index']
    abrv_df['book'] = source_row['book']
    abrv_df.index = range(next_label, next_label + len(abrv_df))
    next_label += len(abrv_df)
    abrv_frames.append(abrv_df[target_columns])

if abrv_frames:
    df_booklet = pd.concat([df_booklet, *abrv_frames])
    

In [8]:
# Drop the original abbreviation listing rows, selected by their index labels
df_booklet = df_booklet.drop(index=abbreviations)

In [9]:
# Remove rows whose cleaned text is empty or whitespace-only. Comparing against the
# single-space string alone would keep "" and longer whitespace runs, which carry no content.
has_content = df_booklet['cleanText'].astype(str).str.strip() != ""
df_booklet = df_booklet[has_content]

In [10]:
# Discard the helper columns, then name the three remaining columns by position
df_booklet = df_booklet.drop(columns=['level_0', 'text'])
final_column_names = ['paragraph', 'book', 'cleanText']
df_booklet.columns = final_column_names

# Embed and Create Vector store

In [11]:
# Terminate every cleaned text with a space so neighbouring texts stay separated once concatenated
df_booklet['cleanText'] = df_booklet['cleanText'].map(lambda cleaned: cleaned + " ")
# Order by book and paragraph, then build one string holding all texts in that order
df_booklet.sort_values(by=['book', 'paragraph'], inplace=True)
docs = "".join(df_booklet['cleanText'].tolist())

In [12]:
# Character count of every cleaned text (after casting to str)
df_booklet['len'] = df_booklet['cleanText'].astype(str).map(len)

In [13]:
# get new splits
CHUNK_SIZE = 500  # character budget that triggers the start of a new group


def assign_groups(lengths, chunk_size=CHUNK_SIZE):
    """Assign a group id to every text length, packing texts into ~chunk_size character groups.

    A running character total is kept while walking through ``lengths``:

    * total == chunk_size: the text closes the current group; the next text starts a new one.
    * total > chunk_size: the text is placed in a new group and the overshoot
      (total - chunk_size) is carried over as the new running total.
    * otherwise the text joins the current group.

    Parameters
    ----------
    lengths : iterable of int
        Character counts, in the order the texts should be grouped.
    chunk_size : int, default CHUNK_SIZE
        Threshold for the running total.

    Returns
    -------
    list of int
        One group id per input length. Ids are non-decreasing but not necessarily consecutive.
    """
    group_ids = []
    running_len = 0  # deliberately not called `sum`, which would shadow the builtin for the whole session
    group_id = 0
    for length in lengths:
        running_len += length
        if running_len == chunk_size:
            group_ids.append(group_id)
            group_id += 1
            running_len = 0
        elif running_len > chunk_size:
            group_id += 1
            group_ids.append(group_id)
            running_len -= chunk_size
        else:
            group_ids.append(group_id)
    return group_ids


splits = assign_groups(df_booklet['len'])
df_booklet['group'] = splits

In [14]:
# Collapse the paragraph-level rows into one row per group.
per_group = df_booklet.groupby('group')
# Distinct books / paragraphs per group; sorted so the result does not depend on set iteration order
books = per_group['book'].agg(lambda x: sorted(set(x)))
paragraphs = per_group['paragraph'].agg(lambda x: sorted(set(x)))
text = per_group['cleanText'].agg(lambda x: "".join(list(x)))
# Take the group labels from the aggregated index: the list is placed positionally next to the
# aggregated Series, so it has to follow exactly the same (sorted) key order as groupby
groups = books.index.tolist()

df_booklet_new = pd.DataFrame({"books": books, "paragraphs": paragraphs, "group": groups, "cleanText": text}).reset_index(drop=True)

Now let's reformat the paragraphs. If they all appear in the same book, we will only report the min and max.

In [15]:
def _format_paragraphs(row):
    """Build the paragraph reference string for one grouped row.

    * more than one book: every paragraph number, comma-separated
    * exactly one book and one paragraph: that paragraph number
    * otherwise: "first-last" of the row's paragraph list
    """
    books_in_group = row['books']
    paragraph_numbers = row['paragraphs']
    if len(books_in_group) > 1:
        return ",".join([str(x) for x in paragraph_numbers])
    if len(books_in_group) == 1 and len(paragraph_numbers) == 1:
        return str(paragraph_numbers[0])
    return "-".join([str(paragraph_numbers[0]), str(paragraph_numbers[-1])])


df_booklet_new['paragraphs'] = df_booklet_new.apply(_format_paragraphs, axis=1)

In [16]:
# Join the distinct book names of each group into one comma-separated string.
# The names are sorted so the result is reproducible: iterating a set of strings
# directly can yield a different order from one kernel session to the next.
df_booklet_new['books'] = df_booklet_new['books'].map(lambda book_list: ",".join(sorted(set(book_list))))

In [17]:
# Build the vector store from the grouped booklet texts with the project helper
# (recreate_embeddings=True presumably forces the embeddings to be rebuilt - see utils.vector_store)
vector_store = embed_booklets(
    df_booklet_new,
    recreate_embeddings=True,
)

It seems like the following steps will have to be taken:

- embed the booklet
- embed the search phrase
- use the search phrase embedding to search for relevant text in the booklet
- retrieve all relevant text from the booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Ollama
To use Ollama, follow these instructions:

- Download and install Ollama onto the available supported platforms (including Windows Subsystem for Linux) (https://ollama.com/)
- Fetch an available LLM model via `ollama pull llama2` or `ollama pull phi`

This will download the default tagged version of the model. Typically, the default points to the latest, smallest sized-parameter model.

In [18]:
# Instantiate the project's Ollama wrapper for the "phi" model with the gpu flag switched on
llm_model = Ollama(
    model="phi",
    gpu=True,
)

# Measure Performance

In [19]:
# Training questions with their reference fields, used to score the pipeline
train_csv_path = pwd + "/data/data/Train.csv"
df_train = pd.read_csv(train_csv_path)

In [20]:
# ROUGE-1 scorer with stemming enabled
scorer = rouge_scorer.RougeScorer(
    ['rouge1'],
    use_stemmer=True,
)

In [52]:
# Run the pipeline on the first five training questions and collect ROUGE-1 scores.
# rouge_scores is a flat list holding four consecutive entries per question,
# in the order: answer, keywords, book, paragraph.
rouge_scores = []
responses = []
# (key in the generated response, column in the training data) - scored in this order
scored_fields = [
    ('answer', 'Question Answer'),
    ('keywords', 'Keywords'),
    ('book', 'Reference Document'),
    ('paragraph', 'Paragraph(s) Number'),
]
for index, row in df_train.head(5).iterrows():
    question = row['Question Text']
    # top-2 vector store matches for the question serve as context for the LLM
    docs = vector_store.similarity_search_with_score(question, k=2)
    booklet_matches = [hit[0].page_content for hit in docs]
    response = get_response(text=question, llm=llm_model, booklet_matches=booklet_matches, text_column="text")
    keywords = llm_model.generate(f"Generate keywords form the following text {response['answer']}")
    # the single best match for the generated answer supplies the book / paragraph reference
    book_docs = vector_store.similarity_search(response['answer'], k=1)
    closest_metadata = book_docs[0].metadata

    response["book"] = "TG Booklet " + closest_metadata["books"][-1]
    response["paragraph"] = closest_metadata["paragraphs"]
    response['keywords'] = format_keywords(keywords)
    responses.append(response)

    for response_key, truth_column in scored_fields:
        scores = scorer.score(response[response_key], row[truth_column])
        # keep the last element of the rouge1 score tuple
        rouge_scores.append(scores['rouge1'][-1])

# Make Predictions

In [None]:
# Test questions to generate predictions for
test_csv_path = pwd + "/data/data/Test.csv"
df_test = pd.read_csv(test_csv_path)

In [None]:
# Start the temporary submission file with a header row only.
# NOTE(review): to_csv runs with its default index=True here, so the header line
# begins with an extra empty index field in front of the six names.
submission_header = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question']
df_submission = pd.DataFrame(columns=submission_header)
df_submission.to_csv(pwd + "/data/submissions/submission_v9_temp.csv")

In [None]:
# %%time
# Generate a prediction for every test question and append it to the temporary CSV
# immediately (presumably so partial results are kept if the long loop is interrupted).
responses = []
# loop-invariant values, built once instead of on every iteration
csv_file_path = pwd + "/data/submissions/submission_v9_temp.csv"
output_columns = ['answer', 'book', 'paragraph', 'keywords', 'ID', 'Question']

for index, row in df_test.iterrows():
    # not named `id`: that would shadow the builtin for the rest of the kernel session
    question_id = row['ID']
    question = row['Question Text']
    # top-2 vector store matches for the question serve as context for the LLM
    docs = vector_store.similarity_search_with_score(question, k=2)
    booklet_matches = [doc[0].page_content for doc in docs]
    response = get_response(text=question, llm=llm_model, booklet_matches=booklet_matches, text_column="text")
    keywords = llm_model.generate(f"Generate keywords form the following text {response['answer']}")
    # the single best match for the generated answer supplies the book / paragraph reference
    book_docs = vector_store.similarity_search(response['answer'], k=1)

    response["book"] = "TG Booklet " + book_docs[0].metadata["books"][-1]
    response["paragraph"] = book_docs[0].metadata["paragraphs"]
    response['keywords'] = format_keywords(keywords)
    df_responses = pd.DataFrame([response])
    df_responses['ID'] = question_id
    df_responses['Question'] = question
    # append without header or index: one CSV line per question
    df_responses[output_columns].to_csv(csv_file_path, mode='a', header=False, index=False)

In [None]:
# Read the appended predictions back and assign seven column names by position;
# the last one is a placeholder called 'None'
temp_submission_path = pwd + "/data/submissions/submission_v9_temp.csv"
df_responses = pd.read_csv(temp_submission_path)
reloaded_column_names = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question', 'None']
df_responses.columns = reloaded_column_names

# Create Submission

In [None]:
# Remove the placeholder column named 'None'
df_responses = df_responses.drop(columns=['None'])

In [None]:
# Rename the six columns, by position, to the field names used in the submission
submission_field_names = [
    'question_answer',
    'reference_document',
    'paragraph(s)_number',
    'keywords',
    'ID',
    'Question',
]
df_responses.columns = submission_field_names

In [None]:
# Reshape to long format: one row per (ID, field), field name in 'variable' and its content in 'value'
target_fields = ['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"]
df_submission = df_responses.melt(id_vars=['ID'], value_vars=target_fields)

In [None]:
# Build the row identifier "<ID>_<field name>", rename the value column to "Target",
# and keep only that column with the identifier as index
df_submission = (
    df_submission
    .assign(ID=df_submission['ID'] + '_' + df_submission['variable'])
    .set_axis(["ID", "variable", "Target"], axis=1)
    .loc[:, ['ID', "Target"]]
    .set_index("ID")
)

In [None]:
# Write the final submission (ID index plus Target column) to disk
submission_path = pwd + "/data/submissions/submission_v9.csv"
df_submission.to_csv(submission_path)