In [2]:
import pandas as pd
import numpy as np
import pathlib
import torch
from utils.embeddings import Embedder
from utils.preprocess import create_sentance_booklet, create_faise_index
import faiss
from utils.utils import search_content, read_booklets, retrieve_booklet_text, clean_text, reformat_abbreviations
from models.ollama import Ollama
from utils.response_generator import get_response, extract_keyword, get_paragraph_info, find_matching_paragraphs
from rouge_score import rouge_scorer
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import sentence_transformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# macOS workaround: without this environment variable, running transformers fails on this
# machine (presumably because more than one OpenMP runtime gets loaded — TODO confirm).
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Read in Data

In [4]:
# Project root: the resolved parent of the current working directory, kept as a string
# because the paths below are built with string concatenation.
pwd = str(pathlib.Path.cwd().parent.resolve())

In [5]:
# Load the booklets through the project helper `read_booklets`, and the training
# questions from Train.csv. Both paths are relative to the project root in `pwd`.
df_booklet = read_booklets((pwd + "/data/data/booklets/"))
df_train = pd.read_csv(pwd + "/data/data/Train.csv")

# Let's clean some of the text

In [6]:
# Keep the raw `text` column and store the output of the project's `clean_text` helper
# next to it in `cleanText`.
df_booklet['cleanText'] = df_booklet['text'].apply(clean_text)

# Clean data
We will reformat the abbreviation text. We will then also remove any other unicode characters.

In [7]:
# Index labels of the rows whose text (ignoring leading whitespace) starts with "AAR".
# The following cells treat these rows as abbreviation lists.
abbreviations = df_booklet[df_booklet['text'].str.lstrip().str.startswith("AAR")].index

In [8]:
# Expand every abbreviation-list row into one row per "ABBR : meaning" entry.
# The new rows are collected first and appended once, with index labels placed after the
# existing ones. The next cell drops the original rows by label (`abbreviations`); rows
# appended with a fresh 0..k index could carry the same labels and would be dropped too.
abrv_columns = df_booklet.drop("level_0", axis=1).columns
abrv_frames = []
for indx in abbreviations:
    # `abbreviations` holds index labels, so look the row up by label.
    source_row = df_booklet.loc[indx]
    abrv_dict = reformat_abbreviations(source_row['text'])
    new_abrv_text = [(key +" : "+ value) for key, value in  abrv_dict.items()]
    abrv_df = pd.DataFrame({"text": new_abrv_text,
                            "cleanText": new_abrv_text})
    # Every expanded entry inherits the paragraph number ('index') and book of its source row.
    abrv_df['index'] = source_row['index']
    abrv_df['book'] = source_row['book']
    abrv_frames.append(abrv_df[abrv_columns])

if abrv_frames:
    abrv_rows = pd.concat(abrv_frames, ignore_index=True)
    # assumes df_booklet has a unique integer index (e.g. the default RangeIndex) — TODO confirm
    abrv_rows.index = abrv_rows.index + df_booklet.index.max() + 1
    df_booklet = pd.concat([df_booklet, abrv_rows])
    

In [9]:
# remove abbreviation indexes
# (`drop` matches on index labels, so every row carrying one of these labels is removed)
df_booklet = df_booklet.drop(abbreviations)

In [10]:
# remove all empty strings
# Missing values and whitespace-only text count as empty too, not just a single space.
df_booklet = df_booklet[df_booklet['cleanText'].fillna("").astype(str).str.strip() != ""]

In [11]:
# Drop the helper columns, then rename and order the remaining ones by name. Assigning a
# list to `.columns` relies on column position and would silently mislabel the data if the
# order ever differed.
df_booklet = df_booklet.drop(['level_0', 'text'], axis=1).rename(columns={'index': 'paragraph'})
df_booklet = df_booklet[['paragraph', 'book', 'cleanText']]

# Embed and Create Vector store

In [12]:
def create_chunks(dataset: pd.DataFrame, chunk_size: int=500, chunk_overlap: int=10) -> list:
    """
    Turn the ``cleanText`` column of a dataframe into split LangChain documents.

    Args:
        dataset (pd.DataFrame): Frame with a ``cleanText`` column used as page content.
        chunk_size (int): Maximum chunk length, measured with ``len``.
        chunk_overlap (int): Overlap between consecutive chunks, measured with ``len``.

    Returns:
        list: The documents returned by ``DataFrameLoader.load_and_split``.
    """
    loader = DataFrameLoader(dataset, page_content_column="cleanText")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return loader.load_and_split(text_splitter=splitter)


def create_or_get_vector_store(
    chunks: list,
    recreate_embeddings=False,
    db_path: str = "./db",
    model_name: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
) -> FAISS:
    """
    Create the FAISS vector store, or load it from disk when it already exists.

    Args:
        chunks (list): Documents to embed; only used when the store is (re)built.
        recreate_embeddings (bool): When True, rebuild and overwrite the store even if
            ``db_path`` already exists.
        db_path (str): Directory the store is saved to / loaded from.
        model_name (str): HuggingFace embedding model used for both building and loading.

    Returns:
        FAISS: Vector store
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    if recreate_embeddings or not os.path.exists(db_path):
        print("CREATING DB")
        vectorstore = FAISS.from_documents(chunks, embeddings)
        vectorstore.save_local(db_path)
    else:
        # NOTE(review): per the LangChain docs, loading a saved FAISS store deserializes a
        # pickled docstore — only load a directory this notebook wrote itself. Newer
        # langchain_community releases require allow_dangerous_deserialization=True here;
        # confirm against the installed version.
        vectorstore = FAISS.load_local(db_path, embeddings)

    return vectorstore


def embed_booklets(df, recreate_embeddings=False):
    """
    Chunk a dataframe and return the FAISS vector store built (or loaded) for it.

    Chunks are created with a size of 2000 and no overlap.

    Args:
        df: Dataframe passed to ``create_chunks``.
        recreate_embeddings: Forwarded to ``create_or_get_vector_store``.

    Returns:
        The vector store returned by ``create_or_get_vector_store``.
    """
    return create_or_get_vector_store(
        create_chunks(df, 2000, 0),
        recreate_embeddings=recreate_embeddings,
    )

In [13]:
# Append a space to every paragraph so that text joined later keeps a separator.
# NOTE: re-running this cell appends another space each time.
df_booklet['cleanText'] = [x+" " for x in df_booklet['cleanText']]
# Order rows by book, then paragraph; the grouping cells below walk the rows in this order.
df_booklet.sort_values(by=['book', 'paragraph'], inplace=True)
# All cleaned text as a single string (`docs` is reassigned later in the evaluation loop).
docs = "".join(df_booklet['cleanText'].values.tolist())

In [14]:
# Character count of every cleaned paragraph (assigned positionally, as a plain list).
df_booklet['len'] = df_booklet['cleanText'].astype(str).str.len().tolist()

In [15]:
# get new splits
# Walk the paragraphs in order and give each one a group id, so that a group holds roughly
# MAX_GROUP_CHARS characters. The running total is named `running_len` (not `sum`) so the
# built-in `sum()` stays usable in the rest of the notebook.
MAX_GROUP_CHARS = 500
running_len = 0
counter = 0
splits = []

for paragraph_len in df_booklet['len']:
    running_len += paragraph_len
    if running_len == MAX_GROUP_CHARS:
        # Exactly full: this paragraph closes the current group.
        splits.append(counter)
        counter += 1
        running_len = 0
    elif running_len > MAX_GROUP_CHARS:
        # Overflow: this paragraph opens the next group and the excess is carried over.
        counter += 1
        splits.append(counter)
        running_len -= MAX_GROUP_CHARS
    else:
        splits.append(counter)


df_booklet['group'] = splits

In [16]:
# Collapse the paragraphs of each group into a single row.
books = df_booklet.groupby('group')['book'].agg(lambda x: list(set(x)))
paragraphs = df_booklet.groupby('group')['paragraph'].agg(lambda x: sorted(list(set(x))))
# Take the group ids from the aggregated index so they are guaranteed to line up with the
# other columns; the order of a set is not something to pair positionally with them.
groups = list(books.index)
text = df_booklet.groupby('group')['cleanText'].agg(lambda x: "".join(list(x)))

df_booklet_new = pd.DataFrame({"books": books, "paragraphs": paragraphs, "group": groups, "cleanText":text}).reset_index(drop=True)

In [48]:
df_booklet_new.head()['cleanText'].values

array(['THIRD EDITION BOOKLET ONE: INTRODUCTION SECTION  DECEMBER 2020 TECHNICAL GUIDELINES FOR INTEGRATED DISEASE SURVEILLANCE AND RESPONSE IN MALAWI THIRD EDITION BOOKLET ONE: INTRODCUTION SECTION This booklet introduces all the eleven modules of the 3rd Edition Malawi Integrated Disease Surveillance and Response Technical Guidelines DECEMBER, 2020 Table of Contents  FOREWORD iv ACKNOWLEDGEMENTS vi LIST OF ABBREVIATIONS vii GLOSSARY (DEFINITION OF KEY TERMS) x 1.1 Introduction 1 ',
       '1.2 Public Health Surveillance 1 1.2.1 Definition of the different typesapproaches of public health surveillance 2 1.2.1.1 Indicator-based surveillance 2 1.2.1.2 Event- Based Surveillance (EBS) 5 1.2.3 Event- based surveillance (EBS) and Indicator-Based Surveillance (IBS) as back-bone to the IDSR Strategy 8 1.3 Integrated Disease Surveillance and Response strategy 9 1.3.1 Objectives of Integrated Disease Surveillance and Response 11 1.4 IDSR and IHR (2005) 12 1.4.1 IHR 2005 purpose and goal 12 ',
 

Now let's reformat the paragraphs. If they all appear in the same book, we will only report the min and max.

In [17]:
def _format_paragraphs(row):
    """Render a group's paragraph numbers as a string.

    Groups spanning several books list every number, comma separated. A single-book group
    yields the lone number, or a "first-last" range when it holds more than one.
    """
    numbers = row['paragraphs']
    book_count = len(row['books'])
    if book_count > 1:
        return ",".join([str(x) for x in numbers])
    if book_count == 1 and len(numbers) == 1:
        return str(numbers[0])
    return "-".join([str(numbers[0]), str(numbers[-1])])


df_booklet_new['paragraphs'] = df_booklet_new.apply(_format_paragraphs, axis=1)

In [18]:
# Sort before joining: set iteration order for strings is not stable between runs, and the
# prediction code reads the last character of this string as the booklet number.
df_booklet_new['books'] = df_booklet_new.apply(lambda row: ",".join(sorted(set(row['books']))), axis=1)

In [19]:
# Build the vector store from the grouped text; recreate_embeddings=True rebuilds it even
# when a saved store already exists on disk.
vector_store = embed_booklets(df_booklet_new,recreate_embeddings=True)

CREATING DB


It seems like the following steps will have to be taken:

- embed booklet
- embed search phrase
- use search phrase embedding to search for relevant text in booklet
- retrieve all relevant text from booklet
- format the search phrase and the retrieved text into a prompt for the LLM
- send the prompt to the LLM and return the answer

# Ollama
To use ollama install follow these instructions:

- Download and install Ollama onto the available supported platforms (including Windows Subsystem for Linux) (https://ollama.com/)
- Fetch an available LLM model via `ollama pull <model>`, e.g. `ollama pull llama2`. This notebook uses the `phi` model, so run `ollama pull phi`.

This will download the default tagged version of the model. Typically, the default points to the latest, smallest sized-parameter model.

In [20]:
# Project wrapper from models.ollama, configured for the "phi" model with gpu=True.
llm_model = Ollama(model="phi", gpu=True)

# Measure Performance

In [21]:
# Re-read the training questions (the same Train.csv that the data-loading cell reads).
df_train = pd.read_csv(pwd +  "/data/data/Train.csv")

In [22]:
# ROUGE-1 scorer with stemming enabled; used for every field compared in the evaluation loop.
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

In [23]:
import re

In [24]:
def first_letter(s):
    """Return the index of the first letter in ``s``, or -1 when it contains none.

    A letter is whatever the case-insensitive pattern ``[a-z]`` matches.
    """
    match = re.search(r'[a-z]', s, re.I)
    return -1 if match is None else match.start()

In [25]:
def format_keywords(keywords:str):
    """Normalise an LLM keyword reply into a ", "-separated string.

    Only the text after the last "Keywords:" marker is used. It is split on commas, or on
    newlines when there is no comma. Each piece is trimmed to start at its first letter
    (dropping numbering and bullets), capitalised, and stripped of newlines and surrounding
    whitespace.

    Pieces that contain no letter are skipped instead of being sliced from index -1, which
    used to produce stray one-character or empty keywords. Duplicates are removed while
    keeping first-seen order, so the result is the same on every run.

    Args:
        keywords (str): Raw text returned by the model.

    Returns:
        str: The cleaned keywords joined with ", " (empty string when none are found).
    """
    raw = keywords.split("Keywords:")[-1]
    pieces = raw.split(',')
    if len(pieces) == 1:
        pieces = raw.split('\n')

    cleaned = []
    for piece in pieces:
        match = re.search(r'[a-z]', piece, re.I)
        if match is None:
            continue  # nothing but digits, punctuation or whitespace
        cleaned.append(piece[match.start():].capitalize().replace('\n', "").strip())

    # dict.fromkeys de-duplicates while preserving insertion order
    return ", ".join(dict.fromkeys(cleaned))


In [26]:
def clean_book_info(book_info: dict):
    """Reduce comma-separated book information to its first entry, in place.

    ``book_info['book']`` is always cut down to its first comma-separated value. When the
    original value named more than one distinct book, ``book_info['paragraph']`` is cut
    down to its first comma-separated value as well.

    Args:
        book_info (dict): Mapping with a 'book' string and, for multi-book values, a
            'paragraph' string.

    Returns:
        dict: The same dictionary object, modified.
    """
    book_names = book_info['book'].split(",")
    spans_several_books = len(set(book_names)) > 1

    book_info['book'] = book_names[0]
    if spans_several_books:
        book_info['paragraph'] = book_info['paragraph'].split(",")[0]
    return book_info

In [38]:
rouge_scores = []
responses = []
# (generated field, reference column) pairs, scored in this order for every question.
scored_fields = [
    ('answer', 'Question Answer'),
    ('keywords', 'Keywords'),
    ('book', 'Reference Document'),
    ('paragraph', 'Paragraph(s) Number'),
]

for index, row in df_train.head(20).iterrows():
    question = row['Question Text']

    # Retrieve the two closest chunks and let the LLM answer from them.
    docs = vector_store.similarity_search_with_score(question, k=2)
    booklet_matches = [doc[0].page_content for doc in docs]
    response = get_response(text=question, llm=llm_model, booklet_matches=booklet_matches, text_column="text")
    keywords = llm_model.generate(f"Generate keywords form the following text {response['answer']}")

    # The chunk closest to the generated answer supplies the book / paragraph reference.
    book_docs = vector_store.similarity_search(response['answer'], k=1)
    best_match = book_docs[0].metadata
    response["book"] = "TG Booklet " + best_match["books"][-1]
    response["paragraph"] = best_match["paragraphs"]
    response['keywords'] = format_keywords(keywords)
    responses.append(response)

    for field, reference_column in scored_fields:
        scores = scorer.score(response[field], row[reference_column])
        # keep the last element of the rouge1 score tuple
        rouge_scores.append(scores['rouge1'][-1])

In [44]:
df_train.head(10)

Unnamed: 0,ID,Question Text,Question Answer,Reference Document,Paragraph(s) Number,Keywords
0,Q829,Compare the laboratory confirmation methods fo...,Chikungunya is confirmed using serological tes...,TG Booklet 6,"154, 166",Laboratory Confirmation For Chikungunya Vs. Di...
1,Q721,When should specimens be collected for Anthrax...,Specimens should be collected during the vesic...,TG Booklet 6,140,"Anthrax Specimen Collection: Timing, Preparati..."
2,Q464,Which key information should be recorded durin...,"During a register review, key information abou...",TG Booklet 3,439-440,"Register Review, Key Information, Suspected Ca..."
3,Q449,Why is the District log of suspected outbreaks...,The log includes information about response ac...,TG Booklet 3,412,"District Log, Response Activities, Steps Taken..."
4,Q6,What do Community based surveillance strategie...,Community-based surveillance strategies focus ...,TG Booklet 1,86,"Community-based Surveillance Strategies, Ident..."
5,Q451,Compare and contrast the District log of suspe...,The District log (Annex 4A) focuses on recordi...,TG Booklet 3,"411, 414","District Log, Laboratory Supplies Checklist, P..."
6,Q337,How can the laboratory results be reviewed dur...,Review laboratory results with the investigati...,TG Booklet 3,294,"Laboratory Results Review, Team, Health Facili..."
7,Q113,How should the One Health approach be implemen...,Emphasize the implementation of the One Health...,TG Booklet 2,652,"One Health Approach Implementation, Reporting,..."
8,Q1160,How does the role of a coordinator at the nati...,The national level coordinator provides overar...,TG Booklet 5,"128, 137-142","National Level Coordinator, District Level Foc..."
9,Q188,Can you provide an example of a district spot ...,"Yes, Figure 3.6 shows an example of a district...",TG Booklet 2,817,"Example Of District Spot Map, Location Of Susp..."


In [49]:
responses

[{'answer': 'The laboratory confirmation methods for Chikungunya are based on detecting the presence of the virus in blood samples. Diabetes is diagnosed through blood glucose measurements. Both diseases can be confirmed by laboratory tests, but they have different diagnostic criteria and outcomes. ',
  'book': 'TG Booklet 6',
  'paragraph': '155',
  'keywords': 'Blood samples, Chikungunya, Diagnostic criteria, Diabetes, Blood glucose measurements, Outcomes., Laboratory confirmation'},
 {'answer': 'Specimens should be collected from any patient being evaluated for cutaneous Bacillus anthracis infection. It may not be possible to demonstrate B.anthracis in clinical specimens if the patient has been treated with antimicrobial agents. Organism is best demonstrated in specimen taken at the vesicular stage. Vesicular stage: collect fluid from intact vesicles on sterile swabs. Eschar stage: without removing eschar, insert swab beneath the edge of eschar, rotate and collect lesion material. S

In [39]:
np.mean(rouge_scores)

0.4724537893321026

# Make Predictions

In [29]:
# Questions to answer for the submission.
df_test = pd.read_csv(pwd +  "/data/data/Test.csv")

In [30]:
# Start an empty results file; the prediction loop appends one row per question to it.
# NOTE: `to_csv` writes the index column by default, so this header line has a leading
# unnamed column that the rows appended later (written with index=False) do not have.
df_submission = pd.DataFrame(columns=['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question'])
df_submission.to_csv(pwd + "/data/submissions/submission_v8_temp.csv")

In [31]:
# %%time
responses = []

# Neither the output path nor the column order changes between questions: build them once.
csv_file_path = pwd + "/data/submissions/submission_v8_temp.csv"
output_columns = ['answer', 'book', 'paragraph', 'keywords', 'ID', 'Question']

for index, row in df_test.iterrows():
    question_id = row['ID']  # not named `id`, which would shadow the built-in
    question = row['Question Text']

    # Retrieve the three closest chunks and let the LLM answer from them.
    docs = vector_store.similarity_search_with_score(question, k=3)
    booklet_matches = [doc[0].page_content for doc in docs]
    response = get_response(text=question, llm=llm_model, booklet_matches=booklet_matches, text_column="text")
    keywords = llm_model.generate(f"Generate keywords form the following text {response['answer']}")

    # The chunk closest to the generated answer supplies the book / paragraph reference.
    book_docs = vector_store.similarity_search(response['answer'], k=1)
    response["book"] = "TG Booklet " + book_docs[0].metadata["books"][-1]
    response["paragraph"] = book_docs[0].metadata["paragraphs"]
    response['keywords'] = format_keywords(keywords)

    df_responses = pd.DataFrame([response])
    df_responses['ID'] = question_id
    df_responses['Question'] = question
    # Append right away (no header, no index), presumably so that finished answers are kept
    # if the long-running loop is interrupted.
    df_responses[output_columns].to_csv(csv_file_path, mode='a', header=False, index=False)

In [32]:
# Read the appended rows back. The header line was written with an extra leading index
# column while the data rows were not, so the columns are renamed positionally here and
# the surplus last column gets the placeholder name 'None'.
df_responses = pd.read_csv(pwd +  "/data/submissions/submission_v8_temp.csv")
df_responses.columns = ['answer', 'book', 'paragraphs', 'keywords', 'ID', 'Question', 'None']

# Create Submission

In [33]:
# Remove the placeholder column named 'None' (modifies df_responses in place).
df_responses.drop(['None'], inplace=True, axis=1)

In [34]:
# Positional rename to the field names that become the ID suffixes in the submission.
df_responses.columns = ['question_answer', 'reference_document', 'paragraph(s)_number', 'keywords', 'ID', 'Question']

In [35]:
# Wide -> long: one row per (question ID, field) pair, with the field name in `variable`.
df_submission = pd.melt(df_responses, id_vars=['ID'], value_vars=['question_answer', 'reference_document', 'paragraph(s)_number', "keywords"])

In [36]:
# Build the final row identifier as "<question ID>_<field name>".
df_submission['ID'] = df_submission['ID'] + '_' + df_submission['variable']
# Positional rename: the third column becomes "Target".
df_submission.columns = ["ID", "variable", "Target"]
# Keep only the identifier and its value, indexed by the identifier.
df_submission = df_submission[['ID', "Target"]].set_index("ID")

In [37]:
# Write the final submission; the "ID" index is written as the first column.
df_submission.to_csv(pwd + "/data/submissions/submission_v8.csv")