### Import Libraries

In [2]:
import os
import pandas as pd
import pdfplumber
import PyPDF2
import pytesseract
import torch
import transformers

from langchain.chains.summarize import load_summarize_chain
from langchain.chains import AnalyzeDocumentChain, MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.docstore.document import Document
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from pdf2image import convert_from_path
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
from PIL import Image
from transformers import AutoTokenizer, BartForConditionalGeneration, BartForQuestionAnswering, pipeline
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


### Extract Text

In [3]:
# Create a function to extract text

def text_extraction(element):
    # Extracting the text from the in-line text element
    line_text = element.get_text()
    return (line_text)

# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]
    return table

# Convert table into the appropriate format
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapped texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string 
        table_string += ('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string

In [4]:
def extract_text_and_tables(pdf_path):
    # create a PDF file object
    pdfFileObj = open(pdf_path, 'rb')
    # create a PDF reader object
    pdfReaded = PyPDF2.PdfReader(pdfFileObj)

    # Create the list to extract text from each image
    text_per_page = []

    lower_side = 0.0
    upper_side = 0.0
    # We extract the pages from the PDF
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        
        # Initialize the variables needed for the text extraction from the page
        pageObj = pdfReaded.pages[pagenum]
        page_text = []
        text_from_tables = []
        page_content = []
        # Initialize the number of the examined tables
        table_num = 0
        first_element = True
        table_extraction_flag = False
        # Open the pdf file
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        # Find the number of tables on the page
        tables = page_tables.find_tables()

        # Find all the elements
        page_elements = [(element.y1, element) for element in page._objs]
        # Sort all the elements as they appear in the page 
        page_elements.sort(key = lambda a: a[0], reverse = True)

        # Find the elements that composed a page
        for i,component in enumerate(page_elements):
            # Extract the position of the top side of the element in the PDF
            pos = component[0]
            # Extract the element of the page layout
            element = component[1]
            
            # Check if the element is a text element
            if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if table_extraction_flag == False:
                    # Use the function to extract the text and format for each text element
                    line_text = text_extraction(element)
                    # Append the text of each line to the page text
                    page_text.append(line_text)
                    # Append the format for each line containing text
                    page_content.append(line_text)
                else:
                    # Omit the text that appeared in a table
                    pass

            # Check the elements for tables
            if isinstance(element, LTRect):
                # If the first rectangular element
                if first_element == True and (table_num + 1) <= len(tables):
                    # Find the bounding box of the table
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1 
                    # Extract the information from the table
                    table = extract_table(pdf_path, pagenum, table_num)
                    # Convert the table information in structured string format
                    table_string = table_converter(table)
                    # Append the table string into a list
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    # Set the flag as True to avoid the content again
                    table_extraction_flag = True
                    # Make it another element
                    first_element = False
                    # Add a placeholder in the text and format lists
                    page_text.append('table')

                # Check if we already extracted the tables from the page
                if element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                elif not isinstance(page_elements[i+1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num += 1

        page_text = ' '.join([str(elem) for elem in page_text])
        text_from_tables = ' '.join([str(elem) for elem in text_from_tables])
        page_content = ' '.join([str(elem) for elem in page_content])

        text_per_page.append([pagenum, page_text, text_from_tables, page_content])

    # Closing the pdf file object
    pdfFileObj.close()
    return text_per_page

In [16]:
directory = os.listdir('2023 Competitor Docs')
data = []

for state_folder in directory:
    state_path = '2023 Competitor Docs/' + state_folder
    state_directory = os.listdir(state_path)

    for file in state_directory:
        pdf_path = state_path + '/' + file
        pdf_extraction = extract_text_and_tables(pdf_path)
        # print(pdf_path)
        data.append([file, pdf_extraction])

IndexError: list index out of range

In [5]:
pdf_path = '2023 Competitor Docs/Florida/H1036-074-000_Humana_FL_Manatee_HMO_EOC.pdf'
pdf_extraction = extract_text_and_tables(pdf_path)

In [6]:
pdf_extraction_df = pd.DataFrame(pdf_extraction, columns = ['page_num', 'page_text', 'text_from_tables', 'page_content'])
pdf_extraction_df_limited = pdf_extraction_df[pdf_extraction_df['page_content'].str.contains('Chapter 4')]

### Generate Summaries

In [7]:
example = pdf_extraction_df_limited['page_content'].str.cat(sep=' ')
# example = pdf_extraction_df_limited['page_content']

In [63]:
example

'2023 Evidence of Coverage for Humana Gold Plus H1036-074 (HMO)\nTable of Contents\n 6\n 2023 Evidence of Coverage\nTable of Contents\n Chapter 1.\n Getting started as a member.....................................................................9\nSECTION 1 Introduction ..........................................................................................10\nSECTION 2 What makes you eligible to be a plan member?..............................................11\nSECTION 3 Important membership materials you will receive ..........................................11\nSECTION 4 Your monthly costs for Humana Gold Plus H1036-074 (HMO) ............................13\nSECTION 5 More information about your monthly premium.............................................15\nSECTION 6 Keeping your plan membership record up to date ...........................................18\nSECTION 7 How other insurance works with our plan......................................................19\n Chapter 2.\n Im

In [73]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

pipe = pipeline(
    "summarization",
    model = model,
    tokenizer = tokenizer,
    min_length = 5,
    max_length = 24
    )

llm = HuggingFacePipeline(pipeline = pipe)

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size = 1024, chunk_overlap = 0)
texts = text_splitter.split_text(example)

docs = [Document(page_content = t) for t in texts[:]]
chain = load_summarize_chain(llm, chain_type = "map_reduce")
output_summary = chain.run(docs)

In [74]:
output_summary

'Humana Gold Plus H1036-074 (HMO) must cover all services covered by Original'

In [10]:
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings


In [8]:
def check_coverages(doc):
    root_canals = []
    implants = []
    otc = []
    food = []
    
    with open('doc.txt', 'w') as f:
        f.write(doc)
    
    loader = TextLoader('doc.txt')
    documents = loader.load()
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=25)
    texts = text_splitter.split_documents(documents)

    model = "meta-llama/Llama-2-7b-chat-hf"
    # tokenizer = AutoTokenizer.from_pretrained(model)
    
    embeddings = HuggingFaceEmbeddings(
        model_name=model,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    
    embeddings.client.tokenizer.pad_token =  embeddings.client.tokenizer.eos_token
    
    db = FAISS.from_documents(texts, embeddings)
    db.save_local('faiss')
    
    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        # tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        max_length=3000,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )
    
    llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

    template = """You are reviewing coverage documents in order to summarize the dental benefits covered by a health plan.
    Summarize the dental benefits if they are present, in particular the following:

    - Annual maximum for dental coverage.
    - Member cost share (coinsurance) for dental benefits.
    - What dental services and procedures are covered by the plan?
    - Are periodtonal surgery and dental implants covered?
    - What does the member pay in-network vs. out of network?

    Coverage documents: {context}
    """

    prompt = PromptTemplate(template=template, input_variables = ['context'])
    # llm_chain = LLMChain(prompt=prompt, llm=llm)
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 2}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )

    results = qa_chain(texts)
    return results

In [11]:
check_coverages(example)

No sentence-transformers model found with name /Users/britt/.cache/torch/sentence_transformers/meta-llama_Llama-2-7b-chat-hf. Creating a new one with MEAN pooling.
Loading checkpoint shards: 100%|██████████| 2/2 [02:48<00:00, 84.29s/it] 


NameError: name 'tokenizer' is not defined

### Benefits

In [26]:
tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-finetuned-squadv1")
model = BartForQuestionAnswering.from_pretrained("valhalla/bart-large-finetuned-squadv1")

prompt = 'Are root canals covered?'

pipe = pipeline(
    model = model,
    tokenizer = tokenizer,
    question = prompt,
    task = 'text2text-generation'
    )

llm = HuggingFacePipeline(pipeline = pipe)

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size = 1024, chunk_overlap = 0)
texts = text_splitter.split_text(example)

docs = [Document(page_content = t) for t in texts[:]]
chain = load_summarize_chain(llm, chain_type = "map_reduce")
output_summary = chain.run(docs)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
The model 'BartForQuestionAnswering' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForCondi

TypeError: The current model class (BartForQuestionAnswering) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'BartForCausalLM', 'BartForConditionalGeneration'}

In [17]:
tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-finetuned-squadv1")
model = BartForQuestionAnswering.from_pretrained("valhalla/bart-large-finetuned-squadv1")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors = "pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

outputs = model(**inputs, start_positions = target_start_index, end_positions = target_end_index)
loss = outputs.loss
round(loss.item(), 2)

tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 84.6kB/s]
config.json: 100%|██████████| 1.43k/1.43k [00:00<00:00, 13.1MB/s]
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 21.8MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 18.9MB/s]
special_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 1.34MB/s]
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
pytorch_model.bin: 100%|██████████| 1.63G/1.63G [01:06<00:00

0.59

In [7]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [8]:
with open('example.txt', 'w') as f:
    f.write('example')

In [43]:
loader = TextLoader("example.txt")
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size = 512,
                                          chunk_overlap = 50)
texts = splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'})

db = FAISS.from_documents(texts, embeddings)
db.save_local("faiss")

In [11]:
from langchain.llms import CTransformers
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

In [28]:
template = """You are given a set of subreports indicating whether various items are mentioned.

{question}  If they are, the final response for that item should be Yes.
Include no other information besides what is asked for.  If any of the information is unclear, the response should be No.

Final report format:
Root canals mentioned: (Yes or No)
Implants mentioned: (Yes or No)
OTC benefits rollover: (Yes or No)
Healthy food benefits rollover: (Yes or No)

Set of subreports: {context}
"""

In [44]:
llm = CTransformers(model = 'llama-2-7b-chat.ggmlv3.q8_0.bin',
                    model_type = 'llama',
                    config = {'max_new_tokens': 160, 'temperature': 0.01})

embeddings = HuggingFaceEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs = {'device': 'cpu'})
db = FAISS.load_local("faiss", embeddings)

retriever = db.as_retriever(search_kwargs={'k': 2})
prompt = PromptTemplate(
    template = template,
    input_variables = ['context', 'question'])
qa_llm = RetrievalQA.from_chain_type(llm = llm,
                                     chain_type = 'stuff',
                                     retriever = retriever,
                                     return_source_documents = True,
                                     chain_type_kwargs = {'prompt': prompt}
                                     )

In [45]:
prompt = "Create a final report indicating whether these items are mentioned in any of the subreports."
output = qa_llm({'query': prompt})
results = output["result"]

KeyboardInterrupt: 

In [47]:
print(results)

Subreport 1:
Root canals: Yes
Implants: No
OTC benefits rollover: No
Healthy food benefits rollover: No

Subreport 2:
Root canals: No
Implants: Yes
OTC benefits rollover: Yes
Healthy food benefits rollover: No

Subreport 3:
Root canals: No
Implants: No
OTC benefits rollover: No
Healthy food benefits rollover: Yes

In this example, the final report would be:
Root canals mentioned: Yes
Implants mentioned: Yes
OTC benefits rollover: Yes
Healthy food benefits rollover: Yes


In [41]:
root_canals = []
implants = []
otc = []
food = []

index_rt = results.find("Root canals mentioned: ")
letter_rt = results[index_rt + len("Root canals mentioned: ")]
root_canals.append(letter_rt)
    
index_im = results.find("Implants mentioned: ")
letter_im = results[index_im + len("Implants mentioned: ")]
implants.append(letter_im)
    
index_otc = results.find("OTC benefits rollover: ")
letter_otc = results[index_otc + len("OTC benefits rollover: ")]
otc.append(letter_otc)
    
index_food = results.find("Healthy food benefits rollover: ")
letter_food = results[index_food + len("Healthy food benefits rollover: ")]
food.append(letter_food)