### Install Packages

In [1]:
!pip install langchain -qqq
!pip install transformers -qqq
!pip install accelerate -qqq
!pip install sentence-transformers -qqq
!pip install faiss-gpu -qqq
!pip install fitz -qqq
!pip install pymupdf -qqq
!pip install flash-attn --no-build-isolation -qqq

In [2]:
!pip install huggingface_hub -qqq
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

### Load Packages

In [13]:
import fitz
import os
import pandas as pd
import pickle
import re
import torch
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

### Load Data

In [8]:
# Walk "<docs_root>/<state folder>/<file>" and extract the plain text of every
# document into one row per file: [state folder name, file name, full text].
docs_root = '2023 Competitor Docs'
directory = os.listdir(docs_root)
data = []
for state_folder in directory:
    state_path = os.path.join(docs_root, state_folder)
    if not os.path.isdir(state_path):
        # Stray top-level files (e.g. .DS_Store) are not state folders and
        # would make os.listdir() below raise NotADirectoryError.
        continue

    for file in os.listdir(state_path):
        file_path = os.path.join(state_path, file)

        # The context manager closes the document handle even if text
        # extraction fails, so file descriptors are not leaked across the loop.
        with fitz.open(file_path) as pdf_file:
            # Each page's text is followed by a blank line, including the last.
            text = ''.join(page.get_text() + '\n\n' for page in pdf_file)

        data.append([state_folder, file, text])

documents = pd.DataFrame(data, columns = ['state', 'file', 'text'])

In [9]:
# Normalise file names by removing dashes, then spaces, then underscores,
# so they can be compared against the normalised plan identifiers.
documents['file'] = (
    documents['file']
    .str.replace('-', '')
    .str.replace(' ', '')
    .str.replace('_', '')
)

In [10]:
# Read the benefits grid and expose the plan identifier under a lower-case name.
benefits_grid = pd.read_csv('Benefits Grid.csv').rename(
    columns={'CONTRACT_PLAN': 'contract_plan'}
)

# Plan identifiers are normalised by removing spaces and then dashes.
benefits_grid['contract_plan'] = (
    benefits_grid['contract_plan']
    .str.replace(' ', '')
    .str.replace('-', '')
)

# Keep only the identifying columns and the four target columns, then discard
# every row that has a missing value in any of them.
benefits_grid = benefits_grid[[
    'County',
    'Provider',
    'contract_plan',
    'Implant Coverage (Y/N)',
    'Root Canal Coverage (Y/N)',
    'Healthy Food Rollover',
    'OTC Rollover (Y/N)',
]].dropna()

In [11]:
#Function to group targets correctly
def process_text(text):
    """Collapse a free-text benefits-grid answer into 'Y' or 'N'.

    An answer counts as 'Y' only when, ignoring leading whitespace and case,
    it starts with the standalone token "Y" or "Yes" (e.g. 'Y', 'Y ', 'Y?',
    'Y -- $20 monthly allowance rolls over ...'). Everything else, including
    non-string values such as NaN/None, is 'N'.

    Parameters
    ----------
    text : Any
        Raw cell value from one of the target columns.

    Returns
    -------
    str
        'Y' or 'N'.
    """
    if not isinstance(text, str):
        return 'N'
    # Anchor at the start and require a word boundary after the token: a plain
    # substring search for "y" would also match answers such as
    # "N -- expires monthly" merely because they contain the letter y.
    if re.match(r'\s*y(es)?\b', text, re.IGNORECASE):
        return 'Y'
    return 'N'

In [14]:
# Reduce each target column to the labels produced by process_text and
# upper-case the result. The columns are independent of one another.
for target_column in (
    'Implant Coverage (Y/N)',
    'Root Canal Coverage (Y/N)',
    'Healthy Food Rollover',
    'OTC Rollover (Y/N)',
):
    benefits_grid[target_column] = (
        benefits_grid[target_column].apply(process_text).str.upper()
    )

In [15]:
# Every plan identifier in the grid except the placeholder value 'EOC'.
plan_list = benefits_grid.loc[benefits_grid['contract_plan'] != 'EOC', 'contract_plan']

In [16]:
def find_contract_plan(file_name):
    """Return the first entry of ``plan_list`` contained in ``file_name``.

    Entries are tried in ``plan_list`` order using substring containment;
    ``None`` is returned when no entry matches.
    """
    matching_plans = (candidate for candidate in plan_list if candidate in file_name)
    return next(matching_plans, None)

In [17]:
# Look for a plan identifier in the file name and, separately, in the document
# text; this creates 'contract_plan_file' and then 'contract_plan_text'.
for source_column in ('file', 'text'):
    documents['contract_plan_' + source_column] = documents.apply(
        lambda doc_row, column=source_column: find_contract_plan(doc_row[column]),
        axis=1,
    )

# Prefer the identifier found in the file name; fall back to the one from the text.
documents['contract_plan'] = documents['contract_plan_file'].combine_first(
    documents['contract_plan_text']
)

# Replace non-breaking spaces (U+00A0) with ordinary spaces.
documents['text'] = documents['text'].apply(lambda raw_text: raw_text.replace('\xa0', ' '))

In [18]:
# Marker phrases that delimit the section of interest inside each document.
start_phrase = 'You will see this apple next'
end_phrase = 'What services are not covered'


def extract_text(text):
    """Return the stripped text between ``start_phrase`` and ``end_phrase``.

    The first occurrence of ``start_phrase`` is located, then the first
    occurrence of ``end_phrase`` after it. An empty string is returned when
    either marker cannot be found that way.
    """
    marker_position = text.find(start_phrase)
    if marker_position == -1:
        return ''

    section_start = marker_position + len(start_phrase)
    section_end = text.find(end_phrase, section_start)
    if section_end == -1:
        return ''

    return text[section_start:section_end].strip()

In [19]:
# Keep only the section between the marker phrases ('' when it is not found).
documents['text_cleaned'] = documents['text'].map(extract_text)

In [20]:
# Inner join on the plan identifier: only plans present in both the benefits
# grid and the extracted documents are kept.
dataset = benefits_grid.merge(documents, how='inner', on='contract_plan')

In [21]:
# Plain Python list of the cleaned section texts, in dataset row order.
docs = dataset['text_cleaned'].tolist()

### Llama

In [23]:
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'

# Tokenizer and weights are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    device_map='auto',
)

# Start from the checkpoint's own generation settings; only the cap on newly
# generated tokens is overridden.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 512

text_pipeline = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
)

# LangChain wrapper around the transformers pipeline; used by the QA chains.
llm = HuggingFacePipeline(pipeline=text_pipeline)

# Splitter used to chunk each document before it is indexed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=40)

# Sentence-embedding model used to build the FAISS index; pinned to the GPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cuda'},
)

# Prompt for the coverage check. It has a single {context} placeholder and asks
# for a four-line report whose labels ("Root canals mentioned: ", "Implants
# mentioned: ", "OTC benefits rollover: ", "Healthy food benefits rollover: ")
# are the exact strings coverage_check() searches for in the model output, so
# the labels here and there must be kept in sync.
coverage_check_template = """You are given a set of subreports indicating whether various items are mentioned.

Create a final report indicating whether these items are mentioned in any of the subreports.  If they are, the final response for that item should be Yes.
Include no other information besides what is asked for.  If any of the information is unclear, the response should be No.

Final report format:
Root canals mentioned: (Yes or No)
Implants mentioned: (Yes or No)
OTC benefits rollover: (Yes or No)
Healthy food benefits rollover: (Yes or No)

Set of subreports: {context}
"""

# Prompt for the dental-benefits summary. It has a single {context}
# placeholder, which receives the document text handed to the model.
summary_template = """You are reviewing coverage documents in order to summarize the dental benefits covered by a health plan.
Summarize the dental benefits if they are present, in particular the following:

- Annual maximum for dental coverage.
- Member cost share (coinsurance) for dental benefits.
- What dental services and procedures are covered by the plan?
- Are periodontal surgery and dental implants covered?
- What does the member pay in-network vs. out of network?

Coverage documents: {context}
"""

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
def _parse_yes_no(report, label):
    """Return 'Y' or 'N' for the Yes/No answer that follows `label` in `report`, or 'error' if there is none."""
    found = re.search(re.escape(label) + r'\s*(yes|no)\b', report, re.IGNORECASE)
    if found is None:
        return 'error'
    return found.group(1)[0].upper()


def coverage_check(doc):
    """Run the coverage-check prompt over one document and record the answers.

    The document text is written to 'doc.txt', split with `text_splitter`,
    indexed into a FAISS store (saved to and reloaded from the local "faiss"
    directory), and passed through a RetrievalQA "stuff" chain that uses
    `coverage_check_template`.

    Side effects: appends exactly one entry to each of the module-level lists
    `coverages` (raw model output), `root_canals`, `implants`, `otc` and
    `food` ('Y', 'N', or 'error' when the corresponding report line is
    missing). All five appends happen together after parsing, so an exception
    raised earlier never leaves the lists with different lengths.

    Parameters
    ----------
    doc : str
        Text of a single document.

    Returns
    -------
    None
    """
    # Explicit UTF-8 on both write and read: the platform default encoding may
    # be unable to represent characters extracted from the PDFs.
    with open('doc.txt', 'w', encoding = 'utf-8') as f:
        f.write(doc)
    loader = TextLoader('doc.txt', encoding = 'utf-8')
    loaded_docs = loader.load()

    texts = text_splitter.split_documents(loaded_docs)

    db = FAISS.from_documents(texts, embeddings)
    db.save_local("faiss")
    db = FAISS.load_local("faiss", embeddings)

    prompt = PromptTemplate(template = coverage_check_template,
                            input_variables = ['context'])

    qa_chain = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type = 'stuff',
        retriever = db.as_retriever(search_kwargs = {'k': 2}),
        return_source_documents = True,
        chain_type_kwargs = {'prompt': prompt},
    )

    # NOTE(review): the literal string 'query' is what the retriever uses to
    # pick the k=2 chunks; it is unrelated to the benefits being checked.
    # Confirm whether a topical retrieval query was intended.
    output = qa_chain('query')
    results = output["result"]

    # Parse every field before touching the shared lists. A missing label
    # yields 'error' instead of an arbitrary character or an IndexError.
    letter_rt = _parse_yes_no(results, "Root canals mentioned:")
    letter_im = _parse_yes_no(results, "Implants mentioned:")
    letter_otc = _parse_yes_no(results, "OTC benefits rollover:")
    letter_food = _parse_yes_no(results, "Healthy food benefits rollover:")

    coverages.append(results)
    root_canals.append(letter_rt)
    implants.append(letter_im)
    otc.append(letter_otc)
    food.append(letter_food)

In [25]:
def summarize(doc):
    """Run the dental-benefits summary prompt over one document.

    The document text is written to 'doc.txt', split with `text_splitter`,
    indexed into a FAISS store (saved to and reloaded from the local "faiss"
    directory), and passed through a RetrievalQA "stuff" chain that uses
    `summary_template`.

    Side effects: appends the stripped model output to the module-level list
    `summaries`. Nothing is appended if an exception is raised.

    Parameters
    ----------
    doc : str
        Text of a single document.

    Returns
    -------
    None
    """
    # Explicit UTF-8 on both write and read: the platform default encoding may
    # be unable to represent characters extracted from the PDFs.
    with open('doc.txt', 'w', encoding = 'utf-8') as f:
        f.write(doc)
    loader = TextLoader('doc.txt', encoding = 'utf-8')
    loaded_docs = loader.load()

    texts = text_splitter.split_documents(loaded_docs)

    db = FAISS.from_documents(texts, embeddings)
    db.save_local("faiss")
    db = FAISS.load_local("faiss", embeddings)

    prompt = PromptTemplate(template = summary_template,
                            input_variables = ['context'])

    qa_chain = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type = 'stuff',
        retriever = db.as_retriever(search_kwargs = {'k': 2}),
        return_source_documents = True,
        chain_type_kwargs = {'prompt': prompt},
    )

    # NOTE(review): the literal string 'query' is what the retriever uses to
    # pick the k=2 chunks; confirm whether a topical retrieval query was intended.
    output = qa_chain('query')
    result = output['result'].strip()

    summaries.append(result)

In [None]:
# Result lists filled by coverage_check(); entry i of each list belongs to docs[i].
root_canals = []
implants = []
otc = []
food = []
coverages = []

coverage_result_lists = (root_canals, implants, otc, food, coverages)

for doc_index, doc in enumerate(docs):
    try:
        coverage_check(doc)
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the run, and report the failure instead of hiding it.
        print(f'coverage_check failed for document {doc_index}: {exc!r}')
        for result_list in coverage_result_lists:
            # Drop anything the failed call appended before raising, then
            # record a single 'error' marker so all lists stay aligned with docs.
            del result_list[doc_index:]
            result_list.append('error')

In [None]:
# Summaries filled by summarize(); entry i belongs to docs[i].
summaries = []

for doc_index, doc in enumerate(docs):
    try:
        # summarize() appends the model's summary for this document to `summaries`.
        summarize(doc)
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the run, and report the failure instead of hiding it.
        print(f'summarize failed for document {doc_index}: {exc!r}')
        # Keep `summaries` aligned with `docs`: exactly one entry per document.
        del summaries[doc_index:]
        summaries.append('error')