### Import Libraries

In [1]:
import re
import os
import pandas as pd
import pickle
import fitz

from langchain import PromptTemplate
from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Load Data

In [2]:
#Read every competitor document, laid out as '2023 Competitor Docs/<state>/<file>',
#into one row per file: [state folder, file name, full extracted text]
root_folder = '2023 Competitor Docs'
directory = os.listdir(root_folder)
data = []

for state_folder in directory:
    state_path = os.path.join(root_folder, state_folder)

    #Skip stray top-level files (e.g. '.DS_Store'); os.listdir would raise on them
    if not os.path.isdir(state_path):
        continue

    for file in os.listdir(state_path):
        #Hidden entries are OS metadata, not documents fitz can open
        if file.startswith('.'):
            continue

        file_path = os.path.join(state_path, file)

        #The context manager closes the document even if text extraction fails
        with fitz.open(file_path) as pdf_file:
            #Every page's text is followed by a blank line as a page separator
            text = ''.join(page.get_text() + '\n\n' for page in pdf_file)

        data.append([state_folder, file, text])

documents = pd.DataFrame(data, columns = ['state', 'file', 'text'])

In [4]:
#Remove dashes, spaces and underscores from the file names, one character at a time
for unwanted_char in ('-', ' ', '_'):
    documents['file'] = documents['file'].str.replace(unwanted_char, '')

### Load Target

In [5]:
#Read the benefits grid CSV; its coverage/rollover columns become the targets further down
benefits_grid = pd.read_csv('Benefits Grid.csv')

In [6]:
#Lower-case the plan column name, then remove spaces and dashes from its values
#NOTE(review): file names also have underscores stripped, plan ids do not -- confirm that is intended
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})
for unwanted_char in (' ', '-'):
    benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace(unwanted_char, '')

In [7]:
#Narrow the benefits grid to the identifying columns plus the four target columns
columns_to_keep = [
    'County',
    'Provider',
    'contract_plan',
    'Implant Coverage (Y/N)',
    'Root Canal Coverage (Y/N)',
    'Healthy Food Rollover',
    'OTC Rollover (Y/N)',
]
benefits_grid = benefits_grid[columns_to_keep]

In [8]:
#Drop rows with NA
#(dropna with its default arguments removes a row if any of its columns is missing)
benefits_grid = benefits_grid.dropna()

### Clean Target

In [9]:
#Function to group targets correctly
def process_text(text):
    """Collapse a free-text benefits-grid answer into 'Y' or 'N'.

    An answer counts as 'Y' only when it begins with a standalone "Y" or
    "Yes" (any case, leading whitespace ignored), optionally followed by
    punctuation or an explanation, e.g. 'Y', 'Y?' or
    'Y -- $20 monthly allowance rolls over to next month ...'.
    Everything else, including non-string values such as NaN, is 'N'.

    The earlier version searched for the letter "y" anywhere in the text,
    so an answer such as 'N -- expires monthly' was grouped as 'Y'.

    Args:
        text: Raw cell value from a target column.

    Returns:
        'Y' or 'N'.
    """
    if not isinstance(text, str):
        return 'N'
    # \b keeps words that merely start with "y" (e.g. 'Yearly') out of the Y group
    if re.match(r'\s*y(?:es)?\b', text, re.IGNORECASE):
        return 'Y'
    return 'N'

In [10]:
#Process targets: run every target column through process_text
target_columns = (
    'Implant Coverage (Y/N)',
    'Root Canal Coverage (Y/N)',
    'Healthy Food Rollover',
    'OTC Rollover (Y/N)',
)
for target_column in target_columns:
    benefits_grid[target_column] = benefits_grid[target_column].apply(process_text)

In [11]:
#Upper-case the values of the four target columns
for target_column in (
    'Implant Coverage (Y/N)',
    'Root Canal Coverage (Y/N)',
    'Healthy Food Rollover',
    'OTC Rollover (Y/N)',
):
    benefits_grid[target_column] = benefits_grid[target_column].str.upper()

### Join Documents to Target

In [12]:
#Plan identifiers to look for in the documents: every contract_plan value in the
#benefits grid except the literal 'EOC'
#NOTE(review): presumably 'EOC' is excluded because it would match too many file names/texts -- confirm
plan_list = benefits_grid[benefits_grid['contract_plan'] != 'EOC']['contract_plan']

In [13]:
def find_contract_plan(file_name):
    """Return the first entry of plan_list that occurs inside *file_name*.

    Plans are tried in plan_list order and matched as plain substrings.
    Returns None when no plan occurs in the string.
    """
    matching_plans = (candidate for candidate in plan_list if candidate in file_name)
    return next(matching_plans, None)

In [14]:
#Look for a plan id in the file name first, then in the document text.
#Series.apply hands each value straight to the helper, so no row object or lambda is needed.
documents['contract_plan_file'] = documents['file'].apply(find_contract_plan)
documents['contract_plan_text'] = documents['text'].apply(find_contract_plan)
#Prefer the match from the file name; fall back to the match from the text
documents['contract_plan'] = documents['contract_plan_file'].combine_first(documents['contract_plan_text'])
#documents = documents.drop(['contract_plan_text', 'contract_plan_file'], axis = 1)

In [15]:
#Minor text cleaning: turn \xa0 characters into ordinary spaces
#Series.replace only swaps whole-cell matches, so substitute inside each string instead
nbsp_pattern = re.compile(r'\xa0')
documents['text'] = documents['text'].map(lambda raw_text: nbsp_pattern.sub(' ', raw_text))

In [16]:
#Phrases that bracket the section of each document we want to keep
start_phrase = 'You will see this apple next'
end_phrase = 'What services are not covered'

def extract_text(text):
    """Return the stripped text found between start_phrase and end_phrase.

    Only the first start_phrase and the first end_phrase after it are used.
    Returns '' when either phrase is missing.
    """
    phrase_at = text.find(start_phrase)
    if phrase_at == -1:
        return ''

    section_begin = phrase_at + len(start_phrase)
    section_end = text.find(end_phrase, section_begin)
    if section_end == -1:
        return ''

    return text[section_begin:section_end].strip()

In [17]:
#Apply function to the text column
#(rows where the start and end phrases are not both found get an empty string)
documents['text_cleaned'] = documents['text'].apply(extract_text)

In [18]:
#Merge document data with benefits grid
#Inner join on contract_plan: rows whose plan has no counterpart in the other frame are dropped
dataset = pd.merge(benefits_grid, documents, how = 'inner', on = ['contract_plan'])

In [20]:
#Plain list of the extracted sections, in the row order of the merged dataset
docs = dataset["text_cleaned"].to_list()

In [22]:
#Persist the extracted texts and the merged dataset.
#Open with 'wb' rather than 'ab': appending stacks a new pickle behind the old one on
#every re-run, and a single pickle.load only ever reads the first object in the file.
with open('docs.pkl', 'wb') as dataset_file:
    pickle.dump(docs, dataset_file)

with open('dataset.pkl', 'wb') as dataset_file:
    pickle.dump(dataset, dataset_file)

### Coverages

In [2]:
#Reload the list of extracted document texts saved earlier.
#The with-block closes the file even if unpickling fails.
#NOTE: pickle can execute arbitrary code on load -- only open pickle files you created yourself.
with open('docs.pkl', 'rb') as dataset_file:
    docs = pickle.load(dataset_file)

def _first_char_after(report, label):
    """Return the character that directly follows *label* in *report*.

    Returns None when the label is missing, or when nothing follows it,
    instead of indexing from a failed find() result.
    """
    label_start = report.find(label)
    if label_start == -1:
        return None
    value_index = label_start + len(label)
    if value_index >= len(report):
        return None
    return report[value_index]


def check_coverages(doc):
    """Ask a local Llama 2 model whether *doc* mentions four benefits.

    The text is written to 'doc.txt', split into chunks of at most 512
    characters, embedded with all-MiniLM-L6-v2 and indexed in FAISS (the
    index is also saved to the 'faiss' folder).  A RetrievalQA "stuff"
    chain over the 2 best-matching chunks then fills in the report format
    given in the prompt template.

    Args:
        doc: Text of one coverage document.

    Returns:
        dict with the keys 'root_canals', 'implants', 'otc' and 'food'.
        Each value is the first character the model wrote after the
        matching report label ('Y' or 'N' when it follows the requested
        format), or None when the label is missing from the output.
        Earlier versions collected these values in local lists and
        returned nothing.
    """
    # Explicit UTF-8 on both sides so the round trip does not depend on the platform default
    with open('doc.txt', 'w', encoding = 'utf-8') as f:
        f.write(doc)

    loader = TextLoader('doc.txt', encoding = 'utf-8')
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size = 512, chunk_overlap = 0)
    texts = splitter.split_documents(documents)

    # NOTE: the embedding model and the LLM below are re-created on every call
    embeddings = HuggingFaceEmbeddings(
        model_name = 'sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs = {'device': 'cpu'})

    # The index is still written to disk, but the in-memory copy is used
    # directly rather than being reloaded from the folder just written.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local('faiss')

    template = """You are given a set of subreports indicating whether various items are mentioned.

    {question}  If they are, the final response for that item should be Yes.
    Include no other information besides what is asked for.  If any of the information is unclear, the response should be No.

    Final report format:
    Root canals mentioned: (Yes or No)
    Implants mentioned: (Yes or No)
    OTC benefits rollover: (Yes or No)
    Healthy food benefits rollover: (Yes or No)

    Set of subreports: {context}
    """

    llm = CTransformers(model = 'llama-2-7b-chat.ggmlv3.q8_0.bin',
                        model_type = 'llama',
                        config = {'max_new_tokens': 165, 'temperature': 0.01})

    retriever = db.as_retriever(search_kwargs = {'k': 2})

    prompt_template = PromptTemplate(
        template = template,
        input_variables = ['context', 'question'])

    qa_llm = RetrievalQA.from_chain_type(llm = llm,
                                         chain_type = 'stuff',
                                         retriever = retriever,
                                         return_source_documents = True,
                                         chain_type_kwargs = {'prompt': prompt_template}
                                         )

    query = 'Create a final report indicating whether these items are mentioned in any of the subreports.'

    output = qa_llm({'query': query})
    results = output['result']

    # Labels must match the "Final report format" lines of the template above
    report_labels = {
        'root_canals': "Root canals mentioned: ",
        'implants': "Implants mentioned: ",
        'otc': "OTC benefits rollover: ",
        'food': "Healthy food benefits rollover: ",
    }
    return {item: _first_char_after(results, label)
            for item, label in report_labels.items()}

In [3]:
def _first_char_after(report, label):
    """Return the character that directly follows *label* in *report*.

    Returns None when the label is missing, or when nothing follows it,
    instead of indexing from a failed find() result.
    """
    label_start = report.find(label)
    if label_start == -1:
        return None
    value_index = label_start + len(label)
    if value_index >= len(report):
        return None
    return report[value_index]


def check_coverages(doc):
    """Ask Llama-2-7b-chat whether *doc* mentions four benefits.

    The text is written to 'doc.txt', split into chunks of at most 1600
    characters (20 overlap), embedded with the same Llama 2 checkpoint and
    indexed in FAISS (the index is also saved to the 'faiss' folder).  A
    RetrievalQA "stuff" chain over the 2 best-matching chunks then fills
    in the report format given in the prompt template.

    Args:
        doc: Text of one coverage document.

    Returns:
        dict with the keys 'root_canals', 'implants', 'otc' and 'food'.
        Each value is the first character the model wrote after the
        matching report label ('Y' or 'N' when it follows the requested
        format), or None when the label is missing from the output.
        Earlier versions collected these values in local lists and
        returned nothing.
    """
    # Local import: the file's import cell only brings in CTransformers from langchain.llms
    from langchain.llms import HuggingFacePipeline

    # Explicit UTF-8 on both sides so the round trip does not depend on the platform default
    with open('doc.txt', 'w', encoding = 'utf-8') as f:
        f.write(doc)

    loader = TextLoader('doc.txt', encoding = 'utf-8')
    documents = loader.load()

    model = 'meta-llama/Llama-2-7b-chat-hf'

    splitter = RecursiveCharacterTextSplitter(chunk_size = 1600,
                                              chunk_overlap = 20,)

    texts = splitter.split_documents(documents)

    # NOTE: the 7B checkpoint is loaded twice per call (embeddings + generation)
    embeddings = HuggingFaceEmbeddings(
        model_name = model,
        model_kwargs = {'device': 'cpu'},
        encode_kwargs = {'normalize_embeddings': True})

    # Reuse the end-of-sequence token as the padding token for the embedding tokenizer
    embeddings.client.tokenizer.pad_token =  embeddings.client.tokenizer.eos_token

    # The index is still written to disk, but the in-memory copy is used
    # directly rather than being reloaded from the folder just written.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local('faiss')

    template = """You are given a set of subreports indicating whether various items are mentioned.

    {question}  If they are, the final response for that item should be Yes.
    Include no other information besides what is asked for.  If any of the information is unclear, the response should be No.

    Final report format:
    Root canals mentioned: (Yes or No)
    Implants mentioned: (Yes or No)
    OTC benefits rollover: (Yes or No)
    Healthy food benefits rollover: (Yes or No)

    Set of subreports: {context}
    """

    # llm = CTransformers(model = model,
    #                     model_type = 'llama',
    #                     config = {'max_new_tokens': 200, 'temperature': 0.01})

    generation_config = GenerationConfig.from_pretrained(model)
    generation_config.max_new_tokens = 200
    generation_config.temperature = 0.01

    text_generator = pipeline(
        'text-generation',
        model = model,
        # tokenizer=tokenizer,
        generation_config = generation_config,
    )
    # RetrievalQA needs a LangChain LLM, not a raw transformers pipeline
    llm = HuggingFacePipeline(pipeline = text_generator)

    retriever = db.as_retriever(search_kwargs = {'k': 2})

    prompt_template = PromptTemplate(
        template = template,
        input_variables = ['context', 'question'])

    qa_llm = RetrievalQA.from_chain_type(llm = llm,
                                         chain_type = 'stuff',
                                         retriever = retriever,
                                         return_source_documents = True,
                                         chain_type_kwargs = {'prompt': prompt_template},
                                         )

    query = 'Create a final report indicating whether these items are mentioned in any of the subreports.'

    output = qa_llm({'query': query})
    results = output['result']

    # Labels must match the "Final report format" lines of the template above
    report_labels = {
        'root_canals': "Root canals mentioned: ",
        'implants': "Implants mentioned: ",
        'otc': "OTC benefits rollover: ",
        'food': "Healthy food benefits rollover: ",
    }
    return {item: _first_char_after(results, label)
            for item, label in report_labels.items()}

In [4]:
#Run the coverage check on every extracted document, one at a time
#NOTE: the value returned by each call is not captured here
for doc in docs:
    check_coverages(doc)

No sentence-transformers model found with name /Users/britt/.cache/torch/sentence_transformers/meta-llama_Llama-2-7b-chat-hf. Creating a new one with MEAN pooling.
Loading checkpoint shards: 100%|██████████| 2/2 [02:32<00:00, 76.06s/it] 


: 

### Summaries

In [None]:
def summarize_coverage(doc):
    """Ask Llama-2-7b-chat to summarize the dental benefits described in *doc*.

    The text is written to 'doc.txt', split into chunks of at most 1600
    characters (20 overlap), embedded with the same Llama 2 checkpoint and
    indexed in FAISS (the index is also saved to the 'faiss' folder).  A
    RetrievalQA "stuff" chain over the 2 best-matching chunks then answers
    the summary prompt.

    Args:
        doc: Text of one coverage document.

    Returns:
        The model's summary text (the chain's 'result' value).  Earlier
        versions appended it to a local list and returned nothing.
    """
    # Local import: the file's import cell only brings in CTransformers from langchain.llms
    from langchain.llms import HuggingFacePipeline

    # Explicit UTF-8 on both sides so the round trip does not depend on the platform default
    with open('doc.txt', 'w', encoding = 'utf-8') as f:
        f.write(doc)

    loader = TextLoader('doc.txt', encoding = 'utf-8')
    documents = loader.load()

    model = 'meta-llama/Llama-2-7b-chat-hf'

    splitter = RecursiveCharacterTextSplitter(chunk_size = 1600,
                                              chunk_overlap = 20,)

    texts = splitter.split_documents(documents)

    # NOTE: the 7B checkpoint is loaded twice per call (embeddings + generation)
    embeddings = HuggingFaceEmbeddings(
        model_name = model,
        model_kwargs = {'device': 'cpu'},
        encode_kwargs = {'normalize_embeddings': True})

    # Reuse the end-of-sequence token as the padding token for the embedding tokenizer
    embeddings.client.tokenizer.pad_token =  embeddings.client.tokenizer.eos_token

    # The index is still written to disk, but the in-memory copy is used
    # directly rather than being reloaded from the folder just written.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local('faiss')

    # "periodontal" was misspelled as "periodtonal"; corrected because this text is sent to the model
    template = """You are reviewing coverage documents in order to summarize the dental benefits covered by a health plan.
    {question}, in particular the following:

    - Annual maximum for dental coverage.
    - Member cost share (coinsurance) for dental benefits.
    - What dental services and procedures are covered by the plan?
    - Are periodontal surgery and dental implants covered?
    - What does the member pay in-network vs. out of network?

    Coverage documents: {context}
    """

    # llm = CTransformers(model = model,
    #                     model_type = 'llama',
    #                     config = {'max_new_tokens': 200, 'temperature': 0.01})

    generation_config = GenerationConfig.from_pretrained(model)
    generation_config.max_new_tokens = 200
    generation_config.temperature = 0.01

    text_generator = pipeline(
        'text-generation',
        model = model,
        # tokenizer=tokenizer,
        generation_config = generation_config,
    )
    # RetrievalQA needs a LangChain LLM, not a raw transformers pipeline
    llm = HuggingFacePipeline(pipeline = text_generator)

    retriever = db.as_retriever(search_kwargs = {'k': 2})

    prompt_template = PromptTemplate(
        template = template,
        input_variables = ['context', 'question'])

    qa_llm = RetrievalQA.from_chain_type(llm = llm,
                                         chain_type = 'stuff',
                                         retriever = retriever,
                                         return_source_documents = True,
                                         chain_type_kwargs = {'prompt': prompt_template},
                                         )

    query = 'Summarize the dental benefits if they are present'

    output = qa_llm({'query': query})
    return output['result']