### Import Libraries

In [5]:
import re
import os
import pandas as pd
import fitz
import pickle

from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, BartForConditionalGeneration, pipeline

### Load Data

In [2]:
#Root folder holding one sub-folder per state; each sub-folder contains that state's documents
docs_root = '2023 Competitor Docs'

#Sorted so the row order of `documents` does not depend on the filesystem's listing order
directory = sorted(os.listdir(docs_root))
data = []

for state_folder in directory:
    state_path = os.path.join(docs_root, state_folder)
    #Skip stray top-level files: calling os.listdir on a non-directory would raise
    if not os.path.isdir(state_path):
        continue

    state_directory = sorted(os.listdir(state_path))

    for file in state_directory:
        file_path = os.path.join(state_path, file)
        #Only regular files can be opened as documents
        if not os.path.isfile(file_path):
            continue

        #The context manager closes the document handle even if text extraction fails
        with fitz.open(file_path) as pdf_file:
            #Concatenate the text of every page, each page followed by a blank line
            text = ''.join(page.get_text() + '\n\n' for page in pdf_file)

        data.append([state_folder, file, text])

#One row per document: the state folder it came from, its file name and its full text
documents = pd.DataFrame(data, columns = ['state', 'file', 'text'])

In [3]:
#Normalize file names by removing every dash, space and underscore in a single pass
documents['file'] = documents['file'].str.replace(r'[-_ ]', '', regex = True)

### Load Target

In [4]:
#Read the benefits grid from CSV into a DataFrame
benefits_grid = pd.read_csv('Benefits Grid.csv')

In [5]:
#Standardize the plan-id column name, then remove spaces and dashes from the ids
benefits_grid = (
    benefits_grid
    .rename(columns = {'CONTRACT_PLAN': 'contract_plan'})
    .assign(contract_plan = lambda grid: grid['contract_plan'].str.replace(r'[ -]', '', regex = True))
)

In [6]:
#Restrict the benefits grid to the columns used in the rest of the notebook
columns_to_keep = [
    'County',
    'Provider',
    'contract_plan',
    'Implant Coverage (Y/N)',
    'Root Canal Coverage (Y/N)',
    'Healthy Food Rollover',
    'OTC Rollover (Y/N)',
]
benefits_grid = benefits_grid[columns_to_keep]

In [7]:
#Discard every row that has a missing value in any column
benefits_grid = benefits_grid.dropna(axis = 0, how = 'any')

### Clean Target

In [8]:
#Function to group targets correctly
def process_text(text):
    """Collapse a free-text benefits-grid answer into 'Y' or 'N'.

    An answer counts as 'Y' only when it *starts* (after optional leading
    whitespace, case-insensitively) with the token ``Y`` or ``Yes`` and that
    token is not the beginning of a longer word. This covers answers such as
    'Y', 'Y ', 'Y?', 'Y, one month will carry over ...' and
    'Y -- $20 monthly allowance rolls over ...'.

    Searching for the letter 'Y' anywhere in the answer is deliberately
    avoided: it would label any answer containing a "y" (for example
    'N -- expires monthly') as 'Y'.

    Parameters
    ----------
    text : object
        Raw cell value from the benefits grid.

    Returns
    -------
    str
        'Y' for an affirmative answer, otherwise 'N'. Non-string values
        (None, NaN, numbers) are always 'N'.
    """
    if not isinstance(text, str):
        return 'N'

    #Leading Y / Yes, not followed by another letter (so "Yearly" does not count)
    if re.match(r'\s*y(es)?(?![a-z])', text, re.IGNORECASE):
        return 'Y'
    return 'N'

In [9]:
#Process targets: collapse every target column to 'Y'/'N' with process_text
for target_column in ['Implant Coverage (Y/N)', 'Root Canal Coverage (Y/N)', 'Healthy Food Rollover', 'OTC Rollover (Y/N)']:
    benefits_grid[target_column] = benefits_grid[target_column].apply(process_text)

In [10]:
#Upper-case the values of every target column
for target_column in ['Implant Coverage (Y/N)', 'Root Canal Coverage (Y/N)', 'Healthy Food Rollover', 'OTC Rollover (Y/N)']:
    benefits_grid[target_column] = benefits_grid[target_column].str.upper()

### Join Documents to Target

In [11]:
#Plan ids to search for in the documents; rows whose id is the literal 'EOC' are left out
plan_list = benefits_grid.loc[benefits_grid['contract_plan'] != 'EOC', 'contract_plan']

In [12]:
def find_contract_plan(file_name):
    """Return the first plan id from ``plan_list`` that occurs inside ``file_name``.

    ``plan_list`` is read from the notebook namespace and scanned in its own
    order; the first id found as a substring of ``file_name`` wins.
    Returns None when no id occurs in ``file_name``.
    """
    matches = (candidate for candidate in plan_list if candidate in file_name)
    return next(matches, None)

In [13]:
#Look for a known plan id in the file name and, separately, in the document text.
#Series.apply passes each value straight to find_contract_plan; a row-wise DataFrame.apply
#would build a full row object (including the document text) for every lookup.
documents['contract_plan_file'] = documents['file'].apply(find_contract_plan)
documents['contract_plan_text'] = documents['text'].apply(find_contract_plan)
#Prefer the file-name match; fall back to the text match where the file name had none
documents['contract_plan'] = documents['contract_plan_file'].combine_first(documents['contract_plan_text'])
#The drop below is disabled, so both helper columns stay in `documents`
#documents = documents.drop(['contract_plan_text', 'contract_plan_file'], axis = 1)

In [14]:
#Minor text cleaning: swap every \xa0 (non-breaking space) character for an ordinary space
documents['text'] = documents['text'].str.replace(r'\xa0', ' ', regex = True)

In [15]:
#Phrases that delimit the section of interest inside each document
start_phrase = 'You will see this apple next'
end_phrase = 'What services are not covered'

def extract_text(text):
    """Return the stripped text found between ``start_phrase`` and ``end_phrase``.

    The first occurrence of ``start_phrase`` marks the beginning; the first
    occurrence of ``end_phrase`` after it marks the end. Neither phrase is
    part of the result. An empty string is returned when either phrase is
    missing.
    """
    phrase_at = text.find(start_phrase)
    if phrase_at == -1:
        return ''

    section_start = phrase_at + len(start_phrase)
    section_end = text.find(end_phrase, section_start)
    if section_end == -1:
        return ''

    return text[section_start:section_end].strip()

In [16]:
#Extract the delimited section from each document's text into a new column
documents['text_cleaned'] = documents['text'].map(extract_text)

In [17]:
#Inner-join the benefits grid to the documents on the plan id
dataset = benefits_grid.merge(documents, how = 'inner', on = 'contract_plan')

In [18]:
#NOTE(review): this repeats the CONTRACT_PLAN -> contract_plan rename done right after loading the grid.
#By this point the grid has been restricted to a column list containing 'contract_plan', so this
#looks like a no-op -- confirm and remove.
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})

In [20]:
#Display the merged dataset for inspection
dataset

Unnamed: 0,County,Provider,contract_plan,Implant Coverage (Y/N),Root Canal Coverage (Y/N),Healthy Food Rollover,OTC Rollover (Y/N),state,file,text,contract_plan_file,contract_plan_text,text_cleaned
0,AL: Birmingham,"UnitedHealth Group, Inc.",H0432009000,Y,Y,N,N,Alabama,H0432009000UHCALBirminghamFullDualEOC.pdf,Evidence of \nCoverage 2023\nUnitedHealthcare ...,H0432009000,,to the preventive services in the benefits cha...
1,AL: Huntsville,"UnitedHealth Group, Inc.",H0432009000,Y,Y,N,N,Alabama,H0432009000UHCALBirminghamFullDualEOC.pdf,Evidence of \nCoverage 2023\nUnitedHealthcare ...,H0432009000,,to the preventive services in the benefits cha...
2,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,Y,Y,N,N,Alabama,H2802044000UHCALBirminghamPartialDualEOC.pdf,Evidence of \nCoverage 2023\nUnitedHealthcare ...,H2802044000,,to the preventive services in the benefits cha...
3,AL: Huntsville,CIGNA,H4513055000,N,Y,N,N,Alabama,CignaTotalCareHMODSNPEOCH4513055000.pdf,"January 1 - December 31, 2023\nEVIDENCE OF COV...",H4513055000,,to the preventive services in the benefits cha...
4,AL: Huntsville,Humana Inc.,H5619093000,N,Y,Y,Y,Alabama,H5619093000EOC23.pdf,H5619_EOC_MAPD_HMO_093000_2023_C\nH5619093000E...,H5619093000,H5619093000,to the preventive services in the benefits cha...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,TX: Houston,Centene Corporation,H0174009000,N,Y,N,N,Texas,H0174009WellcareTexasAustinWellcareAssist(HMO)...,H0174_009_2023_TX_EOC_HMAPD_105873E_C \nOMB Ap...,,H0174009000,to the preventive services in the benefits cha...
383,TX: Houston,Memorial Hermann Health System,H7115003000,N,Y,N,N,Texas,H7115003000MemorialHermannAdvantagePlusHMOEOC.pdf,Memorial Hermann Advantage Plus HMO\n2023 Evid...,H7115003000,,to the preventive services in the benefits cha...
384,TX: Houston,"UnitedHealth Group, Inc.",H0332008000,Y,Y,N,N,Texas,H0332008000GoldCommunityHMOPOSEOC.pdf,H0332_001EOC22_C\n1-866-535-8343 (TTY: 711) ...,H0332008000,,to the preventive services in the benefits cha...
385,TX: San Antonio,"UnitedHealth Group, Inc.",H1278005000,Y,Y,N,N,Texas,H1278005000UHCTXSanAntonioPPOEOC.pdf,Evidence of \nCoverage 2023\nAARP® Medicare Ad...,H1278005000,,to the preventive services in the benefits cha...


In [22]:
#Plain Python list of the extracted sections, one entry per row of the merged dataset
docs = dataset['text_cleaned'].tolist()

### Summaries

In [21]:
#Checkpoint shared by the model weights and the tokenizer
SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
#Bounds (in tokens) on the length of each generated summary
SUMMARY_MIN_LENGTH = 5
SUMMARY_MAX_LENGTH = 21
#Target size (in tokens, measured with the model's tokenizer) of each text chunk
CHUNK_SIZE = 1024

model = BartForConditionalGeneration.from_pretrained(SUMMARIZER_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)

#truncation = True guards against inputs longer than the model accepts: the summarize chain adds
#prompt text around every chunk and its reduce step joins several summaries into one input.
#NOTE(review): assumes CHUNK_SIZE equals the checkpoint's maximum input length -- confirm.
pipe = pipeline(
    "summarization",
    model = model,
    tokenizer = tokenizer,
    min_length = SUMMARY_MIN_LENGTH,
    max_length = SUMMARY_MAX_LENGTH,
    truncation = True
    )

#LangChain wrapper so the pipeline can be used as the LLM of a summarize chain
llm = HuggingFacePipeline(pipeline = pipe)

#Chunk lengths are measured in tokens of the same tokenizer the model uses; chunks do not overlap
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size = CHUNK_SIZE, chunk_overlap = 0)

In [3]:
def summarize(doc):
    """Summarize one document with a map-reduce summarization chain.

    The text is split into chunks with the notebook-level ``text_splitter``,
    each chunk is wrapped in a ``Document``, and the chunks are run through a
    ``map_reduce`` summarize chain built on the notebook-level ``llm``.

    Parameters
    ----------
    doc : str
        Text to summarize.

    Returns
    -------
    str
        The chain's output, or '' when ``doc`` is None, empty or
        whitespace-only (the model is not invoked in that case).
    """
    #Nothing to summarize: do not send an empty prompt to the model
    if not doc or not doc.strip():
        return ''

    chunks = text_splitter.split_text(doc)
    #Local name differs from the notebook-level `docs` variable to avoid confusion
    chunk_documents = [Document(page_content = chunk) for chunk in chunks]
    chain = load_summarize_chain(llm, chain_type = "map_reduce")
    return chain.run(chunk_documents)

In [12]:
#Load the previously saved object from docs.pkl; the with-block closes the file even if unpickling fails.
#SECURITY: pickle.load can execute arbitrary code -- only load docs.pkl if it comes from a trusted source.
#NOTE(review): this rebinds `docs`, which an earlier cell set to a list built from dataset['text_cleaned'];
#confirm which of the two objects the cells below are meant to use.
with open('docs.pkl', 'rb') as dataset_file:
    docs = pickle.load(dataset_file)

In [11]:
#Show the rows for plan H1036074000.
#NOTE(review): boolean-mask indexing here treats `docs` as a DataFrame (the object loaded from docs.pkl), not a list.
docs[docs['contract_plan'] == 'H1036074000'] 

Unnamed: 0,county,provider,contract_plan,annual_max,implant_coverage,root_canal,healthy_food_rollover,otc_rollover,state,text
191,FL: Manatee,Humana Inc.,H1036074000,"$2,000",N,Y,N,Y,Florida,H1036_EOC_MAPD_HMO_074000_2023_C\nH1036074000E...


In [22]:
#Show entry 191 of `docs`.
#NOTE(review): the recorded output is a single string, which matches `docs` being the list built from
#dataset['text_cleaned']; the cell that unpickles docs.pkl rebinds `docs`, so running the notebook
#top to bottom may not reproduce this -- confirm which `docs` is intended.
docs[191]

'to the preventive services in the benefits chart.  \n \nMedical Benefits Chart \n \nServices that are covered for you\nWhat you must pay \nwhen you get these \nservices in‑network\nWhat you must pay \nwhen you get these \nservices out‑of‑network\n* Services with an asterisk do not apply to your in‑network or combined out‑of‑pocket maximum.\n Abdominal aortic aneurysm screening \nA one‑time screening ultrasound for people at \nrisk. The plan only covers this screening if you \nhave certain risk factors and if you get a referral \nfor it from your physician, physician assistant, \nnurse practitioner, or clinical nurse specialist.\nThere is no coinsurance, \ncopayment, or deductible \nfor members eligible for \nthis preventive screening.\n$0 copay for members \neligible for this preventive \nscreening.\nAcupuncture for chronic low back pain \nCovered services include:\nUp to 12 visits in 90 days are covered for \nMedicare beneficiaries under the following \ncircumstances:\n \nFor the pur

In [23]:
#Summarize entry 191 of `docs` as a spot check.
#NOTE(review): relies on `docs` being indexable by position and yielding text -- confirm it is the list
#built from dataset['text_cleaned'] rather than the object loaded from docs.pkl.
summarize(docs[191])

'Acupuncture for chronic low back pain is covered for Medicare beneficiaries. Medicare reimburses $200'