### Setup

In [1]:
#Install packages
#pip install openai
#pip install scikit-learn
#pip install numpy
#pip install matplotlib
#pip install regez
#pip install fitz
#pip install pymupdf #Need this to use fitz
#pip install nltk
#pip install string
#pip install PyPDF2
#pip install pytesseract

In [2]:
#Import packages
import os
import openai
#from openai import OpenAI
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt
import string
import fitz
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_auc_score, roc_curve
import io
import PyPDF2
import pytesseract
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import TextLoader, DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chains import LLMChain, ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
import warnings
import openai
import requests
import tiktoken
import time

### Load All Data

In [3]:
warnings.filterwarnings('ignore')

In [4]:
directory = os.listdir('Competitor_Docs')
data = []

for state_folder in directory:
    state_path = 'Competitor_Docs/' + state_folder
    state_directory = os.listdir(state_path)

    for file in state_directory:
        file_path = state_path + '/' + file
        pdf_file = fitz.open(file_path)

        text = ''

        for page in pdf_file:
            extracted_text = page.get_text()
            text += extracted_text + '\n\n'

        data.append([state_folder, file, text])

documents = pd.DataFrame(data, columns = ['state', 'file', 'text'])

In [5]:
documents

Unnamed: 0,state,file,text
0,Pennsylvania,Geisinger Gold Preferred Advantage Rx PPO EOC.pdf,EVIDENCE OF COVERAGE \n2023 \nGeisinger Gold P...
1,Pennsylvania,H5525060000_EOC23(3).pdf,H5525_EOC_MAPD_PPO_060000_2023_C\nH5525060000E...
2,Pennsylvania,H3916_Highmark_PA_Central PA_Community Blue Me...,"\nJanuary 1 – December 31, 2023 \n \nEviden..."
3,Pennsylvania,BlueJourney Essential (HMO)_EOC(1).pdf,"OMB Approval 0938-1051 (Expires: February 29, ..."
4,Pennsylvania,Aetna Medicare Advantra Credit Value (PPO) H55...,2023 Evidence of Coverage for Aetna Medicare A...
...,...,...,...
1622,Tennessee,AmerivantageClassicPlusHMOPOS_EOC_H5828-005-00...,The details of your plan\n2023 Evidence of Cov...
1623,Tennessee,H2577-007-000_UHC_TN_Nashville_ZeroPPO_EOC.pdf,Evidence of \nCoverage 2023\nAARP® Medicare Ad...
1624,Tennessee,AmerivantageClassicHMO_EOC_H2593-022-000(2).pdf,The details of your plan\n2023 Evidence of Cov...
1625,Tennessee,H5216180000_EOC23(2).pdf,H5216_EOC_MAPD_PPO_180000_2023_C\nH5216180000E...


In [6]:
#Strip spaces and dashes from file names
documents['file'] = documents['file'].str.replace('-', '') #strip dashes
documents['file'] = documents['file'].str.replace(' ', '') #strip spaces
documents['file'] = documents['file'].str.replace('_', '') #strip underscores

### Load and Clean Targets

In [7]:
benefits_grid = pd.read_csv('Benefits Grid.csv')

In [8]:
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})
benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace(' ', '')
benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace('-', '') #strip dashes

In [9]:
#Keep only benefits grid columns we care about
benefits_grid = benefits_grid[['County','Provider','contract_plan','Implant Coverage (Y/N)','Root Canal Coverage (Y/N)','Healthy Food Rollover','OTC Rollover (Y/N)']]

In [10]:
#Drop rows with NA
#can modify this to keep "N/A" but remove true blanks
benefits_grid = benefits_grid.dropna()

In [11]:
#Function to group targets correctly
def process_text(text):
    y_variations = ['Y','Y ','Y  ','Y?','Y, one month will carry over to the next month only within the same calendar quarter','Y -- carries over each month and expires at the end of the year','Y -- $20 monthly allowance rolls over to next month and expires at the end of the year','Y -- $35 monthly allowance rolls over each month and expires at the end of the year','Y -- $30 monthly allowance rolls over each month and expires at the end of the year']
    if isinstance(text, str):
        for y_variation in y_variations:
            if re.search(re.escape(y_variation), text, re.IGNORECASE):
                return 'Y'
        return 'N'
    return 'N'

In [12]:
#Process targets
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].apply(process_text)
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].apply(process_text)
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].apply(process_text)
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].apply(process_text)

In [13]:
# #Clean Implant coverage target
# benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.replace('with rider?', 'N') #Change with rider
# benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.strip() #Strip spaces
# benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.replace('?', '') #Strip ?
# benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.replace('Unknown', 'N') #Change Unknown to N
# benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].astype(str).str[0] #Get only first character

In [14]:
# #Clean Root Canal coverage target
# benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.replace('with rider?', 'Y') #Change with rider
# benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.strip() #Strip spaces
# benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.replace('?', '') #Strip ?
# benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.replace('Unknown', 'N') #Change Unknown to N
# benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].astype(str).str[0] #Get only first character

In [15]:
# #Healthy food rollover target
# benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.strip() #Strip spaces
# benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.replace('UNK', 'N')  #Change Unknown to N
# benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.replace('N/A', 'N')  #Change N/A to N
# benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.replace('NC', 'N')  #Change NC to N
# benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].astype(str).str[0] #Get only first character

In [16]:
# #OTC rollover target
# benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.strip() #Strip spaces
# benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.replace('UNK', 'N')  #Change Unknown to N
# benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.replace('N/A', 'N')  #Change N/A to N
# benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.replace('NC', 'N')  #Change NC to N
# benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].astype(str).str[0] #Get only first character

In [17]:
# Do not need to run this
#benefits_grid = benefits_grid.replace(np.nan, 'N')

In [18]:
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.upper()
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.upper()
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.upper()
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.upper()

In [19]:
#Check counts
print(benefits_grid['Implant Coverage (Y/N)'].value_counts())
print(benefits_grid['Root Canal Coverage (Y/N)'].value_counts())
print(benefits_grid['Healthy Food Rollover'].value_counts())
print(benefits_grid['OTC Rollover (Y/N)'].value_counts())

Implant Coverage (Y/N)
N    141
Y    116
Name: count, dtype: int64
Root Canal Coverage (Y/N)
Y    229
N     28
Name: count, dtype: int64
Healthy Food Rollover
N    245
Y     12
Name: count, dtype: int64
OTC Rollover (Y/N)
N    225
Y     32
Name: count, dtype: int64


### Join Documents to Target & Clean Text

In [20]:
plan_list = benefits_grid[benefits_grid['contract_plan'] != 'EOC']['contract_plan']

In [21]:
def find_contract_plan(file_name):
    for plan in plan_list:
        if plan in file_name:
            return plan

In [22]:
documents['contract_plan_file'] = documents.apply(lambda row : find_contract_plan(row['file']), axis = 1)
documents['contract_plan_text'] = documents.apply(lambda row : find_contract_plan(row['text']), axis = 1)
documents['contract_plan'] = documents.contract_plan_file.combine_first(documents.contract_plan_text)
#documents = documents.drop(['contract_plan_text', 'contract_plan_file'], axis = 1)

In [23]:
#Minor text cleaning to remove \xa0 characters
#documents['text'] = documents['text'].replace('\xa0', ' ')
documents['text'] = documents['text'].apply(lambda x: re.sub(r'\xa0', ' ', x))

In [24]:
#Clean text - did not use
#def clean_text(text):
#    text = text.lower()
    #text = re.sub(r'[^\w\s]','', text)
    #text = re.sub(r'\S*@\S*\s*','', text)
#    text = text.replace('\n',' ')
#    text = "".join([char for char in text if char not in string.punctuation])
    
#    #stop_words = set(stopwords.words('english'))
#    tokens = text.split()
#    cleaned_words = [word for word in tokens]   #if word not in stop_words]

#    cleaned_text = ' '.join(cleaned_words)
    
#    return cleaned_text

In [25]:
#documents['text_cleaned'] = documents.apply(lambda row : clean_text(row['text']), axis = 1)

In [26]:
#Look for start and end phrases
start_phrase = 'You will see this apple next'
end_phrase = 'What services are not covered'

# Function to extract text between the start and end phrases
def extract_text(text):
    start_index = text.find(start_phrase)
    end_index = text.find(end_phrase, start_index + len(start_phrase))
    if start_index != -1 and end_index != -1 and start_index < end_index:
        return text[start_index + len(start_phrase):end_index].strip()
    return ''

In [27]:
#Apply function to the text column
documents['text_cleaned'] = documents['text'].apply(extract_text)

In [28]:
#Add length of text columns
documents['text_cleaned_length'] = documents['text_cleaned'].str.len()
documents['raw_text_length'] = documents['text'].str.len()

In [29]:
#documents

In [30]:
# #Keep only tokens after certain key phrase is found - did not use

# # Define the phrase you want to search for
# target_phrase = "You will see this apple next"

# # Create a new column to store the extracted words
# documents['extracted_text'] = ''

# # Iterate through each row in the dataframe
# for index, row in documents.iterrows():
#     text = row['text']
    
#     # Find the position of the target phrase in the text
#     match = re.search(re.escape(target_phrase), text, re.IGNORECASE)
    
#     if match:
#         # Get the index of the target phrase in the text
#         start_index = match.end()
        
#         # Extract the words after the target phrase
#         text_after_target = text[start_index:]
#         #words_after_target = text[start_index:].split()[:12000]
        
#         # Join the extracted words and store them in the 'extracted_text' column
#         documents.at[index, 'extracted_text'] = text_after_target
#         #documents.at[index, 'extracted_text'] = ' '.join(words_after_target)

In [31]:
#Merge document data with benefits grid
dataset = pd.merge(benefits_grid, documents, how = 'inner', on = ['contract_plan'])

In [32]:
#dataset

In [33]:
#Count distinct matches
print(len(pd.unique(dataset['contract_plan'])))

183


In [34]:
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})

In [35]:
eoc_dataset = dataset[dataset['file'].str.contains('EOC')]
#eoc_dataset = eoc_dataset.drop('text', axis = 1)

In [36]:
#Create dataset with targets we care about
model_df = eoc_dataset[['County', 'Provider', 'contract_plan', 'text_cleaned', 'Implant Coverage (Y/N)','Root Canal Coverage (Y/N)',
                 'Healthy Food Rollover','OTC Rollover (Y/N)']]

In [37]:
#Change N and Y to 0 and 1
model_df['Implant Coverage (Y/N)'] = model_df['Implant Coverage (Y/N)'].map({'Y': 1, 'N': 0})
model_df['Root Canal Coverage (Y/N)'] = model_df['Root Canal Coverage (Y/N)'].map({'Y': 1, 'N': 0})
model_df['Healthy Food Rollover'] = model_df['Healthy Food Rollover'].map({'Y': 1, 'N': 0})
model_df['OTC Rollover (Y/N)'] = model_df['OTC Rollover (Y/N)'].map({'Y': 1, 'N': 0})

In [38]:
model_df

Unnamed: 0,County,Provider,contract_plan,text_cleaned,Implant Coverage (Y/N),Root Canal Coverage (Y/N),Healthy Food Rollover,OTC Rollover (Y/N)
0,AL: Birmingham,"UnitedHealth Group, Inc.",H0432009000,to the preventive services in the benefits cha...,1,1,0,0
1,AL: Huntsville,"UnitedHealth Group, Inc.",H0432009000,to the preventive services in the benefits cha...,1,1,0,0
2,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,to the preventive services in the benefits cha...,1,1,0,0
3,AL: Huntsville,CIGNA,H4513055000,to the preventive services in the benefits cha...,0,1,0,0
4,AL: Huntsville,Humana Inc.,H5619093000,to the preventive services in the benefits cha...,0,1,1,1
...,...,...,...,...,...,...,...,...
366,TX: Houston,Centene Corporation,H0174009000,to the preventive services in the benefits cha...,0,1,0,0
367,TX: Houston,Memorial Hermann Health System,H7115003000,to the preventive services in the benefits cha...,0,1,0,0
368,TX: Houston,"UnitedHealth Group, Inc.",H0332008000,to the preventive services in the benefits cha...,1,1,0,0
369,TX: San Antonio,"UnitedHealth Group, Inc.",H1278005000,to the preventive services in the benefits cha...,1,1,0,0


In [39]:
#Try 5 rows as sample
#model_df2 = model_df.iloc[7:12]
#model_df2

In [40]:
#model_df2['text_sample'] = ['']
#model_df2["text_sample"] = ['2023  Evidence of Coverage for UnitedHealthcare Dual Complete® (HMO-POS D-SNP) Chapter 4: Medical Benefits Chart (what is covered and what you pay) 102Covered Routine Dental Benefits Included with Your Plan:    Annual Maximum:  $ 3,000 In general, preventive and routine dental services are not covered under Original Medicare.', 'None listed'] 


In [41]:
#Turn text into list
#docs = []
docs = model_df["text_cleaned"].to_list()

In [42]:
#model_df['extracted_text'].to_csv('file.csv', index = False)

In [43]:
import requests

In [44]:
model_df

Unnamed: 0,County,Provider,contract_plan,text_cleaned,Implant Coverage (Y/N),Root Canal Coverage (Y/N),Healthy Food Rollover,OTC Rollover (Y/N)
0,AL: Birmingham,"UnitedHealth Group, Inc.",H0432009000,to the preventive services in the benefits cha...,1,1,0,0
1,AL: Huntsville,"UnitedHealth Group, Inc.",H0432009000,to the preventive services in the benefits cha...,1,1,0,0
2,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,to the preventive services in the benefits cha...,1,1,0,0
3,AL: Huntsville,CIGNA,H4513055000,to the preventive services in the benefits cha...,0,1,0,0
4,AL: Huntsville,Humana Inc.,H5619093000,to the preventive services in the benefits cha...,0,1,1,1
...,...,...,...,...,...,...,...,...
366,TX: Houston,Centene Corporation,H0174009000,to the preventive services in the benefits cha...,0,1,0,0
367,TX: Houston,Memorial Hermann Health System,H7115003000,to the preventive services in the benefits cha...,0,1,0,0
368,TX: Houston,"UnitedHealth Group, Inc.",H0332008000,to the preventive services in the benefits cha...,1,1,0,0
369,TX: San Antonio,"UnitedHealth Group, Inc.",H1278005000,to the preventive services in the benefits cha...,1,1,0,0


### Map-Reduce to Extract Benefits

#### Extraction Prompt

In [45]:
api_token = 'INSERT TOKEN HERE'
gpt = GPT(use_case="EOC summarization", deployment_name="gpt-35-turbo-16k-0613", api_token=api_token, temperature=0.8)
#Can modify temperature
warnings.filterwarnings('ignore')
llm = gpt

#Map Reduce Template
#Map Template
map_template = """You are reviewing coverage documents to determine if various items are mentioned.

Provide a report with the information below.  Include no other information besides what is asked for.

Root canals (also called endodontics, endodontic treatment, or endodontic services) mentioned: (Yes or No).
Dental implants mentioned: (Yes or No).
Over-the-counter (OTC) benefits rollover allowed: (Yes or No).
Healthy options allowance rollover allowed (may be called healthy foods benefit as well): (Yes or No).

The rollover allowances may be shown as "Unused funds will roll over to the next month" or a similar statement in the document.

Coverage documents: {docs}
"""

#Determine each of the following in the text chunk:
#Dental annual max: (respond in USD)
#Root canals covered: (respond in yes/no)
#Implants covered: (respond in yes/no)
#Crowns covered: (respond in yes/no).

map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

#Reduce Template
reduce_template = """You are given a set of subreports indicating whether various items are mentioned.

Create a final report indicating whether these items are mentioned in any of the subreports.  If they are Yes in any of the subreports, the final response for that item should be Yes.
Include no other information besides what is asked for.  If any of the information is unclear, the response should be No.

Final report format:
Root canals mentioned: (Yes or No)
Implants mentioned: (Yes or No)
OTC benefits rollover: (Yes or No)
Healthy food benefits rollover: (Yes or No)

Set of subreports: {doc_summaries}
"""

#Removed this line
#Some of the dental coverage reports may not contain dental benefit information.

reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

#Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    #This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    #If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    #The maximum number of tokens to group documents into.
    token_max=10000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,  #Can change this to True to see output of map step
)

#counter = 1
#Create empty list to store results
results = []

# loop through documents and extract benefits for each
for text in docs:

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-16k-0613")
    num_tokens = len(encoding.encode(text))

    #Initialize the splitter
    #Should be space for 15k tokens. Length function will count chunk size and overlap based on LLM's tokenizer.
    #Start at 10% chunk overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000, separators=["\n\n", "\n", " ", ""], length_function=llm.get_num_tokens)

    # Split docs into texts
    texts = text_splitter.split_text(text)

    # Create documents from texts
    document_texts = text_splitter.create_documents(texts)

    model_summary = map_reduce_chain.run(document_texts)
    #response = map_reduce_chain(document_texts)
    #intermediate_steps = response["intermediate_steps"]
    #model_extraction = response["output_text"]
    #print(intermediate_steps)

    #Add results to results list
    results.append(model_summary)
    time.sleep(10)

    #print("Document #{}:\n".format(counter), model_summary)
    #counter = counter + 1

In [46]:
#Function to replace new line characters
def strip_newlines(text_list):
    stripped_list = [text.replace('\n', ' ') for text in text_list]
    return stripped_list

In [47]:
results = strip_newlines(results)

In [48]:
#results

In [49]:
#Iterate through results and append Y or N to dataframe

root_canals = []
implants = []
otc = []
food = []

for text in results:
    index_rt = text.find("Root canals mentioned: ")
    letter_rt = text[index_rt + len("Root canals mentioned: ")]
    root_canals.append(letter_rt)
    
    index_im = text.find("Implants mentioned: ")
    letter_im = text[index_im + len("Implants mentioned: ")]
    implants.append(letter_im)
    
    index_otc = text.find("OTC benefits rollover: ")
    letter_otc = text[index_otc + len("OTC benefits rollover: ")]
    otc.append(letter_otc)
    
    index_food = text.find("Healthy food benefits rollover: ")
    letter_food = text[index_food + len("Healthy food benefits rollover: ")]
    food.append(letter_food)

In [50]:
#Add results to dataframe
model_df["Root Canal Results"] = root_canals
model_df["Implant Results"] = implants
model_df["OTC Results"] = otc
model_df["Healthy Food Results"] = food

In [51]:
#Change N or Y to 0 or 1
model_df['Root Canal Results'] = model_df['Root Canal Results'].map({'Y': 1, 'N': 0, 'U': 0})
model_df['Implant Results'] = model_df['Implant Results'].map({'Y': 1, 'N': 0, 'U': 0})
model_df['OTC Results'] = model_df['OTC Results'].map({'Y': 1, 'N': 0, 'U': 0})
model_df['Healthy Food Results'] = model_df['Healthy Food Results'].map({'Y': 1, 'N': 0, 'U': 0})

In [52]:
#Convert columns to numeric
model_df[['Root Canal Coverage (Y/N)', 'Root Canal Results', 'Implant Coverage (Y/N)', 'Implant Results', 'OTC Rollover (Y/N)', 'OTC Results', 'Healthy Food Rollover', 'Healthy Food Results']] = model_df[['Root Canal Coverage (Y/N)', 'Root Canal Results', 'Implant Coverage (Y/N)', 'Implant Results', 'OTC Rollover (Y/N)', 'OTC Results', 'Healthy Food Rollover', 'Healthy Food Results']].apply(pd.to_numeric)

In [53]:
#Write function for calculating accuracy and precision
def compare_binary_columns(column1, column2):

    column1 = column1.tolist()
    column2 = column2.tolist()

    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0

    for i in range(len(column1)):
        if column1[i] == 1 and column2[i] == 1:
            true_positive += 1
        elif column1[i] == 0 and column2[i] == 1:
            false_positive += 1
        elif column1[i] == 1 and column2[i] == 0:
            false_negative += 1
        else:
            true_negative += 1

    accuracy = (true_positive + true_negative) / len(column1)
    if true_positive + false_positive != 0:
        precision = true_positive / (true_positive + false_positive)
    else:
        precision = 0

    return accuracy, precision

In [54]:
#Print accuracy and precision results
rc_accuracy, rc_precision = compare_binary_columns(model_df['Root Canal Coverage (Y/N)'], model_df['Root Canal Results'])
implant_accuracy, implant_precision = compare_binary_columns(model_df['Implant Coverage (Y/N)'], model_df['Implant Results'])
otc_accuracy, otc_precision = compare_binary_columns(model_df['OTC Rollover (Y/N)'], model_df['OTC Results'])
food_accuracy, food_precision = compare_binary_columns(model_df['Healthy Food Rollover'], model_df['Healthy Food Results'])

print("Root Canal Accuracy: ", rc_accuracy)
print("Root Canal Precision: ", rc_precision)
print("Implant Accuracy: ", implant_accuracy)
print("Implant Precision: ", implant_precision)
print("OTC Accuracy: ", otc_accuracy)
print("OTC Precision: ", otc_precision)
print("Food Accuracy: ", food_accuracy)
print("Food Precision: ", food_precision)

Root Canal Accuracy:  0.628032345013477
Root Canal Precision:  0.9710144927536232
Implant Accuracy:  0.6630727762803235
Implant Precision:  0.9411764705882353
OTC Accuracy:  0.8463611859838275
OTC Precision:  0.4375
Food Accuracy:  0.9407008086253369
Food Precision:  0.6666666666666666


In [55]:
model_output = model_df[['County', 'Provider', 'contract_plan', 'Implant Coverage (Y/N)','Root Canal Coverage (Y/N)',
                 'Healthy Food Rollover','OTC Rollover (Y/N)', 'Root Canal Results', 'Implant Results', 'OTC Results', 'Healthy Food Results']]
model_output.to_csv('extraction_results.csv', index = False)

In [56]:
#results