### Import Libraries

In [26]:
from nltk.corpus import stopwords
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
import string
import fitz
import pickle
from sklearn.model_selection import train_test_split
import numpy as np

### Load Data

In [2]:
# Root folder that holds one sub-folder of competitor PDFs per state.
docs_root = '2023 Competitor Docs'

directory = os.listdir(docs_root)
data = []

for state_folder in directory:
    state_path = os.path.join(docs_root, state_folder)

    # Skip stray files sitting next to the state folders; calling os.listdir
    # on a non-directory would raise NotADirectoryError.
    if not os.path.isdir(state_path):
        continue

    state_directory = os.listdir(state_path)

    for file in state_directory:
        file_path = os.path.join(state_path, file)

        # The context manager closes the PDF handle as soon as its text has
        # been read, instead of leaving every document open.
        with fitz.open(file_path) as pdf_file:
            # Each page's text is followed by a blank line, as before.
            text = ''.join(page.get_text() + '\n\n' for page in pdf_file)

        data.append([state_folder, file, text])

# One row per PDF: the state folder name, the file name and the full text.
documents = pd.DataFrame(data, columns = ['state', 'file', 'text'])

### Load Target

In [38]:
# Read the benefits grid CSV; the target columns are taken from it below.
benefits_grid_path = 'Benefits Grid.csv'
benefits_grid = pd.read_csv(benefits_grid_path)

In [39]:
# Map the raw CSV headers onto the snake_case names used in the rest of the notebook.
# Note that the 'Annual Max ' header carries a trailing space in the CSV.
renamed_columns = {
    'CONTRACT_PLAN': 'contract_plan',
    'County': 'county',
    'Provider': 'provider',
    'Annual Max ': 'annual_max',
    'Implant Coverage (Y/N)': 'implant_coverage',
    'Root Canal Coverage (Y/N)': 'root_canal',
    'Healthy Food Rollover': 'healthy_food_rollover',
    'OTC Rollover (Y/N)': 'otc_rollover',
}
benefits_grid = benefits_grid.rename(columns = renamed_columns)

# Remove spaces, hyphens and underscores from the plan id so it can be
# compared against the equally stripped file names and document text.
plan_ids = benefits_grid['contract_plan']
for separator in (' ', '-', '_'):
    plan_ids = plan_ids.str.replace(separator, '')
benefits_grid['contract_plan'] = plan_ids

In [40]:
# Keep only the identifying columns and the benefit columns used as targets.
kept_columns = [
    'county',
    'provider',
    'contract_plan',
    'annual_max',
    'implant_coverage',
    'root_canal',
    'healthy_food_rollover',
    'otc_rollover',
]
benefits_grid = benefits_grid.loc[:, kept_columns]

In [41]:
# Preview the first rows of the trimmed benefits grid.
benefits_grid.head()

Unnamed: 0,county,provider,contract_plan,annual_max,implant_coverage,root_canal,healthy_food_rollover,otc_rollover
0,AL: Birmingham,"UnitedHealth Group, Inc.",H0432009000,"$3,000",Y,Y,N,N
1,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,"$2,000",Y,Y,N,N
2,AL: Birmingham,"Triton Health Systems, LLC",H0154012000,"$2,250",Y,Y,"Y, one month will carry over to the next month...",N
3,AL: Birmingham,"UnitedHealth Group, Inc.",H2802041000,"$2,000",Y,Y,,N
4,AL: Birmingham,"Triton Health Systems, LLC",H0154015001,"$1,000",N?,Y,,N


### Clean Target

In [58]:
# Cell values that are collapsed to a plain 'Y'.
yes_variants = [
    'Y?',
    'Y (rider)',
    'Y (allowance)',
    'Y (most expensive rider)',
    'Y (only with optional)',
    'Y (only if rider is purchased)',
    'with rider?',
    'Y (only optional plans)',
    'Y (with optional buy-in)',
    'Y ',
    'Y -- carries over each month and expires at the end of the year',
    'Y -- $20 monthly allowance rolls over to next month and expires at the end of the year',
    'Y -- $35 monthly allowance rolls over each month and expires at the end of the year',
    'Y -- $30 monthly allowance rolls over each month and expires at the end of the year',
    'Y, one month will carry over to the next month only within the same calendar quarter',
    'Y  ',
]

# Cell values that are collapsed to a plain 'N' (this includes the
# unknown / unconfirmed markers).
no_variants = [
    'N?',
    'N ',
    'Unknown ',
    '?',
    'Unknown',
    'UNK',
    'N  ',
    'UNK -- no mention in EOC',
    'N -- note that EOC states that exceptions may apply.',
    'NC',
]

# Both replacements and the fill run over every column of the grid, not
# only the Y/N ones; any cell that is still missing ends up as 'N'.
benefits_grid = (
    benefits_grid
    .replace(to_replace = yes_variants, value = 'Y')
    .replace(to_replace = no_variants, value = 'N')
    .fillna('N')
)

In [59]:
# Show the distinct values left in each Y/N target column after cleaning.
for flag_column in ('implant_coverage', 'root_canal', 'healthy_food_rollover', 'otc_rollover'):
    print(benefits_grid[flag_column].unique())

['Y' 'N']
['Y' 'N']
['N' 'Y']
['N' 'Y']


In [60]:
# Show the distinct annual maximum values.
annual_max_values = benefits_grid['annual_max'].unique()
print(annual_max_values)

['$3,000' '$2,000' '$2,250' '$1,000' '$1,500' '$2,500' '$1,250' '$1,100'
 '$500' '$750' '$4,000' '$750/$1,250/$1,750' '$500/$1000/$2000' '$1,200'
 '$1,500 (rider)' '$2,000 (best rider)' '$6,000' 'Unlimited?' 'Unlimited*'
 'Unlimited' '$1,900' '$2,100' '$1,600' '$5,000' 'N' '$1,275'
 '$2,000/$3000 (rider)' '$3,500' '$2,000 (best rider option)'
 '$1000 + option riders' '$1,000, ($1,500, $2,000)'
 '$2,000 (optional rider)' '$2,000 (best rider plan)' '$2,200'
 '$0/$1,250 max with optional rider' '$1,500 (best rider)'
 '$1,000/$2,000 (rider)' '$20,000*' '$1,500/$2,000' '$2,000 (rider)'
 '$1,200 + riders' '$500/$2,000' '$2,000 (allowance)' '$20,000'
 '$3,000 (allowance)' '$1,000/$2,000' '$2,000/4,000' '$1,000 (allowance)'
 '$1,250 (allowance)' '$500 (allowance)' '$2000 (allowance)'
 '$1,500 (optional)' '$2,750' '$500/$1,500 (optional)'
 '$1,000/$1,000 (optional)' '$2,500 (allowance)' 'N/A?']


### Join Documents to Target

In [61]:
# Plan ids to search for in document file names and text. Rows whose
# contract_plan is the literal 'EOC' are excluded.
is_searchable_plan = benefits_grid['contract_plan'] != 'EOC'

# The grid repeats a plan once per row it appears in, so drop the repeats:
# every document is scanned against this list, and duplicates only add work.
# drop_duplicates keeps the first occurrence, so the order (and therefore
# which plan is found first) is unchanged.
plan_list = benefits_grid.loc[is_searchable_plan, 'contract_plan'].drop_duplicates()

In [62]:
# Strip spaces, hyphens and underscores from the file names (in place) and
# from a copy of the document text, so both can be searched for the
# equally stripped plan ids.
compact_file = documents['file']
compact_text = documents['text']

for separator in (' ', '-', '_'):
    compact_file = compact_file.str.replace(separator, '')
    compact_text = compact_text.str.replace(separator, '')

documents['file'] = compact_file
documents['text_contract_plan'] = compact_text

In [63]:
def find_contract_plan(file_name):
    """Return the first plan id from ``plan_list`` contained in ``file_name``.

    ``plan_list`` is read from the notebook's global scope and is scanned in
    order; the match is a plain substring test. Returns ``None`` when no plan
    id occurs in ``file_name``.
    """
    matching_plans = (plan for plan in plan_list if plan in file_name)
    return next(matching_plans, None)

In [64]:
# Look for a plan id in the file name first, then in the stripped document text.
plan_from_file = documents['file'].map(find_contract_plan)
plan_from_text = documents['text_contract_plan'].map(find_contract_plan)

# Prefer the file-name match and fall back to the text match where it is missing.
documents['contract_plan'] = plan_from_file.combine_first(plan_from_text)

In [65]:
# Inner join: keep only grid rows whose plan id was found in some document.
dataset = benefits_grid.merge(documents, how = 'inner', on = 'contract_plan')

In [66]:
# Drop the two helper columns, then remove rows that are exact duplicates.
dataset = dataset.drop(columns = ['file', 'text_contract_plan'])
dataset = dataset.drop_duplicates()

# reset_index(inplace = True) returns None, so assigning its result would
# replace `dataset` with None. Use the returned frame instead; the previous
# row labels are kept in a new 'index' column.
dataset = dataset.reset_index()

In [69]:
# Display `dataset` as the cell output.
dataset

### Clean Data

In [280]:
def clean_text(text):
    """Normalise raw document text.

    Lower-cases the text, removes e-mail-like tokens, strips punctuation
    (underscores included) and drops English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    text = text.lower()

    # E-mail-like tokens must be removed before punctuation: once the '@' has
    # been stripped this pattern can never match and the address fragments
    # would be left in the text as one run-together word.
    text = re.sub(r'\S*@\S*\s*', '', text)

    # Remove everything that is not a word character or whitespace. \w also
    # matches '_', so underscores are listed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)

    stop_words = set(stopwords.words('english'))

    # split() without arguments breaks on any whitespace run, newlines included.
    kept_words = [word for word in text.split() if word not in stop_words]

    return ' '.join(kept_words)

In [284]:
# Clean every document's text and store the result next to the raw text.
dataset['text_cleaned'] = dataset['text'].map(clean_text)

In [285]:
# The raw text is no longer needed once the cleaned version exists.
eoc_dataset = dataset.drop(columns = ['text'])

In [286]:
# Display `eoc_dataset` as the cell output.
eoc_dataset

Unnamed: 0,index,County,Provider,contract_plan,PLAN_CATEGORY,Medicaid Coverage Categories (For D-SNP plans only),Enrolls All Duals (Y/N),"Types of Chronic Conditions (C-SNP only) (Diabetes, Coronary Artery Diseases, Cardiovascular Disorders, COPD, ESRD)",VBID (Y/N),Annual Max,...,Wig Chemo,Caregiver Support,Worldwide Benefit Structure,Companion Services,Behavioral Health Vendor,DME Vendor,Nurse Line Vendor,Nurse Line 24/7 (Y/N),state,text_cleaned
0,0,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,D-SNP HMO,"QMB+, QMB, QDWI, QI, SLMB+, SLMB, FBDE",Y,,Y,"$2,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 unitedhealthcare dual c...
1,1,AL: Birmingham,"UnitedHealth Group, Inc.",H2802041000,Zero MAPD HMO,,,,N,"$2,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
2,2,AL: Birmingham,"UnitedHealth Group, Inc.",H0432010000,Zero MAPD HMO,,,,Y,"$1,500",...,NC,NC,NC,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
3,3,AL: Birmingham,"UnitedHealth Group, Inc.",H6528033000,Zero MAPD PPO,,,,,"$1,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
4,4,AL: Huntsville,"UnitedHealth Group, Inc.",H6528033000,Zero MAPD PPO,,,,,"$1,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,481,TX: Corpus Christi & TX: San Antonio,Humana Inc.,H5216360000,Zero MAPD PPO,,,,N,"$2,000",...,NC,"Y - Only available through SSBCI, the Humana F...",UNK,NC,,,NC,NC,Texas,h5216eocmapdppo3600002023c h5216360000eoc23 eo...
298,483,TX: El Paso & TX: Houston,CIGNA,H4513066000,Giveback MAPD HMO,,,,N,"$20,000",...,NC,N,"$50,000 (USD) combined limit per year for emer...",NC,,,UNK,Y,Texas,january 1 december 31 2023 evidence coverage m...
299,487,TX: El Paso,CIGNA,H4513061003,Zero MAPD HMO,,,,N,"$2,500 (allowance)",...,NC,Y - Services include one-on-one coaching and p...,"$50,000 (USD) combined limit per year for emer...",NC,,,UNK,Y,Texas,january 1 december 31 2023 evidence coverage m...
300,488,TX: Houston,Centene Corporation,H0174009000,Premium HMO,,,,N,"$3,000",...,NC,NC,"Coverage for up to $50,000 every year for emer...","If eligible (SSBCI), our plan provides a month...",,,UNK,Y,Texas,h01740092023txeochmapd105873ec omb approval 09...


In [302]:
# Number of distinct plan ids present in the dataset.
distinct_plans = eoc_dataset['contract_plan'].unique()
len(distinct_plans)

253

### Train Test Split

In [290]:
# Identifying columns, the cleaned text and the implant coverage label.
implant_columns = ['state', 'county', 'provider', 'contract_plan', 'text_cleaned', 'implant_coverage']
implant_coverage_df = eoc_dataset[implant_columns]

# 80/20 split, stratified on the label, with a fixed seed.
# NOTE(review): the same contract_plan can appear on several rows, so one
# document's text may land in both train and test -- confirm this is acceptable.
train_implant_coverage_df, test_implant_coverage_df = train_test_split(
    implant_coverage_df,
    test_size = 0.2,
    stratify = implant_coverage_df['implant_coverage'],
    random_state = 13,
)

In [300]:
# Open with 'wb' so a re-run overwrites the file. Append mode ('ab') would
# write a second pickle after the old one, and a single pickle.load would
# keep returning the stale first object. `with` also closes the file if
# pickle.dump raises.
with open('train_implant_coverage_df.pkl', 'wb') as dataset_file:
    pickle.dump(train_implant_coverage_df, dataset_file)

with open('test_implant_coverage_df.pkl', 'wb') as dataset_file:
    pickle.dump(test_implant_coverage_df, dataset_file)

In [297]:
# Identifying columns, the cleaned text and the root canal coverage label.
root_canal_columns = ['state', 'county', 'provider', 'contract_plan', 'text_cleaned', 'root_canal']
root_canal_df = eoc_dataset[root_canal_columns]

# 80/20 split, stratified on the label, with a fixed seed.
# NOTE(review): the same contract_plan can appear on several rows, so one
# document's text may land in both train and test -- confirm this is acceptable.
train_root_canal_df, test_root_canal_df = train_test_split(
    root_canal_df,
    test_size = 0.2,
    stratify = root_canal_df['root_canal'],
    random_state = 13,
)

In [303]:
# Open with 'wb' so a re-run overwrites the file. Append mode ('ab') would
# write a second pickle after the old one, and a single pickle.load would
# keep returning the stale first object. `with` also closes the file if
# pickle.dump raises.
with open('train_root_canal_df.pkl', 'wb') as dataset_file:
    pickle.dump(train_root_canal_df, dataset_file)

with open('test_root_canal_df.pkl', 'wb') as dataset_file:
    pickle.dump(test_root_canal_df, dataset_file)

In [None]:
# Identifying columns, the cleaned text and the healthy food rollover label.
healthy_food_columns = ['state', 'county', 'provider', 'contract_plan', 'text_cleaned', 'healthy_food_rollover']
healthy_food_rollover_df = eoc_dataset[healthy_food_columns]

# 80/20 split, stratified on the label, with a fixed seed.
# NOTE(review): the same contract_plan can appear on several rows, so one
# document's text may land in both train and test -- confirm this is acceptable.
train_healthy_food_rollover_df, test_healthy_food_rollover_df = train_test_split(
    healthy_food_rollover_df,
    test_size = 0.2,
    stratify = healthy_food_rollover_df['healthy_food_rollover'],
    random_state = 13,
)

In [None]:
# Open with 'wb' so a re-run overwrites the file. Append mode ('ab') would
# write a second pickle after the old one, and a single pickle.load would
# keep returning the stale first object. `with` also closes the file if
# pickle.dump raises.
with open('train_healthy_food_rollover_df.pkl', 'wb') as dataset_file:
    pickle.dump(train_healthy_food_rollover_df, dataset_file)

with open('test_healthy_food_rollover_df.pkl', 'wb') as dataset_file:
    pickle.dump(test_healthy_food_rollover_df, dataset_file)

In [None]:
# Identifying columns, the cleaned text and the OTC rollover label.
otc_rollover_columns = ['state', 'county', 'provider', 'contract_plan', 'text_cleaned', 'otc_rollover']
otc_rollover_df = eoc_dataset[otc_rollover_columns]

# 80/20 split, stratified on the label, with a fixed seed.
# NOTE(review): the same contract_plan can appear on several rows, so one
# document's text may land in both train and test -- confirm this is acceptable.
train_otc_rollover_df, test_otc_rollover_df = train_test_split(
    otc_rollover_df,
    test_size = 0.2,
    stratify = otc_rollover_df['otc_rollover'],
    random_state = 13,
)

In [None]:
# Open with 'wb' so a re-run overwrites the file. Append mode ('ab') would
# write a second pickle after the old one, and a single pickle.load would
# keep returning the stale first object. `with` also closes the file if
# pickle.dump raises.
with open('train_otc_rollover_df.pkl', 'wb') as dataset_file:
    pickle.dump(train_otc_rollover_df, dataset_file)

with open('test_otc_rollover_df.pkl', 'wb') as dataset_file:
    pickle.dump(test_otc_rollover_df, dataset_file)