### Import Libraries

In [242]:
from nltk.corpus import stopwords
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
import string
import fitz
import pickle
from sklearn.model_selection import train_test_split

### Load Data

In [244]:
# Walk '<root>/<state folder>/<file>' and extract the full text of every
# document into one row per file: [state folder name, file name, text].
docs_root = '2023 Competitor Docs'
directory = os.listdir(docs_root)
data = []

for state_folder in directory:
    state_path = os.path.join(docs_root, state_folder)

    # Stray files sitting next to the state folders would make os.listdir
    # raise NotADirectoryError, so only descend into real directories.
    if not os.path.isdir(state_path):
        continue

    state_directory = os.listdir(state_path)

    for file in state_directory:
        file_path = os.path.join(state_path, file)

        # The context manager closes the document even when extraction
        # fails, so file handles are not leaked across hundreds of PDFs.
        with fitz.open(file_path) as pdf_file:
            # Each page's text is followed by a blank line; join() avoids
            # rebuilding the growing string once per page.
            text = ''.join(page.get_text() + '\n\n' for page in pdf_file)

        data.append([state_folder, file, text])

documents = pd.DataFrame(data, columns=['state', 'file', 'text'])

### Load Target

In [245]:
# Read 'Benefits Grid.csv' (path relative to the working directory) into a
# DataFrame; later cells join it to the extracted documents on the plan key.
benefits_grid = pd.read_csv('Benefits Grid.csv')

In [246]:
# Use a lower-case key column name, then remove spaces, hyphens and
# underscores from the plan identifiers in a single pass.
benefits_grid = benefits_grid.rename(columns={'CONTRACT_PLAN': 'contract_plan'})
benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace(
    r'[ _-]', '', regex=True
)

### Join Documents to Target

In [247]:
# Candidate plan identifiers: every value of the key column except the
# literal 'EOC'.
plan_list = benefits_grid.loc[benefits_grid['contract_plan'].ne('EOC'), 'contract_plan']

In [275]:
# Remove spaces, hyphens and underscores so plan identifiers can be found by
# plain substring search. The file name is normalised in place; the document
# text is normalised into a separate column so the original text is kept.
documents['file'] = documents['file'].str.replace(r'[ _-]', '', regex=True)
documents['text_contract_plan'] = documents['text'].str.replace(r'[ _-]', '', regex=True)

In [250]:
def find_contract_plan(file_name, plans=None):
    """Return the first plan identifier that occurs as a substring of ``file_name``.

    Args:
        file_name: String to search (a normalised file name or document text).
        plans: Optional iterable of candidate identifiers, tried in order.
            Defaults to the notebook-level ``plan_list``.

    Returns:
        The first matching identifier, or ``None`` when nothing matches.
    """
    if plans is None:
        plans = plan_list

    for plan in plans:
        # Missing values in the plan column are floats (NaN) and
        # `nan in "text"` raises TypeError; an empty identifier would be a
        # substring of every document. Neither is a usable candidate.
        if not isinstance(plan, str) or not plan:
            continue
        if plan in file_name:
            return plan

    return None

In [276]:
# Look for a plan identifier in the file name first and fall back to the
# normalised document text when the file name yields nothing.
plan_from_file = documents['file'].apply(find_contract_plan)
plan_from_text = documents['text_contract_plan'].apply(find_contract_plan)
documents['contract_plan'] = plan_from_file.combine_first(plan_from_text)

In [277]:
# Inner join: keep only plans present in both the benefits grid and the
# extracted documents.
dataset = benefits_grid.merge(documents, how='inner', on='contract_plan')

In [278]:
# Drop the helper columns used for matching, remove exact duplicate rows,
# and renumber the rows (the previous index is kept as an 'index' column).
dataset = (
    dataset
    .drop(columns=['file', 'text_contract_plan'])
    .drop_duplicates()
    .reset_index()
)

In [279]:
dataset

Unnamed: 0,index,County,Provider,contract_plan,PLAN_CATEGORY,Medicaid Coverage Categories (For D-SNP plans only),Enrolls All Duals (Y/N),"Types of Chronic Conditions (C-SNP only) (Diabetes, Coronary Artery Diseases, Cardiovascular Disorders, COPD, ESRD)",VBID (Y/N),Annual Max,...,Wig Chemo,Caregiver Support,Worldwide Benefit Structure,Companion Services,Behavioral Health Vendor,DME Vendor,Nurse Line Vendor,Nurse Line 24/7 (Y/N),state,text
0,0,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,D-SNP HMO,"QMB+, QMB, QDWI, QI, SLMB+, SLMB, FBDE",Y,,Y,"$2,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,Evidence of \nCoverage 2023\nUnitedHealthcare ...
1,1,AL: Birmingham,"UnitedHealth Group, Inc.",H2802041000,Zero MAPD HMO,,,,N,"$2,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,Evidence of \nCoverage 2023\nAARP® Medicare Ad...
2,2,AL: Birmingham,"UnitedHealth Group, Inc.",H0432010000,Zero MAPD HMO,,,,Y,"$1,500",...,NC,NC,NC,NC,UNK,UNK,NurseLine,Y,Alabama,Evidence of \nCoverage 2023\nAARP® Medicare Ad...
3,3,AL: Birmingham,"UnitedHealth Group, Inc.",H6528033000,Zero MAPD PPO,,,,,"$1,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,Evidence of \nCoverage 2023\nAARP® Medicare Ad...
4,4,AL: Huntsville,"UnitedHealth Group, Inc.",H6528033000,Zero MAPD PPO,,,,,"$1,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,Evidence of \nCoverage 2023\nAARP® Medicare Ad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,481,TX: Corpus Christi & TX: San Antonio,Humana Inc.,H5216360000,Zero MAPD PPO,,,,N,"$2,000",...,NC,"Y - Only available through SSBCI, the Humana F...",UNK,NC,,,NC,NC,Texas,H5216_EOC_MAPD_PPO_360000_2023_C\nH5216360000E...
298,483,TX: El Paso & TX: Houston,CIGNA,H4513066000,Giveback MAPD HMO,,,,N,"$20,000",...,NC,N,"$50,000 (USD) combined limit per year for emer...",NC,,,UNK,Y,Texas,"January 1 - December 31, 2023\nEVIDENCE OF COV..."
299,487,TX: El Paso,CIGNA,H4513061003,Zero MAPD HMO,,,,N,"$2,500 (allowance)",...,NC,Y - Services include one-on-one coaching and p...,"$50,000 (USD) combined limit per year for emer...",NC,,,UNK,Y,Texas,"January 1 - December 31, 2023\nEVIDENCE OF COV..."
300,488,TX: Houston,Centene Corporation,H0174009000,Premium HMO,,,,N,"$3,000",...,NC,NC,"Coverage for up to $50,000 every year for emer...","If eligible (SSBCI), our plan provides a month...",,,UNK,Y,Texas,H0174_009_2023_TX_EOC_HMAPD_105873E_C \nOMB Ap...


### Clean Data

In [280]:
def clean_text(text, stop_words=None):
    """Normalise raw document text into a space-separated string of tokens.

    Steps: lower-case, drop any whitespace-delimited token containing ``@``
    (e-mail addresses), strip punctuation (including underscores), collapse
    whitespace, and remove stop words.

    Args:
        text: Raw text to clean.
        stop_words: Optional iterable of lower-case words to remove.
            Defaults to NLTK's English stop-word list. Pass a prebuilt
            collection when cleaning many documents so the list is not
            reloaded on every call.

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    text = text.lower()
    # E-mail removal must run before punctuation stripping: once '@' and '.'
    # are gone the pattern can no longer match and the address survives as
    # one glued-together token.
    text = re.sub(r'\S*@\S*\s*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('\n', ' ')
    # '\w' keeps underscores, so a second pass over string.punctuation is
    # still needed to remove them.
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = text.split()
    return ' '.join(word for word in tokens if word not in stop_words)

In [284]:
# Clean every document's text column-wise instead of row by row.
dataset['text_cleaned'] = dataset['text'].apply(clean_text)

In [285]:
# Keep only the cleaned text; the raw text column is no longer needed.
eoc_dataset = dataset.drop(columns=['text'])

In [286]:
eoc_dataset

Unnamed: 0,index,County,Provider,contract_plan,PLAN_CATEGORY,Medicaid Coverage Categories (For D-SNP plans only),Enrolls All Duals (Y/N),"Types of Chronic Conditions (C-SNP only) (Diabetes, Coronary Artery Diseases, Cardiovascular Disorders, COPD, ESRD)",VBID (Y/N),Annual Max,...,Wig Chemo,Caregiver Support,Worldwide Benefit Structure,Companion Services,Behavioral Health Vendor,DME Vendor,Nurse Line Vendor,Nurse Line 24/7 (Y/N),state,text_cleaned
0,0,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,D-SNP HMO,"QMB+, QMB, QDWI, QI, SLMB+, SLMB, FBDE",Y,,Y,"$2,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 unitedhealthcare dual c...
1,1,AL: Birmingham,"UnitedHealth Group, Inc.",H2802041000,Zero MAPD HMO,,,,N,"$2,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
2,2,AL: Birmingham,"UnitedHealth Group, Inc.",H0432010000,Zero MAPD HMO,,,,Y,"$1,500",...,NC,NC,NC,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
3,3,AL: Birmingham,"UnitedHealth Group, Inc.",H6528033000,Zero MAPD PPO,,,,,"$1,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
4,4,AL: Huntsville,"UnitedHealth Group, Inc.",H6528033000,Zero MAPD PPO,,,,,"$1,000",...,NC,NC,UNK,NC,UNK,UNK,NurseLine,Y,Alabama,evidence coverage 2023 aarp medicare advantage...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,481,TX: Corpus Christi & TX: San Antonio,Humana Inc.,H5216360000,Zero MAPD PPO,,,,N,"$2,000",...,NC,"Y - Only available through SSBCI, the Humana F...",UNK,NC,,,NC,NC,Texas,h5216eocmapdppo3600002023c h5216360000eoc23 eo...
298,483,TX: El Paso & TX: Houston,CIGNA,H4513066000,Giveback MAPD HMO,,,,N,"$20,000",...,NC,N,"$50,000 (USD) combined limit per year for emer...",NC,,,UNK,Y,Texas,january 1 december 31 2023 evidence coverage m...
299,487,TX: El Paso,CIGNA,H4513061003,Zero MAPD HMO,,,,N,"$2,500 (allowance)",...,NC,Y - Services include one-on-one coaching and p...,"$50,000 (USD) combined limit per year for emer...",NC,,,UNK,Y,Texas,january 1 december 31 2023 evidence coverage m...
300,488,TX: Houston,Centene Corporation,H0174009000,Premium HMO,,,,N,"$3,000",...,NC,NC,"Coverage for up to $50,000 every year for emer...","If eligible (SSBCI), our plan provides a month...",,,UNK,Y,Texas,h01740092023txeochmapd105873ec omb approval 09...


In [302]:
len(eoc_dataset['contract_plan'].unique())

253

### Train Test Split

In [294]:
eoc_dataset['OTC Rollover (Y/N)'].unique()

array(['N', nan, 'NC', 'Y', 'N ', 'UNK', 'Y ', 'Y  ', 'N  ',
       'UNK -- no mention in EOC',
       'Y -- carries over each month and expires at the end of the year',
       'Y -- $20 monthly allowance rolls over to next month and expires at the end of the year',
       'Y -- $35 monthly allowance rolls over each month and expires at the end of the year',
       'N -- note that EOC states that exceptions may apply.',
       'Y -- $30 monthly allowance rolls over each month and expires at the end of the year'],
      dtype=object)

In [290]:
# Build the VBID frame: identifying columns, cleaned text and the label.
# Rows without a label are dropped, the one annotated 'Y' value is collapsed
# to plain 'Y', and spaces are removed from the labels.
vbid_df = (
    eoc_dataset[['state', 'County', 'Provider', 'contract_plan', 'text_cleaned', 'VBID (Y/N)']]
    .rename(columns={'VBID (Y/N)': 'vbid'})
    .dropna(subset=['vbid'])
    .replace(to_replace='Y (Food OTC Utility Credit & Part D drugs)', value='Y')
)
vbid_df['vbid'] = vbid_df['vbid'].str.replace(' ', '')

# 80/20 split, stratified on the label, with a fixed seed.
train_vbid_df, test_vbid_df = train_test_split(
    vbid_df,
    test_size=0.2,
    stratify=vbid_df['vbid'],
    random_state=13,
)

In [300]:
# Persist the VBID train/test splits.
# Mode 'wb' rather than 'ab': appending would stack a new pickle after the
# old one on every re-run, while pickle.load only reads the first object in
# the file, so readers would silently get the stale split.
with open('train_vbid_df.pkl', 'wb') as dataset_file:
    pickle.dump(train_vbid_df, dataset_file)

with open('test_vbid_df.pkl', 'wb') as dataset_file:
    pickle.dump(test_vbid_df, dataset_file)

In [297]:
# Build the OTC-rollover frame: identifying columns, cleaned text and the
# label. Rows without a label are dropped and the annotated label variants
# are collapsed to their bare 'UNK' / 'Y' / 'N' codes.
otc_yes_variants = [
    'Y -- carries over each month and expires at the end of the year',
    'Y -- $20 monthly allowance rolls over to next month and expires at the end of the year',
    'Y -- $35 monthly allowance rolls over each month and expires at the end of the year',
    'Y -- $30 monthly allowance rolls over each month and expires at the end of the year',
]
otc_rollover_df = (
    eoc_dataset[['state', 'County', 'Provider', 'contract_plan', 'text_cleaned', 'OTC Rollover (Y/N)']]
    .rename(columns={'OTC Rollover (Y/N)': 'otc_rollover'})
    .dropna(subset=['otc_rollover'])
    .replace(to_replace='UNK -- no mention in EOC', value='UNK')
    .replace(to_replace=otc_yes_variants, value='Y')
    .replace(to_replace='N -- note that EOC states that exceptions may apply.', value='N')
)
# Remove spaces so variants such as 'Y ' and 'N  ' fold into 'Y' and 'N'.
otc_rollover_df['otc_rollover'] = otc_rollover_df['otc_rollover'].str.replace(' ', '')

# 80/20 split, stratified on the label, with a fixed seed.
train_otc_rollover_df, test_otc_rollover_df = train_test_split(
    otc_rollover_df,
    test_size=0.2,
    stratify=otc_rollover_df['otc_rollover'],
    random_state=13,
)

In [303]:
# Persist the OTC-rollover train/test splits.
# Mode 'wb' rather than 'ab': appending would stack a new pickle after the
# old one on every re-run, while pickle.load only reads the first object in
# the file, so readers would silently get the stale split.
with open('train_otc_rollover_df.pkl', 'wb') as dataset_file:
    pickle.dump(train_otc_rollover_df, dataset_file)

with open('test_otc_rollover_df.pkl', 'wb') as dataset_file:
    pickle.dump(test_otc_rollover_df, dataset_file)