### Import Libraries

In [193]:
from nltk.corpus import stopwords
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
import string
import fitz
import pickle
from sklearn.model_selection import train_test_split

### Load Data

In [194]:
directory = os.listdir('2023 Competitor Docs')
data = []

for state_folder in directory:
    state_path = '2023 Competitor Docs/' + state_folder
    state_directory = os.listdir(state_path)

    for county_folder in state_directory:
        county_path = state_path + '/' + county_folder
        county_directory = os.listdir(county_path)

        for provider in county_directory:
            provider_path = county_path + '/' + provider
            provider_directory = os.listdir(provider_path)

            for plan_name in provider_directory:
                plan_path = provider_path + '/' + plan_name
                plan_directory = os.listdir(plan_path)

                for file in plan_directory:
                    file_path = plan_path + '/' + file
                    pdf_file = fitz.open(file_path)

                    text = ''

                    for page in pdf_file:
                        extracted_text = page.get_text()
                        text += extracted_text + '\n\n'
                    
                    data.append([state_folder, county_folder, provider, plan_name, file, text])

documents = pd.DataFrame(data, columns = ['state', 'county', 'provider', 'plan_name', 'file', 'text'])

### Load Target

In [196]:
benefits_grid = pd.read_csv('Benefits Grid.csv')

In [197]:
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})
benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace(' ', '')

### Join Documents to Target

In [199]:
plan_list = benefits_grid[benefits_grid['contract_plan'] != 'EOC']['contract_plan']

In [200]:
def find_contract_plan(file_name):
    for plan in plan_list:
        if plan in file_name:
            return plan

In [201]:
documents['contract_plan_file'] = documents.apply(lambda row : find_contract_plan(row['file']), axis = 1)
documents['contract_plan_text'] = documents.apply(lambda row : find_contract_plan(row['text']), axis = 1)
documents['contract_plan'] = documents.contract_plan_file.combine_first(documents.contract_plan_text)
documents = documents.drop(['contract_plan_text', 'contract_plan_file'], axis = 1)

In [202]:
documents.head()

Unnamed: 0,state,county,provider,plan_name,file,text,contract_plan
0,Illinois,IL - Chicago,UnitedHealthcare,NEW $0 HMO-POS (H2802-054-000),H2802-054-000_UHC_IL_Chicago_HMOPOS_BenefitHig...,Benefit Highlights\nAARP® Medicare Advantage P...,
1,Illinois,IL - Chicago,UnitedHealthcare,NEW $0 HMO-POS (H2802-054-000),H2802-054-000_UHC_IL_Chicago_HMOPOS_VendorShee...,Vendor Information\nAARP® Medicare Advantage P...,
2,Illinois,IL - Chicago,UnitedHealthcare,NEW $0 HMO-POS (H2802-054-000),H2802-054-000_UHC_IL_Chicago_HMOPOS_SOB.pdf,Summary of\nBenefits 2023\nAARP® Medicare Adva...,
3,Illinois,IL - Chicago,UnitedHealthcare,Access $0 HMO-POS (H2802-024-000),H2802-024-000_UHC_IL_Chicago_Access_BenefitHig...,Benefit Highlights\nAARP® Medicare Advantage A...,
4,Illinois,IL - Chicago,UnitedHealthcare,Access $0 HMO-POS (H2802-024-000),H2802-024-000_UHC_IL_Chicago_Access_SOB.pdf,Summary of\nBenefits 2023\nAARP® Medicare Adva...,


In [220]:
dataset = pd.merge(benefits_grid, documents, how = 'inner', on = ['contract_plan'])

### Clean Data

In [224]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]','', text)
    text = re.sub(r'\S*@\S*\s*','', text)
    text = text.replace('\n',' ')
    text = "".join([char for char in text if char not in string.punctuation])
    
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    cleaned_words = [word for word in tokens if word not in stop_words]

    cleaned_text = ' '.join(cleaned_words)
    
    return cleaned_text

In [225]:
dataset['text_cleaned'] = dataset.apply(lambda row : clean_text(row['text']), axis = 1)

In [226]:
eoc_dataset = dataset[dataset['file'].str.contains('EOC')]
eoc_dataset = eoc_dataset.drop('text', axis = 1)

In [227]:
eoc_dataset

Unnamed: 0,County,Provider,contract_plan,PLAN_CATEGORY,Medicaid Coverage Categories (For D-SNP plans only),Enrolls All Duals (Y/N),"Types of Chronic Conditions (C-SNP only) (Diabetes, Coronary Artery Diseases, Cardiovascular Disorders, COPD, ESRD)",VBID (Y/N),Annual Max,Annual Max - Shared Allowance (Y/N),...,Behavioral Health Vendor,DME Vendor,Nurse Line Vendor,Nurse Line 24/7 (Y/N),state,county,provider,plan_name,file,text_cleaned
4,CO: Denver,"Kaiser Foundation Health Plan, Inc.",H0630-014-000,D-SNP HMO,UNK,UNK,,N,"$1,000",Y,...,UNK,UNK,UNK,Y,Colorado,CO - Denver,Kaiser (Additional Pull 12-19),DSNP Denver,H0630-014-000_Kaiser_CO_Denver_DSNPHMO_EOC.pdf,h063023002dbc pbp 014 omb approval 09381051 ex...
9,CO: Denver Metro,"UnitedHealth Group, Inc.",H0609-048-000,Zero MAPD HMO,,,,,"$2,000",Y,...,UNK,UNK,NurseLine,Y,Colorado,CO - Denver,UnitedHealthcare,$0 HMO-POS Plan 1 (H0609-048-000),H0609-048-000_UHC_CO_Denver_Plan1HMOPOS_EOC.pdf,evidence coverage 2023 aarp medicare advantage...
12,CO: Denver Metro,"UnitedHealth Group, Inc.",H2577-002-000,Zero MAPD PPO,,,,,$500,Y,...,UNK,UNK,NurseLine,Y,Colorado,CO - Denver,UnitedHealthcare,$0 PPO (H2577-002-000),H2577-002-000_UHC_CO_Denver_ZeroPPO_EOC.pdf,evidence coverage 2023 aarp medicare advantage...
18,CO: El Paso/Teller,"UnitedHealth Group, Inc.",H2577-001-001,Zero MAPD PPO,,,,,"$1,500 (rider)",Y,...,UNK,UNK,NurseLine,Y,Colorado,CO - El Paso-Teller,UnitedHealthcare,$0 PPO (H2577-001-001),H2577-001-001_UHC_CO_ElPasoTeller_ZeroPPO_EOC.pdf,evidence coverage 2023 aarp medicare advantage...
23,FL: Daytona,"UnitedHealth Group, Inc.",H1045-039-000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,,Y,"$4,000",Y,...,UNK,UNK,NurseLine,Y,Florida,FL - Manatee,UnitedHealthcare,LP HMO-POS D-SNP (H1045-039-000),H1045-039-000_UHC_FL_Manatee_LPDSNP_EOC.pdf,evidence coverage 2023 unitedhealthcare dual c...
27,FL: Jacksonville,"UnitedHealth Group, Inc.",H1045-039-000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,,Y,"$4,000",Y,...,UNK,UNK,NurseLine,Y,Florida,FL - Manatee,UnitedHealthcare,LP HMO-POS D-SNP (H1045-039-000),H1045-039-000_UHC_FL_Manatee_LPDSNP_EOC.pdf,evidence coverage 2023 unitedhealthcare dual c...
31,FL: LMS,"UnitedHealth Group, Inc.",H1045-039-000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,,Y,"$4,000",Y,...,UNK,UNK,NurseLine,Y,Florida,FL - Manatee,UnitedHealthcare,LP HMO-POS D-SNP (H1045-039-000),H1045-039-000_UHC_FL_Manatee_LPDSNP_EOC.pdf,evidence coverage 2023 unitedhealthcare dual c...
35,FL: Manatee,"UnitedHealth Group, Inc.",H1045-039-000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,,Y,"$4,000",Y,...,UNK,UNK,NurseLine,Y,Florida,FL - Manatee,UnitedHealthcare,LP HMO-POS D-SNP (H1045-039-000),H1045-039-000_UHC_FL_Manatee_LPDSNP_EOC.pdf,evidence coverage 2023 unitedhealthcare dual c...
39,FL: Orlando,"UnitedHealth Group, Inc.",H1045-039-000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,,,"$4,000",Y,...,UNK,UNK,NurseLine,Y,Florida,FL - Manatee,UnitedHealthcare,LP HMO-POS D-SNP (H1045-039-000),H1045-039-000_UHC_FL_Manatee_LPDSNP_EOC.pdf,evidence coverage 2023 unitedhealthcare dual c...
43,FL: Tampa,"UnitedHealth Group, Inc.",H1045-039-000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,,Y (Food OTC Utility Credit & Part D drugs),"$4,000",Y,...,UNK,UNK,NurseLine,Y,Florida,FL - Manatee,UnitedHealthcare,LP HMO-POS D-SNP (H1045-039-000),H1045-039-000_UHC_FL_Manatee_LPDSNP_EOC.pdf,evidence coverage 2023 unitedhealthcare dual c...


### Train Test Split

In [238]:
vbid_df = eoc_dataset[['state', 'county', 'provider', 'contract_plan', 'text_cleaned', 'VBID (Y/N)']]
vbid_df = vbid_df.rename(columns = {'VBID (Y/N)': 'vbid'})
vbid_df = vbid_df[vbid_df['vbid'].notna()]
vbid_df = vbid_df.replace(to_replace = 'Y (Food OTC Utility Credit & Part D drugs)', value = 'Y') 
train_vbid_df, test_vbid_df = train_test_split(vbid_df, test_size = 0.2, stratify = vbid_df['vbid'], random_state = 13)

In [233]:
dataset_file = open('train_vbid_df.pkl', 'ab')
pickle.dump(train_vbid_df, dataset_file)
dataset_file.close()

dataset_file = open('test_vbid_df.pkl', 'ab')
pickle.dump(test_vbid_df, dataset_file)
dataset_file.close()

In [241]:
otc_rollover_df = eoc_dataset[['state', 'county', 'provider', 'contract_plan', 'text_cleaned', 'OTC Rollover (Y/N)']]
otc_rollover_df = otc_rollover_df.rename(columns = {'OTC Rollover (Y/N)': 'otc_rollover'})
otc_rollover_df = otc_rollover_df[otc_rollover_df['otc_rollover'].notna()]
train_otc_rollover_df, test_otc_rollover_df = train_test_split(otc_rollover_df, test_size = 0.2, random_state = 13)