### Import Libraries

In [1]:
from nltk.corpus import stopwords
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
import string
import fitz
import pickle
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_auc_score, roc_curve

### Load Data

In [2]:
directory = os.listdir('2023 Competitor Docs')
data = []

for state_folder in directory:
    state_path = '2023 Competitor Docs/' + state_folder
    state_directory = os.listdir(state_path)

    for file in state_directory:
        file_path = state_path + '/' + file
        pdf_file = fitz.open(file_path)

        text = ''

        for page in pdf_file:
            extracted_text = page.get_text()
            text += extracted_text + '\n\n'
                    
        data.append([state_folder, file, text])

documents = pd.DataFrame(data, columns = ['state', 'file', 'text'])

In [18]:
#Previous code
# directory = os.listdir('2023 Competitor Docs')
# data = []

# for state_folder in directory:
#     state_path = '2023 Competitor Docs/' + state_folder
#     state_directory = os.listdir(state_path)

#     for county_folder in state_directory:
#         county_path = state_path + '/' + county_folder
#         county_directory = os.listdir(county_path)

#         for provider in county_directory:
#             provider_path = county_path + '/' + provider
#             provider_directory = os.listdir(provider_path)

#             for plan_name in provider_directory:
#                 plan_path = provider_path + '/' + plan_name
#                 plan_directory = os.listdir(plan_path)

#                 for file in plan_directory:
#                     file_path = plan_path + '/' + file
#                     pdf_file = fitz.open(file_path)

#                     text = ''

#                     for page in pdf_file:
#                         extracted_text = page.get_text()
#                         text += extracted_text + '\n\n'
                    
#                     data.append([state_folder, county_folder, provider, plan_name, file, text])

# documents = pd.DataFrame(data, columns = ['state', 'county', 'provider', 'plan_name', 'file', 'text'])

In [3]:
#Strip spaces and dashes from file names
documents['file'] = documents['file'].str.replace('-', '') #strip dashes
documents['file'] = documents['file'].str.replace(' ', '') #strip spaces
documents['file'] = documents['file'].str.replace('_', '') #strip underscores

In [4]:
documents

Unnamed: 0,state,file,text
0,Alabama,AetnaMedicareEaglePlan(PPOH5521‑229)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare E...
1,Alabama,AetnaMedicareFreedomPlan(PPOH5521‑227)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare F...
2,Alabama,AetnaALBirminghamFreedom(PPO)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare F...
3,Alabama,AetnaALHuntsvilleDualPreferred(DSNP)EOC(1).pdf,2023 Evidence of Coverage for Aetna Medicare D...
4,Alabama,AetnaALHuntsvilleDualPreferred(DSNP)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare D...
...,...,...,...
1791,Texas,WellcareNoPremiumOpen(PPO)EOC.pdf,H7323_009_2023_TX_EOC_PPO_106200E_C \nOMB Appr...
1792,Texas,WellcareNoPremiumRxPlusOpen(PPO)EOC.pdf,H7323_006_2023_TX_EOC_PPO_106222E_C \nOMB Appr...
1793,Texas,WellcarePatriotNoPremium(HMO)EOC.pdf,H5294_014_2023_TX_EOC_HMAO_106248E_C \nOMB App...
1794,Texas,WellcareTexanPlusNoPremium(HMO)EOC(1).pdf,H0174_002_2023_TX_EOC_HMAPD_106284E_C \nOMB Ap...


In [5]:
#Count distinct file names
print(len(pd.unique(documents['file'])))

1762


### Load and Clean Targets

In [6]:
benefits_grid = pd.read_csv('Benefits Grid.csv')

In [7]:
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})
benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace(' ', '')
benefits_grid['contract_plan'] = benefits_grid['contract_plan'].str.replace('-', '') #strip dashes

In [8]:
#Clean Implant coverage target
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.replace('with rider?', 'Y') #Change with rider
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.strip() #Strip spaces
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.replace('?', '') #Strip ?
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.replace('Unknown', 'N') #Change Unknown to N
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].astype(str).str[0] #Get only first character

In [9]:
#Clean Root Canal coverage target
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.replace('with rider?', 'Y') #Change with rider
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.strip() #Strip spaces
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.replace('?', '') #Strip ?
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.replace('Unknown', 'N') #Change Unknown to N
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].astype(str).str[0] #Get only first character

In [10]:
#Healthy food rollover target
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.strip() #Strip spaces
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.replace('UNK', 'N')  #Change Unknown to N
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.replace('N/A', 'N')  #Change N/A to N
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.replace('NC', 'N')  #Change NC to N
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].astype(str).str[0] #Get only first character

In [11]:
#OTC rollover target
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.strip() #Strip spaces
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.replace('UNK', 'N')  #Change Unknown to N
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.replace('N/A', 'N')  #Change N/A to N
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.replace('NC', 'N')  #Change NC to N
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].astype(str).str[0] #Get only first character

In [12]:
benefits_grid = benefits_grid.replace(np.nan, 'N')

In [13]:
benefits_grid['Implant Coverage (Y/N)'] = benefits_grid['Implant Coverage (Y/N)'].str.upper()
benefits_grid['Root Canal Coverage (Y/N)'] = benefits_grid['Root Canal Coverage (Y/N)'].str.upper()
benefits_grid['Healthy Food Rollover'] = benefits_grid['Healthy Food Rollover'].str.upper()
benefits_grid['OTC Rollover (Y/N)'] = benefits_grid['OTC Rollover (Y/N)'].str.upper()

### Join Documents to Target & Clean Text

In [14]:
plan_list = benefits_grid[benefits_grid['contract_plan'] != 'EOC']['contract_plan']

In [15]:
def find_contract_plan(file_name):
    for plan in plan_list:
        if plan in file_name:
            return plan

In [16]:
documents['contract_plan_file'] = documents.apply(lambda row : find_contract_plan(row['file']), axis = 1)
documents['contract_plan_text'] = documents.apply(lambda row : find_contract_plan(row['text']), axis = 1)
documents['contract_plan'] = documents.contract_plan_file.combine_first(documents.contract_plan_text)
#documents = documents.drop(['contract_plan_text', 'contract_plan_file'], axis = 1)

In [17]:
documents

Unnamed: 0,state,file,text,contract_plan_file,contract_plan_text,contract_plan
0,Alabama,AetnaMedicareEaglePlan(PPOH5521‑229)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare E...,,,
1,Alabama,AetnaMedicareFreedomPlan(PPOH5521‑227)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare F...,,,
2,Alabama,AetnaALBirminghamFreedom(PPO)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare F...,,,
3,Alabama,AetnaALHuntsvilleDualPreferred(DSNP)EOC(1).pdf,2023 Evidence of Coverage for Aetna Medicare D...,,,
4,Alabama,AetnaALHuntsvilleDualPreferred(DSNP)EOC.pdf,2023 Evidence of Coverage for Aetna Medicare D...,,,
...,...,...,...,...,...,...
1791,Texas,WellcareNoPremiumOpen(PPO)EOC.pdf,H7323_009_2023_TX_EOC_PPO_106200E_C \nOMB Appr...,,,
1792,Texas,WellcareNoPremiumRxPlusOpen(PPO)EOC.pdf,H7323_006_2023_TX_EOC_PPO_106222E_C \nOMB Appr...,,,
1793,Texas,WellcarePatriotNoPremium(HMO)EOC.pdf,H5294_014_2023_TX_EOC_HMAO_106248E_C \nOMB App...,,,
1794,Texas,WellcareTexanPlusNoPremium(HMO)EOC(1).pdf,H0174_002_2023_TX_EOC_HMAPD_106284E_C \nOMB Ap...,,,


In [18]:
#Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]','', text)
    text = re.sub(r'\S*@\S*\s*','', text)
    text = text.replace('\n',' ')
    text = "".join([char for char in text if char not in string.punctuation])
    
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    cleaned_words = [word for word in tokens if word not in stop_words]

    cleaned_text = ' '.join(cleaned_words)
    
    return cleaned_text

In [19]:
documents['text_cleaned'] = documents.apply(lambda row : clean_text(row['text']), axis = 1)

In [20]:
#Export files
#files = documents[['state', 'file','contract_plan_file','contract_plan_text','contract_plan']]
#files.to_csv('files.csv', index = False)

In [21]:
#Merge document data with benefits grid
dataset = pd.merge(benefits_grid, documents, how = 'inner', on = ['contract_plan'])

In [22]:
dataset

Unnamed: 0,County,Provider,contract_plan,PLAN_CATEGORY,Medicaid Coverage Categories (For D-SNP plans only),Enrolls All Duals (Y/N),"Types of Chronic Conditions (C-SNP only) (Diabetes, Coronary Artery Diseases, Cardiovascular Disorders, COPD, ESRD)",VBID (Y/N),Annual Max,Annual Max - Shared Allowance (Y/N),...,Behavioral Health Vendor,DME Vendor,Nurse Line Vendor,Nurse Line 24/7 (Y/N),state,file,text,contract_plan_file,contract_plan_text,text_cleaned
0,AL: Birmingham,"UnitedHealth Group, Inc.",H0432009000,D-SNP HMO,"FBDE, QDWI, QI, QMB, QMB+, SLMB, SLMB+",Y,N,Y,"$3,000",Y,...,UNK,UNK,NurseLine,Y,Alabama,H0432009000UHCALBirminghamFullDualEOC.pdf,Evidence of \nCoverage 2023\nUnitedHealthcare ...,H0432009000,,evidence coverage 2023 unitedhealthcare dual c...
1,AL: Huntsville,"UnitedHealth Group, Inc.",H0432009000,D-SNP HMO,"QMB, QMB+, SLMB, SLMB+, QI, QDWI, FBDE",Y,N,Y,"$3,000",Y,...,UNK,UNK,NurseLine,Y,Alabama,H0432009000UHCALBirminghamFullDualEOC.pdf,Evidence of \nCoverage 2023\nUnitedHealthcare ...,H0432009000,,evidence coverage 2023 unitedhealthcare dual c...
2,AL: Birmingham,"UnitedHealth Group, Inc.",H2802044000,D-SNP HMO,"QMB+, QMB, QDWI, QI, SLMB+, SLMB, FBDE",Y,N,Y,"$2,000",Y,...,UNK,UNK,NurseLine,Y,Alabama,H2802044000UHCALBirminghamPartialDualEOC.pdf,Evidence of \nCoverage 2023\nUnitedHealthcare ...,H2802044000,,evidence coverage 2023 unitedhealthcare dual c...
3,AL: Birmingham,"UnitedHealth Group, Inc.",H2802041000,Zero MAPD HMO,N,N,N,N,"$2,000",Y,...,UNK,UNK,NurseLine,Y,Alabama,H2802041000UHCALBirminghamPlan3HMOEOC.pdf,Evidence of \nCoverage 2023\nAARP® Medicare Ad...,H2802041000,,evidence coverage 2023 aarp medicare advantage...
4,AL: Birmingham,"Triton Health Systems, LLC",H0154015001,Zero MAPD HMO,N,N,N,N,"$1,000",Y,...,UNK,UNK,Viva Health Contractor,Y,Alabama,H0154015001VIvaALBirminghamPlusANOC(1).pdf,VIVA MEDICARE Plus Annual Notice of Changes fo...,H0154015001,,viva medicare plus annual notice changes 2023 ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,TX: Houston,Centene Corporation,H0174009000,Premium HMO,N,N,N,N,"$3,000",Y,...,N,N,UNK,Y,Texas,WellcareAssist(HMO)EOC.pdf,H0174_009_2023_TX_EOC_HMAPD_105873E_C \nOMB Ap...,,H0174009000,h01740092023txeochmapd105873ec omb approval 09...
586,TX: Houston,Memorial Hermann Health System,H7115003000,Premium HMO,N,N,N,N,"$2,500",Y,...,N,N,UNK,Y,Texas,H7115003000MemorialHermannAdvantagePlusHMOEOC.pdf,Memorial Hermann Advantage Plus HMO\n2023 Evid...,H7115003000,,memorial hermann advantage plus hmo 2023 evide...
587,TX: Houston,"UnitedHealth Group, Inc.",H0332008000,Premium HMO,N,N,N,N,N/A?,?,...,N,N,UNK,Y,Texas,H0332008000GoldCommunityHMOPOSEOC.pdf,H0332_001EOC22_C\n1-866-535-8343 (TTY: 711) ...,H0332008000,,h0332001eoc22c 18665358343 tty 711 wwwkelseyca...
588,TX: San Antonio,"UnitedHealth Group, Inc.",H1278005000,Zero MAPD PPO,N,N,N,N,"$500/$1,500 (optional)",Y,...,N,N,NurseLine,Y,Texas,H1278005000UHCTXSanAntonioPPOEOC.pdf,Evidence of \nCoverage 2023\nAARP® Medicare Ad...,H1278005000,,evidence coverage 2023 aarp medicare advantage...


In [23]:
#Count distinct matches
print(len(pd.unique(dataset['contract_plan'])))

343


In [24]:
#Export matches
#matches = dataset[['County','Provider','contract_plan','state', 'file']]
#matches.to_csv('matches.csv', index = False)

In [25]:
#matches

In [26]:
benefits_grid = benefits_grid.rename(columns = {'CONTRACT_PLAN': 'contract_plan'})

In [27]:
eoc_dataset = dataset[dataset['file'].str.contains('EOC')]
eoc_dataset = eoc_dataset.drop('text', axis = 1)

In [28]:
#eoc_sample = vbid_df[['state', 'County', 'Provider', 'contract_plan', 'vbid']]
#eoc_sample.to_csv('eoc.csv', index = False)

### Train Test Split

In [29]:
#Create smaller dataset sample to test with
#eoc_dataset = eoc_dataset.sample(n=161)

In [29]:
#Create dataset with targets
model_df = eoc_dataset[['County', 'Provider', 'contract_plan', 'text_cleaned', 'Implant Coverage (Y/N)','Root Canal Coverage (Y/N)',
                 'Healthy Food Rollover','OTC Rollover (Y/N)']]

In [30]:
#Change Y and N to 0 and 1
model_df['Implant Coverage (Y/N)'] = model_df['Implant Coverage (Y/N)'].map({'Y': 1, 'N': 0})
model_df['Root Canal Coverage (Y/N)'] = model_df['Root Canal Coverage (Y/N)'].map({'Y': 1, 'N': 0})
model_df['Healthy Food Rollover'] = model_df['Healthy Food Rollover'].map({'Y': 1, 'N': 0})
model_df['OTC Rollover (Y/N)'] = model_df['OTC Rollover (Y/N)'].map({'Y': 1, 'N': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['Implant Coverage (Y/N)'] = model_df['Implant Coverage (Y/N)'].map({'Y': 1, 'N': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['Root Canal Coverage (Y/N)'] = model_df['Root Canal Coverage (Y/N)'].map({'Y': 1, 'N': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['H

In [31]:
train_df, test_df = train_test_split(model_df, test_size = 0.2, random_state = 11)

In [32]:
#test_df

In [33]:
#dataset_file = open('train_vbid_df.pkl', 'ab')
#pickle.dump(train_vbid_df, dataset_file)
#dataset_file.close()

#dataset_file = open('test_vbid_df.pkl', 'ab')
#pickle.dump(test_vbid_df, dataset_file)
#dataset_file.close()

### 1. Predicting Implant Coverage

In [35]:
X_train = train_df['text_cleaned']
y_train = train_df['Implant Coverage (Y/N)']

X_test = test_df['text_cleaned']
y_test = test_df['Implant Coverage (Y/N)']

In [36]:
# Setup SVM using CountVectorizer, TFIDF, unigrams, bigrams, & different hyperparameters

#parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
#              'mnb__fit_prior': [True, False]}

#

parameters = {
    'svc__C': [0.01, 0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf','sigmoid'],
    'svc__gamma': ['scale', 'auto']
}

svc_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('svc', SVC())])
svc_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('svc', SVC())])
svc_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('svc', SVC())])
svc_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('svc', SVC())])

svc_cv_uni_grid = GridSearchCV(svc_cv_uni,
                               parameters,
                               cv = 4)
svc_cv_bi_grid = GridSearchCV(svc_cv_bi,
                              parameters,
                              cv = 4)
svc_tfidf_uni_grid = GridSearchCV(svc_tfidf_uni,
                                  parameters,
                                  cv = 4)
svc_tfidf_bi_grid = GridSearchCV(svc_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train SVM
svc_cv_uni_grid.fit(X_train, y_train)
svc_cv_bi_grid.fit(X_train, y_train)
svc_tfidf_uni_grid.fit(X_train, y_train)
svc_tfidf_bi_grid.fit(X_train, y_train)

# Best train scores
svc_cv_uni_grid_score = svc_cv_uni_grid.best_score_
svc_cv_bi_grid_score = svc_cv_bi_grid.best_score_
svc_tfidf_uni_grid_score = svc_tfidf_uni_grid.best_score_
svc_tfidf_bi_grid_score = svc_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score)
print('svc_cv_bi:', svc_cv_bi_grid_score)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score)

# svc best params
svc_cv_uni_grid_params = svc_cv_uni_grid.best_params_
svc_cv_bi_grid_params = svc_cv_bi_grid.best_params_
svc_tfidf_uni_grid_params = svc_tfidf_uni_grid.best_params_
svc_tfidf_bi_grid_params = svc_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('svc_cv_uni:', svc_cv_uni_grid_params)
print('svc_cv_bi:', svc_cv_bi_grid_params)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_params)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_params)

# Run model on test
test_preds_cv_uni = svc_cv_uni_grid.predict(X_test)
test_preds_cv_bi = svc_cv_bi_grid.predict(X_test)
test_preds_tfidf_uni = svc_tfidf_uni_grid.predict(X_test)
test_preds_tfidf_bi = svc_tfidf_bi_grid.predict(X_test)

# svc best test scores
svc_cv_uni_grid_score_test = svc_cv_uni_grid.score(X_test, y_test)
svc_cv_bi_grid_score_test = svc_cv_bi_grid.score(X_test, y_test)
svc_tfidf_uni_grid_score_test = svc_tfidf_uni_grid.score(X_test, y_test)
svc_tfidf_bi_grid_score_test = svc_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score_test)
print('svc_cv_bi:', svc_cv_bi_grid_score_test)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score_test)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score_test)

#Calculate metrics
test_accuracy_cv_uni = accuracy_score(y_test, test_preds_cv_uni)
test_accuracy_cv_bi = accuracy_score(y_test, test_preds_cv_bi)
test_accuracy_tfidf_uni = accuracy_score(y_test, test_preds_tfidf_uni)
test_accuracy_tfidf_bi = accuracy_score(y_test, test_preds_tfidf_bi)
test_prec_cv_uni = precision_score(y_test, test_preds_cv_uni)
test_prec_cv_bi = precision_score(y_test, test_preds_cv_bi)
test_prec_tfidf_uni = precision_score(y_test, test_preds_tfidf_uni)
test_prec_tfidf_bi = precision_score(y_test, test_preds_tfidf_bi)
test_recall_cv_uni = recall_score(y_test, test_preds_cv_uni)
test_recall_cv_bi = recall_score(y_test, test_preds_cv_bi)
test_recall_tfidf_uni = recall_score(y_test, test_preds_tfidf_uni)
test_recall_tfidf_bi = recall_score(y_test, test_preds_tfidf_bi)

#Print metrics
print("test_accuracy_cv_uni: ", test_accuracy_cv_uni)
print("test_accuracy_cv_bi: ", test_accuracy_cv_bi)
print("test_accuracy_tfidf_uni: ", test_accuracy_tfidf_uni)
print("test_accuracy_tfidf_bi: ", test_accuracy_tfidf_bi)
print("test_precision_cv_uni: ", test_prec_cv_uni)
print("test_precision_cv_bi: ", test_prec_cv_bi)
print("test_precision_tfidf_uni: ", test_prec_tfidf_uni)
print("test_precision_tfidf_bi: ", test_prec_tfidf_bi)
print("test_recall_cv_uni: ", test_recall_cv_uni)
print("test_recall_cv_bi: ", test_recall_cv_bi)
print("test_recall_tfidf_uni: ", test_recall_tfidf_uni)
print("test_recall_tfidf_bi: ", test_recall_tfidf_bi)

Best GridSearch Scores
svc_cv_uni: 0.8857758620689655
svc_cv_bi: 0.9008620689655171
svc_tfidf_uni: 0.8900862068965518
svc_tfidf_bi: 0.8857758620689655
Best GridSearch Params
svc_cv_uni: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_cv_bi: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_tfidf_uni: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
svc_tfidf_bi: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Test Scores
svc_cv_uni: 0.8888888888888888
svc_cv_bi: 0.9316239316239316
svc_tfidf_uni: 0.9658119658119658
svc_tfidf_bi: 0.9487179487179487
test_accuracy_cv_uni:  0.8888888888888888
test_accuracy_cv_bi:  0.9316239316239316
test_accuracy_tfidf_uni:  0.9658119658119658
test_accuracy_tfidf_bi:  0.9487179487179487
test_precision_cv_uni:  0.8245614035087719
test_precision_cv_bi:  0.9038461538461539
test_precision_tfidf_uni:  0.9423076923076923
test_precision_tfidf_bi:  0.9074074074074074
test_recall_cv_uni:  0.94
test_re

### 2. Predicting Root Canal Coverage

In [43]:
X_train = train_df['text_cleaned']
y_train = train_df['Root Canal Coverage (Y/N)']

X_test = test_df['text_cleaned']
y_test = test_df['Root Canal Coverage (Y/N)']

In [44]:
# Setup SVM using CountVectorizer, TFIDF, unigrams, bigrams, & different hyperparameters

#parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
#              'mnb__fit_prior': [True, False]}

#

parameters = {
    'svc__C': [0.01, 0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf','sigmoid'],
    'svc__gamma': ['scale', 'auto']
}

svc_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('svc', SVC())])
svc_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('svc', SVC())])
svc_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('svc', SVC())])
svc_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('svc', SVC())])

svc_cv_uni_grid = GridSearchCV(svc_cv_uni,
                               parameters,
                               cv = 4)
svc_cv_bi_grid = GridSearchCV(svc_cv_bi,
                              parameters,
                              cv = 4)
svc_tfidf_uni_grid = GridSearchCV(svc_tfidf_uni,
                                  parameters,
                                  cv = 4)
svc_tfidf_bi_grid = GridSearchCV(svc_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train SVM
svc_cv_uni_grid.fit(X_train, y_train)
svc_cv_bi_grid.fit(X_train, y_train)
svc_tfidf_uni_grid.fit(X_train, y_train)
svc_tfidf_bi_grid.fit(X_train, y_train)

# Best train scores
svc_cv_uni_grid_score = svc_cv_uni_grid.best_score_
svc_cv_bi_grid_score = svc_cv_bi_grid.best_score_
svc_tfidf_uni_grid_score = svc_tfidf_uni_grid.best_score_
svc_tfidf_bi_grid_score = svc_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score)
print('svc_cv_bi:', svc_cv_bi_grid_score)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score)

# svc best params
svc_cv_uni_grid_params = svc_cv_uni_grid.best_params_
svc_cv_bi_grid_params = svc_cv_bi_grid.best_params_
svc_tfidf_uni_grid_params = svc_tfidf_uni_grid.best_params_
svc_tfidf_bi_grid_params = svc_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('svc_cv_uni:', svc_cv_uni_grid_params)
print('svc_cv_bi:', svc_cv_bi_grid_params)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_params)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_params)

# Run model on test
test_preds_cv_uni = svc_cv_uni_grid.predict(X_test)
test_preds_cv_bi = svc_cv_bi_grid.predict(X_test)
test_preds_tfidf_uni = svc_tfidf_uni_grid.predict(X_test)
test_preds_tfidf_bi = svc_tfidf_bi_grid.predict(X_test)

# svc best test scores
svc_cv_uni_grid_score_test = svc_cv_uni_grid.score(X_test, y_test)
svc_cv_bi_grid_score_test = svc_cv_bi_grid.score(X_test, y_test)
svc_tfidf_uni_grid_score_test = svc_tfidf_uni_grid.score(X_test, y_test)
svc_tfidf_bi_grid_score_test = svc_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score_test)
print('svc_cv_bi:', svc_cv_bi_grid_score_test)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score_test)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score_test)

#Calculate metrics
test_accuracy_cv_uni = accuracy_score(y_test, test_preds_cv_uni)
test_accuracy_cv_bi = accuracy_score(y_test, test_preds_cv_bi)
test_accuracy_tfidf_uni = accuracy_score(y_test, test_preds_tfidf_uni)
test_accuracy_tfidf_bi = accuracy_score(y_test, test_preds_tfidf_bi)
test_prec_cv_uni = precision_score(y_test, test_preds_cv_uni)
test_prec_cv_bi = precision_score(y_test, test_preds_cv_bi)
test_prec_tfidf_uni = precision_score(y_test, test_preds_tfidf_uni)
test_prec_tfidf_bi = precision_score(y_test, test_preds_tfidf_bi)
test_recall_cv_uni = recall_score(y_test, test_preds_cv_uni)
test_recall_cv_bi = recall_score(y_test, test_preds_cv_bi)
test_recall_tfidf_uni = recall_score(y_test, test_preds_tfidf_uni)
test_recall_tfidf_bi = recall_score(y_test, test_preds_tfidf_bi)

#Print metrics
print("test_accuracy_cv_uni: ", test_accuracy_cv_uni)
print("test_accuracy_cv_bi: ", test_accuracy_cv_bi)
print("test_accuracy_tfidf_uni: ", test_accuracy_tfidf_uni)
print("test_accuracy_tfidf_bi: ", test_accuracy_tfidf_bi)
print("test_precision_cv_uni: ", test_prec_cv_uni)
print("test_precision_cv_bi: ", test_prec_cv_bi)
print("test_precision_tfidf_uni: ", test_prec_tfidf_uni)
print("test_precision_tfidf_bi: ", test_prec_tfidf_bi)
print("test_recall_cv_uni: ", test_recall_cv_uni)
print("test_recall_cv_bi: ", test_recall_cv_bi)
print("test_recall_tfidf_uni: ", test_recall_tfidf_uni)
print("test_recall_tfidf_bi: ", test_recall_tfidf_bi)

Best GridSearch Scores
svc_cv_uni: 0.859375
svc_cv_bi: 0.859375
svc_tfidf_uni: 0.8515625
svc_tfidf_bi: 0.8828125
Best GridSearch Params
svc_cv_uni: {'svc__C': 1, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}
svc_cv_bi: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'sigmoid'}
svc_tfidf_uni: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_tfidf_bi: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'sigmoid'}
Test Scores
svc_cv_uni: 0.9393939393939394
svc_cv_bi: 0.9393939393939394
svc_tfidf_uni: 0.9393939393939394
svc_tfidf_bi: 0.9696969696969697
test_accuracy_cv_uni:  0.9393939393939394
test_accuracy_cv_bi:  0.9393939393939394
test_accuracy_tfidf_uni:  0.9393939393939394
test_accuracy_tfidf_bi:  0.9696969696969697
test_precision_cv_uni:  0.9393939393939394
test_precision_cv_bi:  0.967741935483871
test_precision_tfidf_uni:  0.9393939393939394
test_precision_tfidf_bi:  0.96875
test_recall_cv_uni:  1.0
test_recall_cv_bi:  0.967741935483871
test_recall_tfidf_uni: 

### 3. Predicting Healthy Food Rollover

In [45]:
X_train = train_df['text_cleaned']
y_train = train_df['Healthy Food Rollover']

X_test = test_df['text_cleaned']
y_test = test_df['Healthy Food Rollover']

In [46]:
# Setup SVM using CountVectorizer, TFIDF, unigrams, bigrams, & different hyperparameters

#parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
#              'mnb__fit_prior': [True, False]}

#

parameters = {
    'svc__C': [0.01, 0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf','sigmoid'],
    'svc__gamma': ['scale', 'auto']
}

svc_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('svc', SVC())])
svc_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('svc', SVC())])
svc_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('svc', SVC())])
svc_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('svc', SVC())])

svc_cv_uni_grid = GridSearchCV(svc_cv_uni,
                               parameters,
                               cv = 4)
svc_cv_bi_grid = GridSearchCV(svc_cv_bi,
                              parameters,
                              cv = 4)
svc_tfidf_uni_grid = GridSearchCV(svc_tfidf_uni,
                                  parameters,
                                  cv = 4)
svc_tfidf_bi_grid = GridSearchCV(svc_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train SVM
svc_cv_uni_grid.fit(X_train, y_train)
svc_cv_bi_grid.fit(X_train, y_train)
svc_tfidf_uni_grid.fit(X_train, y_train)
svc_tfidf_bi_grid.fit(X_train, y_train)

# Best train scores
svc_cv_uni_grid_score = svc_cv_uni_grid.best_score_
svc_cv_bi_grid_score = svc_cv_bi_grid.best_score_
svc_tfidf_uni_grid_score = svc_tfidf_uni_grid.best_score_
svc_tfidf_bi_grid_score = svc_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score)
print('svc_cv_bi:', svc_cv_bi_grid_score)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score)

# svc best params
svc_cv_uni_grid_params = svc_cv_uni_grid.best_params_
svc_cv_bi_grid_params = svc_cv_bi_grid.best_params_
svc_tfidf_uni_grid_params = svc_tfidf_uni_grid.best_params_
svc_tfidf_bi_grid_params = svc_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('svc_cv_uni:', svc_cv_uni_grid_params)
print('svc_cv_bi:', svc_cv_bi_grid_params)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_params)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_params)

# Run model on test
test_preds_cv_uni = svc_cv_uni_grid.predict(X_test)
test_preds_cv_bi = svc_cv_bi_grid.predict(X_test)
test_preds_tfidf_uni = svc_tfidf_uni_grid.predict(X_test)
test_preds_tfidf_bi = svc_tfidf_bi_grid.predict(X_test)

# svc best test scores
svc_cv_uni_grid_score_test = svc_cv_uni_grid.score(X_test, y_test)
svc_cv_bi_grid_score_test = svc_cv_bi_grid.score(X_test, y_test)
svc_tfidf_uni_grid_score_test = svc_tfidf_uni_grid.score(X_test, y_test)
svc_tfidf_bi_grid_score_test = svc_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score_test)
print('svc_cv_bi:', svc_cv_bi_grid_score_test)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score_test)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score_test)

#Calculate metrics
test_accuracy_cv_uni = accuracy_score(y_test, test_preds_cv_uni)
test_accuracy_cv_bi = accuracy_score(y_test, test_preds_cv_bi)
test_accuracy_tfidf_uni = accuracy_score(y_test, test_preds_tfidf_uni)
test_accuracy_tfidf_bi = accuracy_score(y_test, test_preds_tfidf_bi)
test_prec_cv_uni = precision_score(y_test, test_preds_cv_uni)
test_prec_cv_bi = precision_score(y_test, test_preds_cv_bi)
test_prec_tfidf_uni = precision_score(y_test, test_preds_tfidf_uni)
test_prec_tfidf_bi = precision_score(y_test, test_preds_tfidf_bi)
test_recall_cv_uni = recall_score(y_test, test_preds_cv_uni)
test_recall_cv_bi = recall_score(y_test, test_preds_cv_bi)
test_recall_tfidf_uni = recall_score(y_test, test_preds_tfidf_uni)
test_recall_tfidf_bi = recall_score(y_test, test_preds_tfidf_bi)

#Print metrics
print("test_accuracy_cv_uni: ", test_accuracy_cv_uni)
print("test_accuracy_cv_bi: ", test_accuracy_cv_bi)
print("test_accuracy_tfidf_uni: ", test_accuracy_tfidf_uni)
print("test_accuracy_tfidf_bi: ", test_accuracy_tfidf_bi)
print("test_precision_cv_uni: ", test_prec_cv_uni)
print("test_precision_cv_bi: ", test_prec_cv_bi)
print("test_precision_tfidf_uni: ", test_prec_tfidf_uni)
print("test_precision_tfidf_bi: ", test_prec_tfidf_bi)
print("test_recall_cv_uni: ", test_recall_cv_uni)
print("test_recall_cv_bi: ", test_recall_cv_bi)
print("test_recall_tfidf_uni: ", test_recall_tfidf_uni)
print("test_recall_tfidf_bi: ", test_recall_tfidf_bi)

Best GridSearch Scores
svc_cv_uni: 1.0
svc_cv_bi: 1.0
svc_tfidf_uni: 1.0
svc_tfidf_bi: 1.0
Best GridSearch Params
svc_cv_uni: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_cv_bi: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_tfidf_uni: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_tfidf_bi: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Test Scores
svc_cv_uni: 0.9696969696969697
svc_cv_bi: 0.9696969696969697
svc_tfidf_uni: 0.9696969696969697
svc_tfidf_bi: 0.9696969696969697
test_accuracy_cv_uni:  0.9696969696969697
test_accuracy_cv_bi:  0.9696969696969697
test_accuracy_tfidf_uni:  0.9696969696969697
test_accuracy_tfidf_bi:  0.9696969696969697
test_precision_cv_uni:  0.6666666666666666
test_precision_cv_bi:  0.6666666666666666
test_precision_tfidf_uni:  0.6666666666666666
test_precision_tfidf_bi:  0.6666666666666666
test_recall_cv_uni:  1.0
test_recall_cv_bi:  1.0
test_recall_tfidf_uni:  1.0
test_recall_t

### 4. Predicting OTC Benefit Rollover

In [47]:
X_train = train_df['text_cleaned']
y_train = train_df['OTC Rollover (Y/N)']

X_test = test_df['text_cleaned']
y_test = test_df['OTC Rollover (Y/N)']

In [48]:
# Setup SVM using CountVectorizer, TFIDF, unigrams, bigrams, & different hyperparameters

#parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
#              'mnb__fit_prior': [True, False]}

#

parameters = {
    'svc__C': [0.01, 0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf','sigmoid'],
    'svc__gamma': ['scale', 'auto']
}

svc_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('svc', SVC())])
svc_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('svc', SVC())])
svc_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('svc', SVC())])
svc_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('svc', SVC())])

svc_cv_uni_grid = GridSearchCV(svc_cv_uni,
                               parameters,
                               cv = 4)
svc_cv_bi_grid = GridSearchCV(svc_cv_bi,
                              parameters,
                              cv = 4)
svc_tfidf_uni_grid = GridSearchCV(svc_tfidf_uni,
                                  parameters,
                                  cv = 4)
svc_tfidf_bi_grid = GridSearchCV(svc_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train SVM
svc_cv_uni_grid.fit(X_train, y_train)
svc_cv_bi_grid.fit(X_train, y_train)
svc_tfidf_uni_grid.fit(X_train, y_train)
svc_tfidf_bi_grid.fit(X_train, y_train)

# Best train scores
svc_cv_uni_grid_score = svc_cv_uni_grid.best_score_
svc_cv_bi_grid_score = svc_cv_bi_grid.best_score_
svc_tfidf_uni_grid_score = svc_tfidf_uni_grid.best_score_
svc_tfidf_bi_grid_score = svc_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score)
print('svc_cv_bi:', svc_cv_bi_grid_score)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score)

# svc best params
svc_cv_uni_grid_params = svc_cv_uni_grid.best_params_
svc_cv_bi_grid_params = svc_cv_bi_grid.best_params_
svc_tfidf_uni_grid_params = svc_tfidf_uni_grid.best_params_
svc_tfidf_bi_grid_params = svc_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('svc_cv_uni:', svc_cv_uni_grid_params)
print('svc_cv_bi:', svc_cv_bi_grid_params)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_params)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_params)

# Run model on test
test_preds_cv_uni = svc_cv_uni_grid.predict(X_test)
test_preds_cv_bi = svc_cv_bi_grid.predict(X_test)
test_preds_tfidf_uni = svc_tfidf_uni_grid.predict(X_test)
test_preds_tfidf_bi = svc_tfidf_bi_grid.predict(X_test)

# svc best test scores
svc_cv_uni_grid_score_test = svc_cv_uni_grid.score(X_test, y_test)
svc_cv_bi_grid_score_test = svc_cv_bi_grid.score(X_test, y_test)
svc_tfidf_uni_grid_score_test = svc_tfidf_uni_grid.score(X_test, y_test)
svc_tfidf_bi_grid_score_test = svc_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('svc_cv_uni:', svc_cv_uni_grid_score_test)
print('svc_cv_bi:', svc_cv_bi_grid_score_test)
print('svc_tfidf_uni:', svc_tfidf_uni_grid_score_test)
print('svc_tfidf_bi:', svc_tfidf_bi_grid_score_test)

#Calculate metrics
test_accuracy_cv_uni = accuracy_score(y_test, test_preds_cv_uni)
test_accuracy_cv_bi = accuracy_score(y_test, test_preds_cv_bi)
test_accuracy_tfidf_uni = accuracy_score(y_test, test_preds_tfidf_uni)
test_accuracy_tfidf_bi = accuracy_score(y_test, test_preds_tfidf_bi)
test_prec_cv_uni = precision_score(y_test, test_preds_cv_uni)
test_prec_cv_bi = precision_score(y_test, test_preds_cv_bi)
test_prec_tfidf_uni = precision_score(y_test, test_preds_tfidf_uni)
test_prec_tfidf_bi = precision_score(y_test, test_preds_tfidf_bi)
test_recall_cv_uni = recall_score(y_test, test_preds_cv_uni)
test_recall_cv_bi = recall_score(y_test, test_preds_cv_bi)
test_recall_tfidf_uni = recall_score(y_test, test_preds_tfidf_uni)
test_recall_tfidf_bi = recall_score(y_test, test_preds_tfidf_bi)

#Print metrics
print("test_accuracy_cv_uni: ", test_accuracy_cv_uni)
print("test_accuracy_cv_bi: ", test_accuracy_cv_bi)
print("test_accuracy_tfidf_uni: ", test_accuracy_tfidf_uni)
print("test_accuracy_tfidf_bi: ", test_accuracy_tfidf_bi)
print("test_precision_cv_uni: ", test_prec_cv_uni)
print("test_precision_cv_bi: ", test_prec_cv_bi)
print("test_precision_tfidf_uni: ", test_prec_tfidf_uni)
print("test_precision_tfidf_bi: ", test_prec_tfidf_bi)
print("test_recall_cv_uni: ", test_recall_cv_uni)
print("test_recall_cv_bi: ", test_recall_cv_bi)
print("test_recall_tfidf_uni: ", test_recall_tfidf_uni)
print("test_recall_tfidf_bi: ", test_recall_tfidf_bi)

Best GridSearch Scores
svc_cv_uni: 0.9453125
svc_cv_bi: 0.953125
svc_tfidf_uni: 0.9375
svc_tfidf_bi: 0.953125
Best GridSearch Params
svc_cv_uni: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_cv_bi: {'svc__C': 0.01, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_tfidf_uni: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
svc_tfidf_bi: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Test Scores
svc_cv_uni: 0.9393939393939394
svc_cv_bi: 0.9393939393939394
svc_tfidf_uni: 0.9696969696969697
svc_tfidf_bi: 0.9393939393939394
test_accuracy_cv_uni:  0.9393939393939394
test_accuracy_cv_bi:  0.9393939393939394
test_accuracy_tfidf_uni:  0.9696969696969697
test_accuracy_tfidf_bi:  0.9393939393939394
test_precision_cv_uni:  0.3333333333333333
test_precision_cv_bi:  0.3333333333333333
test_precision_tfidf_uni:  0.5
test_precision_tfidf_bi:  0.3333333333333333
test_recall_cv_uni:  1.0
test_recall_cv_bi:  1.0
test_recall_tfidf_uni:  1.0
test_reca