# Changes from v16
* Combining ICD and CPT models, but still providing options to run everything quickly

# Next Steps
* Try some different models - supervised & unsupervised
* Hyperparameter tuning

# Notes

## Questions
* Run the data for each category with only no filter, one, and three CPT codes for each discharge summary
    * ANSWER: It performs better with only 1 CPT per note and 3 had a decrease of 3% in accuracy
* Does including all notes/CPT sections improve accuracy?
    * ANSWER: No, it decreased the accuracy
* Does including some sections improve accuracy when included with discharge summary?
    * No, accuracy went from 80% to 71%- at least for the E/M category
* Does accuracy improve when there are more notes per CPT code?
    * Yes
* What is the lowest threshold I can use without decreasing accuracy?
    * It seems like I don't need a threshold
* Does imbalance correction improve model accuracy?
    * Yes, it makes a huge difference
* Is undersampling or over-sampling a better method for imbalance correction?
    * Oversampling since the lowest records only contain one note - could also try SMOTE and see if that gives better results or not
* Use label encoder for the CPT codes
    * Does the accuracy improve when using labelencoder?
* Is limiting CPT codes to just one excluding CPT codes?
    * No
* Can I use the descriptions in the CPT table to help improve my analysis?
    * Yes, only for 94002 and 94003
* Do a greater variety of CPT scores improve CPT f-scores?
    * No, they slightly change, but probably only b/c what goes in train and test changes
* Does the accuracy improve when filtered to each CPT section individually?
    * Yes



# Next Steps

* Adjust model so it can be run in smaller chunks
* Check the accuracy for those CPT codes in v13 and see if accuracy decreased when looking at individual sections
    * Add option to run the code without stratifying by section
* Complete model to predict CPT and ICD codes with their probabilities
* Serialize the model to be used in streamlit
* Add loading statements with time it module times for how long each section takes to run
* Add HCC code that suggests HCC's based on the output
* Add descriptions for the ICD and CPT code predictions
* Add top most predictive feature names for the model in the output
* Use label encoder and then reverse transform the encoded values

Extra:
* Add K-Means clustering to add suggested codes based on CPT and ICD code output
* Add the model to a class with functions as methods

# Import Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import gzip
import pandas as pd
import glob
import string
from sklearn.utils import resample
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# Import the MIMIC data

In [2]:
dataset_dictionary = {}

for file_path in glob.glob('.\\Data\\MIMIC Files\*'):
    file_name = file_path.split('\\')[3].split('.')[0]
    with gzip.open(file_path, mode='r') as file:
        dataset_dictionary[file_name] = pd.read_csv(file)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Assign Datatypes

In [3]:
# Check all the datasets exist in the dictionary 
print(dataset_dictionary.keys())

# Check the datatypes and information for each table 
for i in dataset_dictionary.keys():
    print(dataset_dictionary[i].info())

# Correct any datatype issues #####

# CPTEVENTS
dataset_dictionary['CPTEVENTS'].loc[:,['SECTIONHEADER','CPT_CD']] = dataset_dictionary['CPTEVENTS'].loc[:,['SECTIONHEADER','CPT_CD']].astype(str)
dataset_dictionary['CPTEVENTS']['CHARTDATE'] = dataset_dictionary['CPTEVENTS']['CHARTDATE'].to_datetime()

dict_keys(['CPTEVENTS', 'DIAGNOSES_ICD', 'D_CPT', 'D_ICD_DIAGNOSES', 'D_ICD_PROCEDURES', 'NOTEEVENTS', 'PATIENTS', 'PROCEDURES_ICD'])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573146 entries, 0 to 573145
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ROW_ID            573146 non-null  int64  
 1   SUBJECT_ID        573146 non-null  int64  
 2   HADM_ID           573146 non-null  int64  
 3   COSTCENTER        573146 non-null  object 
 4   CHARTDATE         101545 non-null  object 
 5   CPT_CD            573146 non-null  object 
 6   CPT_NUMBER        573128 non-null  float64
 7   CPT_SUFFIX        22 non-null      object 
 8   TICKET_ID_SEQ     471601 non-null  float64
 9   SECTIONHEADER     573125 non-null  object 
 10  SUBSECTIONHEADER  573125 non-null  object 
 11  DESCRIPTION       101545 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 52.5+ MB
None
<class 'pandas.

AttributeError: 'Series' object has no attribute 'to_datetime'

# Join the tables

In [4]:
def join_tables(dataset_dictionary, category=['Discharge summary'], all_notes=False):

    # Define tables
    note_events_base = dataset_dictionary['NOTEEVENTS']
    cpt_events_base = dataset_dictionary['CPTEVENTS']
    icd_events_base = dataset_dictionary['DIAGNOSES_ICD']

    # Combine text for each subject and encounter
    if all_notes == False:
        note_events_base = note_events_base[note_events_base.loc[:,'CATEGORY'].isin(category)]
    
    # Filter out the addendums and restrict notes only to reports
    note_events_base = note_events_base[note_events_base['DESCRIPTION'] == 'Report']
    
    # Aggregate text by Subject and HADM ID's
    note_events = note_events_base.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False)['TEXT'].agg(sum)
    
    # Create CPT table
    cpt_events_base = cpt_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'CPT_CD', 'SECTIONHEADER', 'DESCRIPTION']]
    cpt_events = cpt_events_base.drop_duplicates()
    
    # Create ICD table
    icd_events_base = icd_events_base[icd_events_base['SEQ_NUM'] == 1]
    icd_events_base = icd_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'ICD9_CODE']]
    icd_events = icd_events_base.drop_duplicates()
    
    # Join the datasets
    note_cpt = note_events.merge(cpt_events, on = ['SUBJECT_ID','HADM_ID'])
    note_icd = note_events.merge(icd_events, on = ['SUBJECT_ID', 'HADM_ID'])
    
    # Replace any nulls with blanks
    x = note_cpt[note_cpt['DESCRIPTION'].isnull()].copy()
    x.loc[:,'DESCRIPTION'] = ''
    y = note_cpt[note_cpt['DESCRIPTION'].notnull()].copy()
    note_cpt = pd.concat([x,y])
    
    # Combine description and text columns
    note_cpt['TEXT'] = note_cpt['TEXT'] + note_cpt['DESCRIPTION']
    note_cpt = note_cpt.drop('DESCRIPTION', axis=1)
    
    return note_cpt, note_icd

# Run the function
note_cpt, note_icd = join_tables(dataset_dictionary)

# Drop notes with the nan sectionheader
drop_ls = note_cpt[note_cpt['SECTIONHEADER'] == 'nan']
note_cpt = note_cpt.drop(drop_ls.index)

# Filter the data - CPT

In [5]:
def filter_df(combined_df, threshold):

    # Print value counts original
    print('Value Counts for the original data:\n\n', combined_df['CPT_CD'].value_counts().head(25))

    # Filter based on count limit
    df = combined_df['CPT_CD'].value_counts()
    filtered_ls = list((df[df >= threshold]).index.values)
    filtered_df = combined_df[combined_df['CPT_CD'].isin(filtered_ls)]
    
    # Print value counts filtered
    print('Value Counts for the filtered data:\n\n', combined_df['CPT_CD'].value_counts().head(25))

    return filtered_df

# Find Counts of CPT Codes per Patient Encounter and filter df
def cpt_count_filter(df, og_df, cpt_section_hadm_limit, cpt_hadm_limit):
    
    # Filter based on limit per SECTIONHEADER & merge
    df1 = og_df.groupby(['HADM_ID', 'SECTIONHEADER'])['CPT_CD'].count()
    filtered_encntrs = df1[df1 <= cpt_section_hadm_limit]
    final_df = df.merge(filtered_encntrs, on=['HADM_ID', 'SECTIONHEADER'])
    final_df.drop('CPT_CD_y', axis=1, inplace=True)
    
    # Filter dataset again based on total number of CPT codes per HADM
    df2 = og_df.groupby(['HADM_ID'])['CPT_CD'].count()
    filtered_encntrs = df2[df2 <= cpt_hadm_limit]
    final_df = final_df.merge(filtered_encntrs, on=['HADM_ID'])
    final_df.drop('CPT_CD', axis=1, inplace=True)
    
    # Rename columns
    final_df.columns = ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'CPT_CD', 'SECTIONHEADER']
    
    print('Value Counts for the filtered data:\n\n', final_df['CPT_CD'].value_counts().head(50))

    return final_df
    
# Filter DataFrame to a set amount of CPT codes in each section header #####
def cpt_per_section_filter(df, section_limit, sections=['Emerging technology'], all_sections=False):
    
    # Create list for dataframes
    df_ls = []

    # Group by and count the number of CPT codes
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    cts_by_cpt.index.names = ['SECTIONHEADER', 'CPT_CDS']
    cts_by_cpt = cts_by_cpt.reset_index()
    cts_by_cpt.columns = ['SECTIONHEADER', 'CPT_CD', 'COUNT']

    # Sort values by section and CPT code count
    cts_by_cpt_s = cts_by_cpt.sort_values(by=['SECTIONHEADER','COUNT'], ascending=False)

    # Filter based on the limit of CPT codes wanted for each category
    if all_sections == True:
        sections = list(set(df['SECTIONHEADER']))
        
    for i in sections:
        top_cts = cts_by_cpt_s[cts_by_cpt_s['SECTIONHEADER'] == i].iloc[:section_limit,:]

        # Append to list
        df_ls.append(top_cts)

    # Combine DataFrames
    df_combo = pd.concat(df_ls)

    # Join back to source data
    final_df = df.merge(df_combo, on=['SECTIONHEADER','CPT_CD'])
    
    print('\nThe length of the initial dataset was {} and the new dataset is {}\n\n'.format(len(df), len(final_df)))

    return final_df

def show_section_counts(df):
    # Print count of CPT codes by section
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    cts_by_section = cts_by_cpt.groupby('SECTIONHEADER').count()
    print('\nHere are the counts by section:\n\n', cts_by_section)

def show_cpt_counts_by_section(df):
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    print('\nHere are the counts by CPT by section\n\n:')
#     print(pd.DataFrame(cts_by_cpt.rename('Count')).reset_index())
    for i, x in pd.DataFrame(cts_by_cpt.rename('Count')).reset_index().iterrows():
        print(x['SECTIONHEADER'], x['CPT_CD'], x['Count'])
        
def filter_cpt_ct(df, section, ct):
    df = df[df['SECTIONHEADER'] == section]
    top_codes = list(df['CPT_CD'].value_counts().head(ct).index)
    df = df[df.CPT_CD.isin(top_codes)]
    return df

In [54]:
# Filter the number of CPT occurrences by section and CPT
filtered_df1 = cpt_count_filter(note_cpt, note_cpt, 2, 10)

# Show available sections
print('Available sections:\n\n', set(filtered_df1['SECTIONHEADER']))

# Filter to a set amount of CPT codes for each section header - only here to make the code run faster when testing
filtered_df2 = cpt_per_section_filter(filtered_df1, 10, all_sections=True)

# Filter to those CPT codes that have at least 100 notes or more
filtered_df_cpt = filter_df(filtered_df2, 100)

# Filter total number of CPT codes in the dataset by section
# filtered_df_cpt = filter_cpt_ct(filtered_df_cpt, 'Evaluation and management', 3)

# Show some dataset stats
show_section_counts(filtered_df)
show_cpt_counts_by_section(filtered_df)

Value Counts for the filtered data:

 94003    12871
94002     8206
99291     4758
99232     2385
99233     1924
36556     1788
99254     1170
99231     1078
99223     1035
90935     1027
99222      970
99253      865
99255      700
36620      661
99238      485
31624      469
76942      447
33405      436
76937      352
31645      342
99252      334
31622      315
99292      313
99239      312
99221      268
90801      238
62270      225
31500      194
01996      178
99024      169
90945      154
90937      150
36489      145
32002      140
49080      128
61312      124
99251      111
43246      108
33427      107
93503      107
31600      105
33430       93
33533       91
92960       90
47135       89
33860       86
54150       84
32422       83
27245       80
32000       75
Name: CPT_CD, dtype: int64
Available sections:

 {'Surgery', 'Medicine', 'Emerging technology', 'Radiology', 'Evaluation and management', 'Pathology and laboratory', 'Anesthesia'}

The length of the initial datas

NameError: name 'filtered_df' is not defined

# Filter the Data - ICD

In [55]:
def filter_df_icd(df, threshold):

    x = df['ICD9_CODE'].value_counts() > threshold
    y = list(x[x == 1].index)

    z = df[df['ICD9_CODE'].isin(y)]

    return z

def filter_icd_ct(df,ct):
    top_codes = list(df['ICD9_CODE'].value_counts().head(ct).index)
    df = df[df.ICD9_CODE.isin(top_codes)]
    return df

filtered_df_icd = filter_df_icd(note_icd, 100)
# filtered_df_icd = filter_icd_ct(filtered_df_icd, 3)

# Clean the data

In [56]:
def clean_data(text_series, remove_sections):
  
    # Remove topics
    data = text_series.str.lower() # lowercase all letters
    data = data.str.split(r'(\n\n)')
    if remove_sections:
        for row_num, value in enumerate(data):
            text_chunks = [x.split(':', maxsplit=1) for x in value]
            ls = []
            for i, x in enumerate(text_chunks):
                try:
                    if x[0] != 'social history' or x[0] != 'family history' or 'medication' not in x[0]:
                        ls.append(x[1])
                except:
                    continue

            text_series.iloc[row_num] = ' '.join(ls)

        
    # Remove dates and locations
    text_series = text_series.str.replace('\[\*\*(.*?)\*\*\]', ' ', regex=True)
    
    # Replace \n 
    text_series = text_series.str.replace('\\n',' ', regex=True)  
    
    # Replace punctuation
    text_series = text_series.str.replace('[' + string.punctuation + ']', ' ', regex=True)
    
    # Remove all digits
    text_series = text_series.str.replace('\d',' ', regex=True)
    
    # Replace plurals, endings with ing, endings with ed, endings with ly
#     text_series = text_series.str.replace('s(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ing(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ed(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ly(?=\s)', ' ', regex=True)
    
    return text_series

# Update Text Column -----
filtered_df_cpt['TEXT'] = clean_data(filtered_df_cpt['TEXT'], True)
filtered_df_icd['TEXT'] = clean_data(filtered_df_icd['TEXT'], True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_cpt['TEXT'] = clean_data(filtered_df_cpt['TEXT'], True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

# Label Encode the Predictors

In [44]:
le = preprocessing.LabelEncoder()
filtered_df['CPT_CD'] = le.fit_transform(filtered_df['CPT_CD'])

# Check the Counts for Each Section

In [58]:
filtered_df_cpt['CPT_CD'].value_counts()

94003    12871
94002     8206
99291     4758
99232     2385
99233     1924
36556     1788
99254     1170
99231     1078
99223     1035
90935     1027
99222      970
99253      865
99255      700
36620      661
99238      485
31624      469
76942      447
33405      436
76937      352
31645      342
31622      315
90801      238
62270      225
31500      194
01996      178
99024      169
90945      154
90937      150
36489      145
32002      140
93503      107
Name: CPT_CD, dtype: int64

# Select CPT sections to Run

In [59]:
sections = list(set(filtered_df_cpt['SECTIONHEADER']))
print(sections)
sections = ['Evaluation and management', 'Surgery', 'Medicine']
print(sections)

['Surgery', 'Medicine', 'Radiology', 'Evaluation and management', 'Anesthesia']
['Evaluation and management', 'Surgery', 'Medicine']


# Split the Data - CPT

In [60]:
tt_dict = {} 

def split_stratify_df(df, sections, combine_others=False, all_=False):
    
    # Train test split for each section selected
    if all_:
        sections.clear()
    else:
        for i in sections:
            df_x = df[df['SECTIONHEADER'] == i]['TEXT'].values
            df_y = df[df['SECTIONHEADER'] == i]['CPT_CD']
            tt_dict['X_train_' + i], tt_dict['X_test_' + i], tt_dict['y_train_' + i], tt_dict['y_test_' + i], \
            tt_dict['index_train_' + i], tt_dict['index_test_' + i] = \
            train_test_split(df_x, df_y, range(len(df_y)), test_size = .3, random_state = 42, shuffle=True)
        
    # Group other sections not included in selection
    if combine_others:
        i = 'other'
        other_sections = list(set(df['SECTIONHEADER']).difference(set(sections)))
        sections.append(i)
        df_x = df[df['SECTIONHEADER'].isin(other_sections)]['TEXT'].values
        df_y = df[df['SECTIONHEADER'].isin(other_sections)]['CPT_CD']
        tt_dict['X_train_' + i], tt_dict['X_test_' + i], tt_dict['y_train_' + i], tt_dict['y_test_' + i], \
        tt_dict['index_train_' + i], tt_dict['index_test_' + i] = \
        train_test_split(df_x, df_y, range(len(df_y)), test_size = .3, random_state = 42, shuffle=True)
        
        
split_stratify_df(filtered_df_cpt, sections, True)

# Select ICD sections to run

In [61]:
sections.append('icd')
# sections.append('other')
print(sections)

['Evaluation and management', 'Surgery', 'Medicine', 'other', 'icd']


# Split the Data - ICD

In [62]:
tt_dict['X_train_icd'], tt_dict['X_test_icd'] , tt_dict['y_train_icd'], tt_dict['y_test_icd'] = \
train_test_split(filtered_df_icd['TEXT'].values, filtered_df_icd['ICD9_CODE'], test_size = .3, random_state = 42, shuffle=True)

# Balance the data

In [63]:
def oversample_df(X_train, y_train, percentile):
            
    # Recombine the training dataset
    x = pd.Series(X_train).reset_index(drop=True)
    y = pd.Series(y_train).reset_index(drop=True)
    training_df = pd.concat([x,y], axis=1, ignore_index=True)

    # Check counts
    df_cts = training_df.iloc[:,1].value_counts()
    record_ct = round(np.percentile(df_cts, percentile))
    print('New Balanced Record Count per feature: {}'.format(record_ct))
    
    # Create a list of CPT values
    df = list(df_cts.index.values)

    # Resample
    minority_df = []
    for i in df:
        test_resampled = resample(training_df[training_df.iloc[:,1] == i], replace=True, n_samples=record_ct, random_state=123)
        minority_df.append(test_resampled)
    
    # Create final dataframe
    new_df = pd.concat(minority_df)
    
    return new_df

def balance(tt_dict, sections):
    for section in sections:
        print(section)
        
        # Running balance function
        training_balanced = oversample_df(tt_dict['X_train_' + section], tt_dict['y_train_' + section] , 95)
        
        # Reassign balanced data
        tt_dict['X_train_' + section] = np.array(training_balanced.iloc[:,0].values)
        tt_dict['y_train_' + section] = np.array(training_balanced.iloc[:,1].values)

# Run the functions
balance(tt_dict, sections)

Evaluation and management
New Balanced Record Count per feature: 2580
Surgery
New Balanced Record Count per feature: 893
Medicine
New Balanced Record Count per feature: 7842
other
New Balanced Record Count per feature: 308
icd
New Balanced Record Count per feature: 785


# Tokenize the data

In [109]:
# Define stop words
my_stop_words = list(set(stopwords.words('english'))) \
                + ['admission', 'date', 'sex'] \
                + ['needed', 'every', 'seen', 'weeks', 'please', 'ml', 'unit', 'small', 'year', 'old', 'cm', 'non', 'mm', 'however']
                # Got the above from my top 100 most predictive words that I wanted to remove

# Set Dictionary for vectorized words
vectorized_words = {}

# Import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=my_stop_words, max_df=.7, min_df = 2, sublinear_tf = True, ngram_range = (1, 2))
    
def vectorize_df(train_test_dict, tfidf_vectorizer, sections):

    for section in sections:
        
        # Transform the training data
        tfidf_train = tfidf_vectorizer.fit_transform(train_test_dict['X_train_' + section])

        # Transform the test data
        tfidf_test = tfidf_vectorizer.transform(train_test_dict['X_test_' + section])

        # Add results to dictionary

        vectorized_words['tfidf_train_' + section] = tfidf_train
        vectorized_words['tfidf_test_' + section] = tfidf_test
        vectorized_words['tfidf_vectorizer' + section] = tfidf_vectorizer 

vectorize_df(tt_dict, tfidf_vectorizer, ['Surgery'])   


# Run Naive Bayes

In [45]:
models_nb = {}

def run_clf(vectorized_words, train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        nb_classifier = MultinomialNB()
        
        # Fit model
        nb_classifier.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_nb[section] = nb_classifier
    
run_clf(vectorized_words, tt_dict, ['other'])

# Run Decision Tree

In [46]:
from sklearn.tree import DecisionTreeClassifier

models_tree = {}

def run_clf(vectorized_words, train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = DecisionTreeClassifier(random_state=123)
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_tree[section] = clf
    
run_clf(vectorized_words, tt_dict, ['other'])

# Run Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

models_rf = {}

def run_clf(vectorized_words, train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = RandomForestClassifier(random_state=123)
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_rf[section] = clf
    
run_clf(vectorized_words, tt_dict, ['other'])

# Run Gradient Boost

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

models_xg = {}

def run_clf(vectorized_words, train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = GradientBoostingClassifier(random_state=123)
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_xg[section] = clf
    
run_clf(vectorized_words, tt_dict, ['other'])

# KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

models_knn = {}

def run_clf(vectorized_words, train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = KNeighborsClassifier()
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_knn[section] = clf
    
run_clf(vectorized_words, tt_dict, ['other'])

# Logistic Regression

In [91]:
from sklearn.linear_model import LogisticRegression

models_lr = {}

def run_clf(vectorized_words, train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = LogisticRegression(random_state=123, C=1, max_iter=25, solver='sag', n_jobs=-1)
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_lr[section] = clf
    
run_clf(vectorized_words, tt_dict, sections)

# Evaluation

In [92]:
# models_ls = [models_nb, models_rf, models_xg, models_knn, models_lr, models_tree]
models_ls = [models_lr]

# Check accuracy
def predictions(tt_dict, models, section=[]):
    for key, model in models.items():
        if len(section) == 0 or key == section:
            pred = model.predict(vectorized_words['tfidf_test_' + key])
            print(key)
            print(metrics.accuracy_score(tt_dict['y_test_' + key], pred))

for model in models_ls:
    print(str(model))
    predictions(tt_dict, model)

{'Evaluation and management': LogisticRegression(C=1, max_iter=25, n_jobs=-1, random_state=123, solver='sag'), 'Surgery': LogisticRegression(C=1, max_iter=25, n_jobs=-1, random_state=123, solver='sag'), 'Medicine': LogisticRegression(C=1, max_iter=25, n_jobs=-1, random_state=123, solver='sag'), 'other': LogisticRegression(C=1, max_iter=25, n_jobs=-1, random_state=123, solver='sag'), 'icd': LogisticRegression(C=1, max_iter=25, n_jobs=-1, random_state=123, solver='sag')}
Evaluation and management
0.21058338755150727
Surgery
0.47491166077738517
Medicine
0.8240511851097863
other
0.7959183673469388
icd
0.6199056199056199


# Classification Report

In [116]:
# Create classification report taken from here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
print('Test')
section = 'other'
pred = models_ls[0][section].predict(vectorized_words['tfidf_test_' + section])
print(classification_report(tt_dict['y_test_' + section], pred))

# print('Training')
# pred_x = nb_classifier.predict(vectorized_words['tfidf_train'])
# print(classification_report(tt_dict['y_train_' + section], pred_x))

Test


KeyError: 'tfidf_test_other'

# Hyperparameter Tuning - Grid Search - LR

In [82]:
# Define stop words
my_stop_words = list(set(stopwords.words('english'))) \
                + ['admission', 'date', 'sex'] \
                + ['needed', 'every', 'seen', 'weeks', 'please', 'ml', 'unit', 'small', 'year', 'old', 'cm', 'non', 'mm', 'however']
                # Got the above from my top 100 most predictive words that I wanted to remove

# Taken from: https://stackoverflow.com/questions/44066264/how-to-choose-parameters-in-tfidfvectorizer-in-sklearn-during-unsupervised-clust/44080802
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=my_stop_words)),
    ('clf', LogisticRegression(random_state=123)),
])
parameters = {
#     'tfidf__max_df': (.15,.2,.25,.3) # .2 is the best param
#     , 'tfidf__sublinear_tf' : (True, False) # True
#     'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # (1,2) is best
#     'clf__alpha': (0.001, .01) # .001 is best
    
#     'tfidf__max_df': (.2,.5,.7) # .7 is the best param
#     , 'tfidf__ngram_range': [(1, 1), (1, 2)] # (1,2) is best
#     , 'tfidf__min_df': (1,2,3) # 2 is the best
#     , 'clf__alpha': (0.001, .01, .1,.5) # .001 is best
    'clf__C': (.8,.9,1) # 1
#     , 'clf__solver': ('liblinear', 'sag', 'saga') # sag
    , 'clf__max_iter': (25,40,50) # 25
   
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=3, scoring='accuracy')
grid_search_tune.fit(tt_dict['X_train_other'], tt_dict['y_train_other'])

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
Best parameters set:
[('tfidf', TfidfVectorizer(stop_words=['as', 'this', 'during', 'up', 'than', 'what', 'had',
                            'wouldn', 'shan', 'm', 'ours', 'y', 'how', 'you',
                            'of', 'further', 'while', 'if', 'can', 'off',
                            'some', 'whom', 'shouldn', 'any', 'that', 'your',
                            't', 'does', 'after', 'below', ...])), ('clf', LogisticRegression(C=1, max_iter=25, random_state=123))]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Hyperparameter Tuning - Grid Search - NB

In [None]:
# Define stop words
my_stop_words = list(set(stopwords.words('english'))) \
                + ['admission', 'date', 'sex'] \
                + ['needed', 'every', 'seen', 'weeks', 'please', 'ml', 'unit', 'small', 'year', 'old', 'cm', 'non', 'mm', 'however']
                # Got the above from my top 100 most predictive words that I wanted to remove

# Taken from: https://stackoverflow.com/questions/44066264/how-to-choose-parameters-in-tfidfvectorizer-in-sklearn-during-unsupervised-clust/44080802
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=my_stop_words)),
    ('clf', MultinomialNB()),
])
parameters = {
#     'tfidf__max_df': (.15,.2,.25,.3) # .2 is the best param
#     , 'tfidf__sublinear_tf' : (True, False) # True
#     'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # (1,2) is best
#     'clf__alpha': (0.001, .01) # .001 is best
    
    'tfidf__max_df': (.2,.5,.7) # .7 is the best param
    , 'tfidf__ngram_range': [(1, 1), (1, 2)] # (1,2) is best
    , 'tfidf__min_df': (1,2,3) # 2 is the best
    , 'clf__alpha': (0.001, .01, .1,.5) # .001 is best
   
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=3, scoring='accuracy')
grid_search_tune.fit(tt_dict['X_train_other'], tt_dict['y_train_other'])

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

In [81]:
best_result = grid_search_tune.best_score_
print(best_result)

0.9090909090909092


# Make Predictions

In [115]:
def predict_cpt(text):
    input_text_clean = clean_data(pd.Series(text), False)
    tfidf_input_test = tfidf_vectorizer.transform(input_text_clean)
    models_ls[0]['Surgery'].predict(tfidf_input_test)
    print(models_ls[0]['Surgery'].predict(tfidf_input_test), str(round(max(models_ls[0]['Surgery'].predict_proba(tfidf_input_test)[0]) * 100,2)) + '%')
    print(models_ls[0]['Surgery'].predict_proba(tfidf_input_test)[0])
          
text_cpt = ''
predict_cpt(text_cpt)

['36620'] 15.53%
[0.06305929 0.14101228 0.10888526 0.14560006 0.09102712 0.02977054
 0.1164756  0.08330722 0.15525441 0.06560823]


# Look at the Most Predictive Features

In [None]:
# Notes
# sum([np.exp(1)** x for x in nb_classifier.coef_[0]]) # The probability of all the words equals one
# # Taken from here: * https://stackoverflow.com/questions/61586946/how-to-calculate-feature-log-prob-in-the-naive-bayes-multinomialnb

# ------------------------------------------

import numpy as np

def get_feature_rank(tfidf_vectorizer, y_no, nb_classifier):
    
    # Get the feature names
    feature_names = tfidf_vectorizer.get_feature_names()

    # Zip together the first CPT weights with feature names
    feat_with_weights =  sorted(zip(nb_classifier.coef_[y_no], feature_names))
    
    # Print words most responsible for the prediction
#     print('Top 100 \n\n\n\n')
#     top_100_ls = []
    for i in range(100):
        x = feat_with_weights[-i-1]
#         top_100_ls.append(x[1])
#         print(nb_classifier.classes_[y_no], i, round((np.exp(1) ** x[0]),4), x[1])

#     print('\n\n\n\n Bottom 100 \n\n\n\n')
    for i in range(100):
        x = feat_with_weights[i]
#         print(nb_classifier.classes_[y_no], i, round((np.exp(1) ** x[0]),4), x[1])
    
#     min_weight = min([i[0] for i in feat_with_weights])
    
    x = [i[0] for i in feat_with_weights]
    
    median_pred = np.median(x)
          
    return [i[1] for i in feat_with_weights if i[0] <= median_pred] # Minimum weight words
#     return top_100_ls

# Find the least predictive words
def least_pred_words(nb_classifier, tfidf_vectorizer):
    low_wt_stop_ls = []

    for i in range(len(nb_classifier.classes_)):
        low_wt_stop_ls += get_feature_rank(tfidf_vectorizer, i, nb_classifier)

    low_wt_stop_ls = list(set(low_wt_stop_ls))
    return low_wt_stop_ls
    
low_wt_stop_ls = least_pred_words(nb_classifier, tfidf_vectorizer)

# Find top 100 words - doesn't seem to improve the model
def highest_pred_words(nb_classifier, tfidf_vectorizer):
    top_100_ls = []
    for i in range(len(nb_classifier.classes_)):
        top_100_ls += get_feature_rank(tfidf_vectorizer, i, nb_classifier)

    top_100_ls = list(set(top_100_ls))
    return top_100_ls