# Import Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import gzip
import pandas as pd
import glob
import string
from sklearn.utils import resample
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle


# Import the data from MIMIC-III

Importing 8 tables from the MIMIC-III database, all of them are related to ICD and CPT codes and the related clinical notes. I store this information in a dictionary that I can pull from later on in the code.

In [2]:
dataset_dictionary = {}

for file_path in glob.glob('..\\Data\\MIMIC Files\*'):
    file_name = file_path.split('\\')[3].split('.')[0]
    with gzip.open(file_path, mode='r') as file:
        dataset_dictionary[file_name] = pd.read_csv(file)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
dataset_dictionary.keys()
# dataset_dictionary['D_ICD_DIAGNOSES']
# dataset_dictionary['D_CPT']

dict_keys(['CPTEVENTS', 'DIAGNOSES_ICD', 'D_CPT', 'D_ICD_DIAGNOSES', 'D_ICD_PROCEDURES', 'NOTEEVENTS', 'PATIENTS', 'PROCEDURES_ICD'])

In [4]:
dataset_dictionary['CPTEVENTS']

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,COSTCENTER,CHARTDATE,CPT_CD,CPT_NUMBER,CPT_SUFFIX,TICKET_ID_SEQ,SECTIONHEADER,SUBSECTIONHEADER,DESCRIPTION
0,317,11743,129545,ICU,,99232,99232.0,,6.0,Evaluation and management,Hospital inpatient services,
1,318,11743,129545,ICU,,99232,99232.0,,7.0,Evaluation and management,Hospital inpatient services,
2,319,11743,129545,ICU,,99232,99232.0,,8.0,Evaluation and management,Hospital inpatient services,
3,320,11743,129545,ICU,,99232,99232.0,,9.0,Evaluation and management,Hospital inpatient services,
4,321,6185,183725,ICU,,99223,99223.0,,1.0,Evaluation and management,Hospital inpatient services,
...,...,...,...,...,...,...,...,...,...,...,...,...
573141,573142,78876,163404,Resp,2105-09-01 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
573142,573143,78879,136071,Resp,2150-08-29 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
573143,573144,78879,136071,Resp,2150-08-28 00:00:00,94002,94002.0,,,Medicine,Pulmonary,"VENT MGMT, 1ST DAY (INVASIVE)"
573144,573145,78892,175171,Resp,2125-06-11 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)


# Assign Datatypes

This section cleans up the code and fixes any datatype issues.

In [5]:
# Check all the datasets exist in the dictionary 
print(dataset_dictionary.keys())

# Check the datatypes and information for each table 
for i in dataset_dictionary.keys():
    print(dataset_dictionary[i].info())

# Correct any datatype issues #####

# CPTEVENTS
dataset_dictionary['CPTEVENTS'].loc[:,['SECTIONHEADER','CPT_CD']] = dataset_dictionary['CPTEVENTS'].loc[:,['SECTIONHEADER','CPT_CD']].astype(str)
dataset_dictionary['CPTEVENTS']['CHARTDATE'] = dataset_dictionary['CPTEVENTS']['CHARTDATE'].to_datetime()

dict_keys(['CPTEVENTS', 'DIAGNOSES_ICD', 'D_CPT', 'D_ICD_DIAGNOSES', 'D_ICD_PROCEDURES', 'NOTEEVENTS', 'PATIENTS', 'PROCEDURES_ICD'])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573146 entries, 0 to 573145
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ROW_ID            573146 non-null  int64  
 1   SUBJECT_ID        573146 non-null  int64  
 2   HADM_ID           573146 non-null  int64  
 3   COSTCENTER        573146 non-null  object 
 4   CHARTDATE         101545 non-null  object 
 5   CPT_CD            573146 non-null  object 
 6   CPT_NUMBER        573128 non-null  float64
 7   CPT_SUFFIX        22 non-null      object 
 8   TICKET_ID_SEQ     471601 non-null  float64
 9   SECTIONHEADER     573125 non-null  object 
 10  SUBSECTIONHEADER  573125 non-null  object 
 11  DESCRIPTION       101545 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 52.5+ MB
None
<class 'pandas.

AttributeError: 'Series' object has no attribute 'to_datetime'

# Join the tables

The ICD and CPT codes needed to be joined to the clinical notes. Also, a little bit of data cleaning needed to be done. The clinical notes were limited to just the discharge summaries since that helps to capture an overview of the patient encounter. The original number of clinical notes was 2,083,180 and filtering the category to discharge summary reduced it to 223,571 notes. The description type is also filtered to just report. The joins were made on the subject ID and HADM ID (encounter ID).

In [6]:
def join_tables(dataset_dictionary, category=['Discharge summary'], all_notes=False):

    # Define tables
    note_events_base = dataset_dictionary['NOTEEVENTS']
    cpt_events_base = dataset_dictionary['CPTEVENTS']
    icd_events_base = dataset_dictionary['DIAGNOSES_ICD']

    # Combine text for each subject and encounter
    if all_notes == False:
        note_events_base = note_events_base[note_events_base.loc[:,'CATEGORY'].isin(category)]
    
    # Filter out the addendums and restrict notes only to reports
    note_events_base = note_events_base[note_events_base['DESCRIPTION'] == 'Report']
    
    # Aggregate text by Subject and HADM ID's
    note_events = note_events_base.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False)['TEXT'].agg(sum)
    
    # Create CPT table
    cpt_events_base = cpt_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'CPT_CD', 'SECTIONHEADER', 'DESCRIPTION']]
    cpt_events = cpt_events_base.drop_duplicates()
    
    # Create ICD table
    icd_events_base = icd_events_base[icd_events_base['SEQ_NUM'] == 1]
    icd_events_base = icd_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'ICD9_CODE']]
    icd_events = icd_events_base.drop_duplicates()
    
    # Join the datasets
    note_cpt = note_events.merge(cpt_events, on = ['SUBJECT_ID','HADM_ID'])
    note_icd = note_events.merge(icd_events, on = ['SUBJECT_ID', 'HADM_ID'])
    
    # Replace any nulls with blanks
    x = note_cpt[note_cpt['DESCRIPTION'].isnull()].copy()
    x.loc[:,'DESCRIPTION'] = ''
    y = note_cpt[note_cpt['DESCRIPTION'].notnull()].copy()
    note_cpt = pd.concat([x,y])
    
    # Combine description and text columns
    note_cpt['TEXT'] = note_cpt['TEXT'] + note_cpt['DESCRIPTION']
    note_cpt = note_cpt.drop('DESCRIPTION', axis=1)
    
    return note_cpt, note_icd

# Run the function
note_cpt, note_icd = join_tables(dataset_dictionary)

# Drop notes with the nan sectionheader
drop_ls = note_cpt[note_cpt['SECTIONHEADER'] == 'nan']
note_cpt = note_cpt.drop(drop_ls.index)

In [7]:
dataset_dictionary['PATIENTS']

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0
...,...,...,...,...,...,...,...,...
46515,31840,44089,M,2026-05-25 00:00:00,,,,0
46516,31841,44115,F,2124-07-27 00:00:00,,,,0
46517,31842,44123,F,2049-11-26 00:00:00,2135-01-12 00:00:00,2135-01-12 00:00:00,,1
46518,31843,44126,F,2076-07-25 00:00:00,,,,0


In [10]:
note_cpt.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,CPT_CD,SECTIONHEADER
2,4,185777.0,Admission Date: [**2191-3-16**] Discharge...,99223,Evaluation and management
3,4,185777.0,Admission Date: [**2191-3-16**] Discharge...,99233,Evaluation and management
4,4,185777.0,Admission Date: [**2191-3-16**] Discharge...,99232,Evaluation and management
5,4,185777.0,Admission Date: [**2191-3-16**] Discharge...,99231,Evaluation and management
6,4,185777.0,Admission Date: [**2191-3-16**] Discharge...,99238,Evaluation and management


# Filter the data - CPT

A number of different functions were made which helped to filter the CPT codes. This was part of the preprocessing process and the steps that were made here improved accuracy in the model a lot. In the end, I partitioned the CPT codes by the CPT Category Code Level 1. There are 6 sections. I took the top 5 from each one. I also filtered to only codes that had 100 notes or more to help the machine learning model learn.

In [8]:
def filter_df(combined_df, threshold):

    # Print value counts original
    print('Value Counts for the original data:\n\n', combined_df['CPT_CD'].value_counts().head(25))

    # Filter based on count limit
    df = combined_df['CPT_CD'].value_counts()
    filtered_ls = list((df[df >= threshold]).index.values)
    filtered_df = combined_df[combined_df['CPT_CD'].isin(filtered_ls)]
    
    # Print value counts filtered
    print('Value Counts for the filtered data:\n\n', combined_df['CPT_CD'].value_counts().head(25))

    return filtered_df

# Find Counts of CPT Codes per Patient Encounter and filter df
def cpt_count_filter(df, og_df, cpt_section_hadm_limit, cpt_hadm_limit):
    
    # Filter based on limit per SECTIONHEADER & merge
    df1 = og_df.groupby(['HADM_ID', 'SECTIONHEADER'])['CPT_CD'].count()
    filtered_encntrs = df1[df1 <= cpt_section_hadm_limit]
    final_df = df.merge(filtered_encntrs, on=['HADM_ID', 'SECTIONHEADER'])
    final_df.drop('CPT_CD_y', axis=1, inplace=True)
    
    # Filter dataset again based on total number of CPT codes per HADM
    df2 = og_df.groupby(['HADM_ID'])['CPT_CD'].count()
    filtered_encntrs = df2[df2 <= cpt_hadm_limit]
    final_df = final_df.merge(filtered_encntrs, on=['HADM_ID'])
    final_df.drop('CPT_CD', axis=1, inplace=True)
    
    # Rename columns
    final_df.columns = ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'CPT_CD', 'SECTIONHEADER']
    
    print('Value Counts for the filtered data:\n\n', final_df['CPT_CD'].value_counts().head(50))

    return final_df
    
# Filter DataFrame to a set amount of CPT codes in each section header #####
def cpt_per_section_filter(df, section_limit, sections=['Emerging technology'], all_sections=False):
    
    # Create list for dataframes
    df_ls = []

    # Group by and count the number of CPT codes
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    cts_by_cpt.index.names = ['SECTIONHEADER', 'CPT_CDS']
    cts_by_cpt = cts_by_cpt.reset_index()
    cts_by_cpt.columns = ['SECTIONHEADER', 'CPT_CD', 'COUNT']

    # Sort values by section and CPT code count
    cts_by_cpt_s = cts_by_cpt.sort_values(by=['SECTIONHEADER','COUNT'], ascending=False)

    # Filter based on the limit of CPT codes wanted for each category
    if all_sections == True:
        sections = list(set(df['SECTIONHEADER']))
        
    for i in sections:
        top_cts = cts_by_cpt_s[cts_by_cpt_s['SECTIONHEADER'] == i].iloc[:section_limit,:]

        # Append to list
        df_ls.append(top_cts)

    # Combine DataFrames
    df_combo = pd.concat(df_ls)

    # Join back to source data
    final_df = df.merge(df_combo, on=['SECTIONHEADER','CPT_CD'])
    
    print('\nThe length of the initial dataset was {} and the new dataset is {}\n\n'.format(len(df), len(final_df)))

    return final_df

def show_section_counts(df):
    # Print count of CPT codes by section
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    cts_by_section = cts_by_cpt.groupby('SECTIONHEADER').count()
    print('\nHere are the counts by section:\n\n', cts_by_section)

def show_cpt_counts_by_section(df):
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    print('\nHere are the counts by CPT by section\n\n:')
#     print(pd.DataFrame(cts_by_cpt.rename('Count')).reset_index())
    for i, x in pd.DataFrame(cts_by_cpt.rename('Count')).reset_index().iterrows():
        print(x['SECTIONHEADER'], x['CPT_CD'], x['Count'])
        
def filter_cpt_ct(df, section, ct):
    df = df[df['SECTIONHEADER'] == section]
    top_codes = list(df['CPT_CD'].value_counts().head(ct).index)
    df = df[df.CPT_CD.isin(top_codes)]
    return df

In [9]:
# Filter the number of CPT occurrences by section and CPT
filtered_df1 = cpt_count_filter(note_cpt, note_cpt, 2, 10)

# Show available sections
print('Available sections:\n\n', set(filtered_df1['SECTIONHEADER']))

# Filter to a set amount of CPT codes for each section header - only here to make the code run faster when testing
filtered_df2 = cpt_per_section_filter(filtered_df1, 5, all_sections=True)

# Filter to those CPT codes that have at least 10 notes or more
filtered_df_cpt = filter_df(filtered_df2, 10)

# Filter total number of CPT codes in the dataset by section
# filtered_df_cpt = filter_cpt_ct(filtered_df_cpt, 'Evaluation and management', 3)

# Show some dataset stats
show_section_counts(filtered_df_cpt)
show_cpt_counts_by_section(filtered_df_cpt)

Value Counts for the filtered data:

 94003    12871
94002     8206
99291     4758
99232     2385
99233     1924
36556     1788
99254     1170
99231     1078
99223     1035
90935     1027
99222      970
99253      865
99255      700
36620      661
99238      485
31624      469
76942      447
33405      436
76937      352
31645      342
99252      334
31622      315
99292      313
99239      312
99221      268
90801      238
62270      225
31500      194
01996      178
99024      169
90945      154
90937      150
36489      145
32002      140
49080      128
61312      124
99251      111
43246      108
33427      107
93503      107
31600      105
33430       93
33533       91
92960       90
47135       89
33860       86
54150       84
32422       83
27245       80
32000       75
Name: CPT_CD, dtype: int64
Available sections:

 {'Medicine', 'Radiology', 'Evaluation and management', 'Emerging technology', 'Pathology and laboratory', 'Surgery', 'Anesthesia'}

The length of the initial datas

In [40]:
filtered_df_cpt[['SECTIONHEADER','CPT_CD']].value_counts().sort_index()

SECTIONHEADER              CPT_CD
Anesthesia                 01996       178
                           99141        23
                           99144        19
Evaluation and management  99231      1078
                           99232      2385
                           99233      1924
                           99254      1170
                           99291      4758
Medicine                   90801       238
                           90935      1027
                           94002      8206
                           94003     12871
                           99024       169
Pathology and laboratory   85060        22
Radiology                  75940        32
                           75989        19
                           76604        57
                           76937       352
                           76942       447
Surgery                    31624       469
                           31645       342
                           33405       436
                    

# Filter the Data - ICD

I filtered the CID codes so there wouldn't be any notes with less than 100 notes. I also filtered to the top 5 ICD-9 codes to increase accuracy.

In [17]:
def filter_df_icd(df, threshold):

    x = df['ICD9_CODE'].value_counts() > threshold
    y = list(x[x == 1].index)

    z = df[df['ICD9_CODE'].isin(y)]

    return z

def filter_icd_ct(df,ct):
    top_codes = list(df['ICD9_CODE'].value_counts().head(ct).index)
    df = df[df.ICD9_CODE.isin(top_codes)]
    return df

filtered_df_icd = filter_df_icd(note_icd, 100)
filtered_df_icd = filter_icd_ct(filtered_df_icd, 5)

In [18]:
filtered_df_icd[['ICD9_CODE']].value_counts()
# filtered_df_icd

ICD9_CODE
41401        3463
0389         1976
41071        1719
V3001        1390
4241         1136
dtype: int64

# Clean the data

This was another step that was completed to preprocess the data. Sections that were not relevant were removed, such as social history, family history, and medication. Other steps that were taken included lowercasing all letters, removing dates and location, replacing return characters, replacing punctuation, and replacing digits.

In [19]:
def clean_data(text_series, remove_sections):
  
    # Remove topics
    data = text_series.str.lower() # lowercase all letters
    data = data.str.split(r'(\n\n)')
    if remove_sections:
        for row_num, value in enumerate(data):
            text_chunks = [x.split(':', maxsplit=1) for x in value]
            ls = []
            for i, x in enumerate(text_chunks):
                try:
                    if x[0] != 'social history' or x[0] != 'family history' or 'medication' not in x[0]:
                        ls.append(x[1])
                except:
                    continue

            text_series.iloc[row_num] = ' '.join(ls)

        
    # Remove dates and locations
    text_series = text_series.str.replace('\[\*\*(.*?)\*\*\]', ' ', regex=True)
    
    # Replace \n 
    text_series = text_series.str.replace('\\n',' ', regex=True)  
    
    # Replace punctuation
    text_series = text_series.str.replace('[' + string.punctuation + ']', ' ', regex=True)
    
    # Remove all digits
    text_series = text_series.str.replace('\d',' ', regex=True)
    
    # Replace plurals, endings with ing, endings with ed, endings with ly
#     text_series = text_series.str.replace('s(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ing(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ed(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ly(?=\s)', ' ', regex=True)
    
    return text_series

# Update Text Column -----
filtered_df_cpt['TEXT'] = clean_data(filtered_df_cpt['TEXT'], True)
filtered_df_icd['TEXT'] = clean_data(filtered_df_icd['TEXT'], True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_cpt['TEXT'] = clean_data(filtered_df_cpt['TEXT'], True)


# Select CPT Code Sections to Run

Set the variable sections to the list of CPT code sections. Anything not in those sections will have the option to grouped together into a category called 'other'.

In [20]:
sections = list(set(filtered_df_cpt['SECTIONHEADER']))
print(sections)
sections = ['Evaluation and management', 'Surgery', 'Medicine', 'Radiology']
print(sections)

['Anesthesia', 'Surgery', 'Medicine', 'Radiology', 'Evaluation and management', 'Pathology and laboratory']
['Evaluation and management', 'Surgery', 'Medicine', 'Radiology']


# Split the Data

Define a function to split the CPT code data. Sets test dataset size to 30% and training to 70%. Also shuffles the data to remove any bias implicit in the data ordering. Finally, combines CPT sections not selected into an other category in order to increase the number of notes in that section.

In [21]:
tt_dict = {} 

def split_stratify_df(df, sections, combine_others=False, all_=False):
    
    # Train test split for each section selected
    if all_:
        sections.clear()
    else:
        for i in sections:
            
            # Test and training
            df_x = df[df['SECTIONHEADER'] == i]['TEXT'].values
            df_y = df[df['SECTIONHEADER'] == i]['CPT_CD']
            tt_dict['X_train_' + i], tt_dict['X_test_' + i], tt_dict['y_train_' + i], tt_dict['y_test_' + i], \
            tt_dict['index_train_' + i], tt_dict['index_test_' + i] = \
            train_test_split(df_x, df_y, range(len(df_y)), test_size = .3, random_state = 42, shuffle=True)
            
            # Validation
            tt_dict['X_test_' + i], tt_dict['X_val_' + i], tt_dict['y_test_' + i], tt_dict['y_val_' + i], \
            tt_dict['index_test_' + i], tt_dict['index_val_' + i] = \
            train_test_split(tt_dict['X_test_' + i], tt_dict['y_test_' + i], range(len(tt_dict['y_test_' + i])), \
                             test_size = .05, random_state = 42, shuffle=True)
            
    # Group other sections not included in selection
    if combine_others:
        i = 'other'
        other_sections = list(set(df['SECTIONHEADER']).difference(set(sections)))
        sections.append(i)
        df_x = df[df['SECTIONHEADER'].isin(other_sections)]['TEXT'].values
        df_y = df[df['SECTIONHEADER'].isin(other_sections)]['CPT_CD']
        
        # Test and training
        tt_dict['X_train_' + i], tt_dict['X_test_' + i], tt_dict['y_train_' + i], tt_dict['y_test_' + i], \
        tt_dict['index_train_' + i], tt_dict['index_test_' + i] = \
        train_test_split(df_x, df_y, range(len(df_y)), test_size = .3, random_state = 42, shuffle=True)
        
        # Validation
        tt_dict['X_test_' + i], tt_dict['X_val_' + i], tt_dict['y_test_' + i], tt_dict['y_val_' + i], \
        tt_dict['index_test_' + i], tt_dict['index_val_' + i] = \
        train_test_split(tt_dict['X_test_' + i], tt_dict['y_test_' + i], range(len(tt_dict['y_test_' + i])), \
                         test_size = .05, random_state = 42, shuffle=True)
        
split_stratify_df(filtered_df_cpt, sections, True)

# Select ICD sections to run

In [22]:
sections.append('icd')
print(sections)

['Evaluation and management', 'Surgery', 'Medicine', 'Radiology', 'other', 'icd']


# Split the Data - ICD

In [23]:
# Training and test
tt_dict['X_train_icd'], tt_dict['X_test_icd'] , tt_dict['y_train_icd'], tt_dict['y_test_icd'] = \
train_test_split(filtered_df_icd['TEXT'].values, filtered_df_icd['ICD9_CODE'], test_size = .3, random_state = 42, shuffle=True)

# Test and Validation
tt_dict['X_test_icd'], tt_dict['X_val_icd'] , tt_dict['y_test_icd'], tt_dict['y_val_icd'] = \
train_test_split(tt_dict['X_test_icd'], tt_dict['y_test_icd'], test_size = .05, random_state = 42, shuffle=True)


# Balance the data

After splitting the data, the data needs to be balanced due to the dataset imbalance. Datasets are both under and oversampled based on the 99th percentile of the largest CPT/ICD-9 code. This improves the final accuracy of the model.

In [24]:
def oversample_df(X_train, y_train, percentile):
            
    # Recombine the training dataset
    x = pd.Series(X_train).reset_index(drop=True)
    y = pd.Series(y_train).reset_index(drop=True)
    training_df = pd.concat([x,y], axis=1, ignore_index=True)

    # Check counts
    df_cts = training_df.iloc[:,1].value_counts()
    record_ct = round(np.percentile(df_cts, percentile))
    print('New Balanced Record Count per feature: {}'.format(record_ct))
    
    # Create a list of CPT values
    df = list(df_cts.index.values)

    # Resample
    minority_df = []
    for i in df:
        test_resampled = resample(training_df[training_df.iloc[:,1] == i], replace=True, n_samples=record_ct, random_state=123)
        minority_df.append(test_resampled)
    
    # Create final dataframe
    new_df = pd.concat(minority_df)
    
    return new_df

def balance(tt_dict, sections):
    for section in sections:
        print(section)
        
        # Running balance function
        training_balanced = oversample_df(tt_dict['X_train_' + section], tt_dict['y_train_' + section] , 99)
        
        # Reassign balanced data
        tt_dict['X_train_' + section] = np.array(training_balanced.iloc[:,0].values)
        tt_dict['y_train_' + section] = np.array(training_balanced.iloc[:,1].values)

# Run the functions
balance(tt_dict, sections)

Evaluation and management
New Balanced Record Count per feature: 3281
Surgery
New Balanced Record Count per feature: 1230
Medicine
New Balanced Record Count per feature: 8878
Radiology
New Balanced Record Count per feature: 306
other
New Balanced Record Count per feature: 122
icd
New Balanced Record Count per feature: 2397


# Tokenize the data

To run the NLP model, the data needs to be tokenized, which means the words need to be counted for each document. The vectorizer used is the TF-IDF vectorizer. This vectorizer is defined by this equation: TF(t,d) * IDF(t), where t is term frequency, d is the number of times term t appears in a document and IDF stands for the inverse document frequency. IDF is defined as log (1 + n/1 + df9d,t) + 1. N is the number of documents and df is the document frequency of t. There is one vectorizer made for each CPT/ICD-9 section. There are 6 sections total, including the ICD section. The hyperparameters were tuned using grid search in one of the draft notebooks.

In [25]:
# Define stop words
my_stop_words = list(set(stopwords.words('english'))) \
                + ['admission', 'date', 'sex'] \
                + ['needed', 'every', 'seen', 'weeks', 'please', 'ml', 'unit', 'small', 'year', 'old', 'cm', 'non', 'mm', 'however']
                # Got the above from my top 100 most predictive words that I wanted to remove

# Set Dictionary for vectorized words
vectorized_words = {}
    
def vectorize_df(train_test_dict, sections):

    for section in sections:
        
        # Import TfidfVectorizer
        vectorized_words['tfidf_vectorizer_' + section] = TfidfVectorizer(stop_words=my_stop_words, max_df=.7, min_df = 2, sublinear_tf = True, ngram_range = (1, 2))

        # Transform the training data
        tfidf_train = vectorized_words['tfidf_vectorizer_' + section].fit_transform(train_test_dict['X_train_' + section])

        # Transform the test data
        tfidf_test = vectorized_words['tfidf_vectorizer_' + section].transform(train_test_dict['X_test_' + section])

        # Add results to dictionary

        vectorized_words['tfidf_train_' + section] = tfidf_train
        vectorized_words['tfidf_test_' + section] = tfidf_test

vectorize_df(tt_dict, sections)   


# Logistic Regression

Logistic regression was chosen after considering 6 ML models in total, these included: Decision Tree, Random Forest, XG Boost, Multinomail Naive Bayes, KNN, and One vs Rest Logistic Regression. The hyperparameters were tuned in another notebook using gridsearch. 

In [26]:
from sklearn.linear_model import LogisticRegression

models_lr = {}

def run_clf(train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = LogisticRegression(random_state=123, C=1, max_iter=25, solver='sag', n_jobs=-1)
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_lr[section] = clf
    
run_clf(tt_dict, sections)



# Evaluation

This function checked the overall accuracy for each of the 6 models.

In [27]:
# models_ls = [models_nb, models_rf, models_xg, models_knn, models_lr, models_tree]
models_ls = [models_lr]

# Check accuracy
def predictions(tt_dict, models, section=[]):
    for key, model in models.items():
        if len(section) == 0 or key == section:
            pred = model.predict(vectorized_words['tfidf_test_' + key])
            print(key)
            print(metrics.accuracy_score(tt_dict['y_test_' + key], pred))

predictions(tt_dict, models_lr)

Evaluation and management
0.3234108527131783
Surgery
0.597340930674264
Medicine
0.836502493765586
Radiology
0.6872586872586872
other
0.8405797101449275
icd
0.8891304347826087


# Classification Report

The classification report can be run for each model to find the precision, recall, and f-1 score to measure the models performance.

In [91]:
# Create classification report taken from here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
print('Test')
section = 'other'
pred = models_lr[section].predict(vectorized_words['tfidf_test_' + section])
print(classification_report(tt_dict['y_test_' + section], pred))

# print('Training')
# pred_x = nb_classifier.predict(vectorized_words['tfidf_train'])
# print(classification_report(tt_dict['y_train_' + section], pred_x))

Test
              precision    recall  f1-score   support

       01996       0.89      0.94      0.92        52
       85060       0.80      0.57      0.67         7
       99141       0.67      0.50      0.57         4
       99144       0.50      0.50      0.50         6

    accuracy                           0.84        69
   macro avg       0.71      0.63      0.66        69
weighted avg       0.83      0.84      0.83        69



# Make Predictions

Predictions can be made using each model. The prediction probability was tested to be used in the front-end of the app.

In [29]:
def predict_cpt(text, section):
    input_text_clean = clean_data(pd.Series(text), False)
    tfidf_input_test = vectorized_words['tfidf_vectorizer_' + section].transform(input_text_clean)
    clf = models_lr[section]
    
    clf.predict(tfidf_input_test)
    print(clf.predict(tfidf_input_test), str(round(max(clf.predict_proba(tfidf_input_test)[0]) * 100,2)) + '%')
#     print(clf.predict_proba(tfidf_input_test)[0])

# Set Variables
txt_record_no = 1
section = 'icd'

# Run Function
predict_cpt(tt_dict['X_val_' + section][txt_record_no], section)

# print(np.array(tt_dict['y_val_' + section])[txt_record_no])

['0389'] 96.32%
0389


# Look at the Most/Least Predictive Features

The top 100 and bottom 100 features are saved into the features_df dataframe. These are saved to csv files and later used in the front-end of the app using streamlit.

In [234]:
def get_features(sections):

    # Initialize dataframe list
    df_ls = []
    
    for section in sections: 
    
        # Loop through for each class
        for index, class_ in enumerate(models_lr[section].classes_):
            
            print('Class ' + str(index) + ' & Section: ' + section)

            # Get the feature names
            feature_names = vectorized_words['tfidf_vectorizer_' + section].get_feature_names()

            # Get the probabilities
            # Source: # https://sebastiansauer.github.io/convert_logit2prob/ for converting odds to log odds
            probs = [np.exp(x)/(1 + np.exp(x)) for x in models_lr[section].coef_[index]]

            # Zip together the first CPT weights with feature names
            feat_with_weights =  sorted(zip(probs, feature_names))
            feat_with_weights_r = feat_with_weights[::-1]

            # Bottom 100 dataframe
            bottom_100 = pd.DataFrame(feat_with_weights[:100], columns = ['Prob','Features'])
            bottom_100['Class'] = models_lr[section].classes_[index]
            bottom_100['Direction'] = 'Bottom'
            bottom_100['Section'] = section

            # Top 100 dataframe
            top_100 = pd.DataFrame(feat_with_weights_r[:100], columns = ['Prob','Features'])
            top_100['Class'] = models_lr[section].classes_[index]
            top_100['Direction'] = 'Top'
            top_100['Section'] = section

            # Add dataframes to list
            df_ls.append(bottom_100)
            df_ls.append(top_100)

    return pd.concat(df_ls)

features_df = get_features(sections)

Class 0 & Section: Evaluation and management
Class 1 & Section: Evaluation and management
Class 2 & Section: Evaluation and management
Class 3 & Section: Evaluation and management
Class 4 & Section: Evaluation and management
Class 0 & Section: Surgery
Class 1 & Section: Surgery
Class 2 & Section: Surgery
Class 3 & Section: Surgery
Class 4 & Section: Surgery
Class 0 & Section: Medicine
Class 1 & Section: Medicine
Class 2 & Section: Medicine
Class 3 & Section: Medicine
Class 4 & Section: Medicine
Class 0 & Section: Radiology
Class 1 & Section: Radiology
Class 2 & Section: Radiology
Class 3 & Section: Radiology
Class 4 & Section: Radiology
Class 0 & Section: other
Class 1 & Section: other
Class 2 & Section: other
Class 3 & Section: other
Class 0 & Section: icd
Class 1 & Section: icd
Class 2 & Section: icd
Class 3 & Section: icd
Class 4 & Section: icd


In [244]:
# Save the features
features_df.to_csv('features_df.csv')

# Save the Models

Each model is saved for later use in the front-end Streamlit app.

In [153]:
# Taken from here: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
# save the model to disk

for key, model in models_lr.items():
    filename = 'finalized_model_' + key + '.sav'
    pickle.dump(model, open(filename, 'wb'))

# Save the Fitted Vectorizers

The vectorizers are saved for later use in the front-end Streamlit app.

In [239]:
for key, vectorizer in vectorized_words.items():
    if 'vectorizer' in key:
        filename = 'finalized_vectorizer_' + key + '.sav'
        pickle.dump(vectorizer, open(filename, 'wb'))

# Save the Text Sample Files

The text sample files are saved to use in the Streamlit app to generate examples of the model

In [248]:
for key, value in tt_dict.items():
    if 'X_val_' in key or 'y_val_' in key:
        pd.Series(value).to_csv(key + '.csv')