# Import Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import gzip
import pandas as pd
import glob
import string
from sklearn.utils import resample
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle


# Import the MIMIC data

In [2]:
dataset_dictionary = {}

for file_path in glob.glob('..\\Data\\MIMIC Files\*'):
    file_name = file_path.split('\\')[3].split('.')[0]
    with gzip.open(file_path, mode='r') as file:
        dataset_dictionary[file_name] = pd.read_csv(file)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Assign Datatypes

In [3]:
# Check all the datasets exist in the dictionary 
print(dataset_dictionary.keys())

# Check the datatypes and information for each table 
for i in dataset_dictionary.keys():
    print(dataset_dictionary[i].info())

# Correct any datatype issues #####

# CPTEVENTS
dataset_dictionary['CPTEVENTS'].loc[:,['SECTIONHEADER','CPT_CD']] = dataset_dictionary['CPTEVENTS'].loc[:,['SECTIONHEADER','CPT_CD']].astype(str)
dataset_dictionary['CPTEVENTS']['CHARTDATE'] = dataset_dictionary['CPTEVENTS']['CHARTDATE'].to_datetime()

dict_keys(['CPTEVENTS', 'DIAGNOSES_ICD', 'D_CPT', 'D_ICD_DIAGNOSES', 'D_ICD_PROCEDURES', 'NOTEEVENTS', 'PATIENTS', 'PROCEDURES_ICD'])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573146 entries, 0 to 573145
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ROW_ID            573146 non-null  int64  
 1   SUBJECT_ID        573146 non-null  int64  
 2   HADM_ID           573146 non-null  int64  
 3   COSTCENTER        573146 non-null  object 
 4   CHARTDATE         101545 non-null  object 
 5   CPT_CD            573146 non-null  object 
 6   CPT_NUMBER        573128 non-null  float64
 7   CPT_SUFFIX        22 non-null      object 
 8   TICKET_ID_SEQ     471601 non-null  float64
 9   SECTIONHEADER     573125 non-null  object 
 10  SUBSECTIONHEADER  573125 non-null  object 
 11  DESCRIPTION       101545 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 52.5+ MB
None
<class 'pandas.

AttributeError: 'Series' object has no attribute 'to_datetime'

In [12]:
# ------------------
    # -- QA
# ------------------

# Check that the ICD9 code column is distinct
print(len(set(dataset_dictionary['D_ICD_DIAGNOSES']['ICD9_CODE'])))
print(len(dataset_dictionary['D_ICD_DIAGNOSES']))

dataset_dictionary['D_ICD_DIAGNOSES']

14567
14567


Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...,...,...,...
14562,14432,V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
14563,14433,V740,Screening for cholera,Screening examination for cholera
14564,14434,V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
14565,14435,V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...


# Join the tables

In [146]:
def join_tables(dataset_dictionary, category=['Discharge summary'], all_notes=False):

    # Define tables
    note_events_base = dataset_dictionary['NOTEEVENTS']
    cpt_events_base = dataset_dictionary['CPTEVENTS']
    icd_events_base = dataset_dictionary['DIAGNOSES_ICD']
    icd_detail_base = dataset_dictionary['D_ICD_DIAGNOSES'].loc[:,['ICD9_CODE', 'LONG_TITLE']]

    # Combine text for each subject and encounter
    if all_notes == False:
        note_events_base = note_events_base[note_events_base.loc[:,'CATEGORY'].isin(category)]
    
    # Filter out the addendums and restrict notes only to reports
    note_events_base = note_events_base[note_events_base['DESCRIPTION'] == 'Report']
    
    # Aggregate text by Subject and HADM ID's
    note_events = note_events_base.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False)['TEXT'].agg(sum)
    
    # Create CPT table
    cpt_events_base = cpt_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'CPT_CD', 'SECTIONHEADER', 'DESCRIPTION']]
    cpt_events = cpt_events_base.drop_duplicates()
    
    # Create ICD table
    icd_events_base = icd_events_base[icd_events_base['SEQ_NUM'] == 1]
    icd_events_base = icd_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'ICD9_CODE']]
    icd_events = icd_events_base.drop_duplicates()
    
    # Join the datasets
    note_cpt = note_events.merge(cpt_events, on = ['SUBJECT_ID','HADM_ID'])\
                          .merge(icd_events, on = ['SUBJECT_ID', 'HADM_ID'])\
                          .merge(icd_detail_base, on = 'ICD9_CODE')
    note_icd = note_events.merge(icd_events, on = ['SUBJECT_ID', 'HADM_ID'])\
                          .merge(icd_detail_base, on = 'ICD9_CODE')
    
    # CPT - Replace any nulls with blanks
    x = note_cpt[note_cpt['DESCRIPTION'].isnull()].copy()
    x.loc[:,'DESCRIPTION'] = ''
    y = note_cpt[note_cpt['DESCRIPTION'].notnull()].copy()
    note_cpt = pd.concat([x,y])
    
    # CPT - Combine description and text columns
    note_cpt['TEXT'] = note_cpt['TEXT'] + note_cpt['DESCRIPTION']
    note_cpt = note_cpt.drop('DESCRIPTION', axis=1)
    
    # CPT - Combine ICD description with text for specific sections
    ls_to_add = ['Anesthesia','Pathology and laboratory', 'Radiology']
    ls_to_keep = list(set(note_cpt['SECTIONHEADER']).difference(set(ls_to_add)))
    
    note_cpt_add = note_cpt[note_cpt.SECTIONHEADER.isin(ls_to_add)]
    note_cpt_keep = note_cpt[note_cpt.SECTIONHEADER.isin(ls_to_keep)]
    note_cpt_add['TEXT'] = note_cpt_add['TEXT'] + ' ' + note_cpt_add['LONG_TITLE']
    note_cpt = pd.concat([note_cpt_add, note_cpt_keep])
        
    return note_cpt, note_icd

# Run the function
note_cpt, note_icd = join_tables(dataset_dictionary)

# Drop notes with the nan sectionheader
drop_ls = note_cpt[note_cpt['SECTIONHEADER'] == 'nan']
note_cpt = note_cpt.drop(drop_ls.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  note_cpt_add['TEXT'] = note_cpt_add['TEXT'] + ' ' + note_cpt_add['LONG_TITLE']


In [147]:
# ICD - Update column names
note_icd.columns = ['SUBJECT_ID', 'HADM_ID', 'CLINICAL_TEXT', 'ICD9_CODE', 'TEXT']

# Filter the data - CPT

In [208]:
def filter_df(combined_df, threshold):

    # Print value counts original
    print('Value Counts for the original data:\n\n', combined_df['CPT_CD'].value_counts().head(25))

    # Filter based on count limit
    df = combined_df['CPT_CD'].value_counts()
    filtered_ls = list((df[df >= threshold]).index.values)
    filtered_df = combined_df[combined_df['CPT_CD'].isin(filtered_ls)]
    
    # Print value counts filtered
    print('Value Counts for the filtered data:\n\n', combined_df['CPT_CD'].value_counts().head(25))

    return filtered_df

# Find Counts of CPT Codes per Patient Encounter and filter df
def cpt_count_filter(df, og_df, cpt_section_hadm_limit, cpt_hadm_limit):
    
    # Filter based on limit per SECTIONHEADER & merge
    df1 = og_df.groupby(['HADM_ID', 'SECTIONHEADER'])['CPT_CD'].count()
    filtered_encntrs = df1[df1 <= cpt_section_hadm_limit]
    final_df = df.merge(filtered_encntrs, on=['HADM_ID', 'SECTIONHEADER'])
    final_df.drop('CPT_CD_y', axis=1, inplace=True)
    
    # Filter dataset again based on total number of CPT codes per HADM
    df2 = og_df.groupby(['HADM_ID'])['CPT_CD'].count()
    filtered_encntrs = df2[df2 <= cpt_hadm_limit]
    final_df = final_df.merge(filtered_encntrs, on=['HADM_ID'])
    final_df.drop('CPT_CD', axis=1, inplace=True)
    
    # Rename columns
    final_df.columns = ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'CPT_CD', 'SECTIONHEADER', 'ICD9_CODE', 'LONG_TITLE']
    
    print('Value Counts for the filtered data:\n\n', final_df['CPT_CD'].value_counts().head(50))

    return final_df
    
# Filter DataFrame to a set amount of CPT codes in each section header #####
def cpt_per_section_filter(df, section_limit, sections=['Emerging technology'], all_sections=False):
    
    # Create list for dataframes
    df_ls = []

    # Group by and count the number of CPT codes
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    cts_by_cpt.index.names = ['SECTIONHEADER', 'CPT_CDS']
    cts_by_cpt = cts_by_cpt.reset_index()
    cts_by_cpt.columns = ['SECTIONHEADER', 'CPT_CD', 'COUNT']

    # Sort values by section and CPT code count
    cts_by_cpt_s = cts_by_cpt.sort_values(by=['SECTIONHEADER','COUNT'], ascending=False)

    # Filter based on the limit of CPT codes wanted for each category
    if all_sections == True:
        sections = list(set(df['SECTIONHEADER']))
        
    for i in sections:
        top_cts = cts_by_cpt_s[cts_by_cpt_s['SECTIONHEADER'] == i].iloc[:section_limit,:]

        # Append to list
        df_ls.append(top_cts)

    # Combine DataFrames
    df_combo = pd.concat(df_ls)

    # Join back to source data
    final_df = df.merge(df_combo, on=['SECTIONHEADER','CPT_CD'])
    
    print('\nThe length of the initial dataset was {} and the new dataset is {}\n\n'.format(len(df), len(final_df)))

    return final_df

def show_section_counts(df):
    # Print count of CPT codes by section
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    cts_by_section = cts_by_cpt.groupby('SECTIONHEADER').count()
    print('\nHere are the counts by section:\n\n', cts_by_section)

def show_cpt_counts_by_section(df):
    cts_by_cpt = df.groupby(['SECTIONHEADER', 'CPT_CD'])['CPT_CD'].count()
    print('\nHere are the counts by CPT by section\n\n:')
#     print(pd.DataFrame(cts_by_cpt.rename('Count')).reset_index())
    for i, x in pd.DataFrame(cts_by_cpt.rename('Count')).reset_index().iterrows():
        print(x['SECTIONHEADER'], x['CPT_CD'], x['Count'])
        
def filter_cpt_ct(df, section, ct):
    df = df[df['SECTIONHEADER'] == section]
    top_codes = list(df['CPT_CD'].value_counts().head(ct).index)
    df = df[df.CPT_CD.isin(top_codes)]
    return df

In [209]:
# Filter the number of CPT occurrences by section and CPT
filtered_df1 = cpt_count_filter(note_cpt, note_cpt, 2, 10)

# Show available sections
print('Available sections:\n\n', set(filtered_df1['SECTIONHEADER']))

# Filter to a set amount of CPT codes for each section header - only here to make the code run faster when testing
filtered_df2 = cpt_per_section_filter(filtered_df1, 5, all_sections=True)

# Filter to those CPT codes that have at least 100 notes or more
filtered_df_cpt = filter_df(filtered_df2, 10)

# Filter total number of CPT codes in the dataset by section
# filtered_df_cpt = filter_cpt_ct(filtered_df_cpt, 'Evaluation and management', 3)

# Show some dataset stats
show_section_counts(filtered_df_cpt)
show_cpt_counts_by_section(filtered_df_cpt)

Value Counts for the filtered data:

 94003    12672
94002     8100
99291     4716
99232     2361
99233     1901
36556     1765
99254     1151
99231     1067
99223     1021
90935     1014
99222      961
99253      852
99255      689
36620      650
99238      476
31624      457
76942      437
33405      435
76937      350
31645      328
99252      327
99292      310
99239      308
31622      302
99221      264
90801      236
62270      223
31500      192
01996      178
99024      168
90945      153
90937      149
36489      142
32002      139
49080      127
61312      124
99251      108
43246      108
33427      107
93503      104
31600      104
33430       92
33533       91
92960       90
47135       89
33860       86
54150       84
32422       81
27245       80
32000       74
Name: CPT_CD, dtype: int64
Available sections:

 {'Medicine', 'Pathology and laboratory', 'Radiology', 'Surgery', 'Emerging technology', 'Evaluation and management', 'Anesthesia'}

The length of the initial datas

In [210]:
filtered_df_cpt[['SECTIONHEADER','CPT_CD']].value_counts().sort_index()

SECTIONHEADER              CPT_CD
Anesthesia                 01996       178
                           99141        22
                           99144        19
Evaluation and management  99231      1067
                           99232      2361
                           99233      1901
                           99254      1151
                           99291      4716
Medicine                   90801       236
                           90935      1014
                           94002      8100
                           94003     12672
                           99024       168
Pathology and laboratory   85060        22
Radiology                  75940        32
                           75989        19
                           76604        55
                           76937       350
                           76942       437
Surgery                    31624       457
                           31645       328
                           33405       435
                    

# Filter the Data - ICD

In [218]:
def filter_df_icd(df, threshold):

    x = df['ICD9_CODE'].value_counts() > threshold
    y = list(x[x == 1].index)

    z = df[df['ICD9_CODE'].isin(y)]

    return z

def filter_icd_ct(df,ct):
    top_codes = list(df['ICD9_CODE'].value_counts().head(ct).index)
    df = df[df.ICD9_CODE.isin(top_codes)]
    return df

filtered_df_icd = filter_df_icd(note_icd, 100)
filtered_df_icd = filter_icd_ct(filtered_df_icd, 5)

In [219]:
filtered_df_icd

Unnamed: 0,SUBJECT_ID,HADM_ID,CLINICAL_TEXT,ICD9_CODE,TEXT
0,3,145834.0,Admission Date: [**2101-10-20**] Discharg...,0389,Unspecified septicemia
1,33,176176.0,Admission Date: [**2116-12-23**] Discharg...,0389,Unspecified septicemia
2,85,112077.0,Admission Date: [**2167-7-25**] ...,0389,Unspecified septicemia
3,111,155897.0,Admission Date: [**2144-7-1**] D...,0389,Unspecified septicemia
4,112,173177.0,Admission Date: [**2196-9-27**] ...,0389,Unspecified septicemia
...,...,...,...,...,...
15082,32588,161808.0,Unit No: [**Numeric Identifier 74734**]\nAdmi...,V3001,"Single liveborn, born in hospital, delivered b..."
15083,32596,112009.0,Admission Date: [**2184-11-1**] Dischar...,V3001,"Single liveborn, born in hospital, delivered b..."
15084,32603,182063.0,Unit No: [**Numeric Identifier 75844**]\nAdmi...,V3001,"Single liveborn, born in hospital, delivered b..."
15085,32641,168336.0,Admission Date: [**2152-10-19**] Discha...,V3001,"Single liveborn, born in hospital, delivered b..."


# Clean the data

In [220]:
def clean_data(text_series, remove_sections):
  
    # lowercase all letters
    text_series = text_series.str.lower() 

    # Remove topics
    if remove_sections:
        data = text_series.str.split(r'(\n\n)')
        for row_num, value in enumerate(data):
            text_chunks = [x.split(':', maxsplit=1) for x in value]
            ls = []
            for i, x in enumerate(text_chunks):
                try:
                    if x[0] != 'social history' or x[0] != 'family history' or 'medication' not in x[0]:
                        ls.append(x[1])
                except:
                    continue

            text_series.iloc[row_num] = ' '.join(ls)

        
    # Remove dates and locations
    text_series = text_series.str.replace('\[\*\*(.*?)\*\*\]', ' ', regex=True)
    
    # Replace \n 
    text_series = text_series.str.replace('\\n',' ', regex=True)  
    
    # Replace punctuation
    text_series = text_series.str.replace('[' + string.punctuation + ']', ' ', regex=True)
    
    # Remove all digits
    text_series = text_series.str.replace('\d',' ', regex=True)
    
    # Replace plurals, endings with ing, endings with ed, endings with ly
#     text_series = text_series.str.replace('s(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ing(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ed(?=\s)', ' ', regex=True)
#     text_series = text_series.str.replace('ly(?=\s)', ' ', regex=True)
    
    return text_series

# Update Text Column -----
# filtered_df_cpt['TEXT'] = clean_data(filtered_df_cpt['TEXT'], True)
filtered_df_icd['TEXT'] = clean_data(filtered_df_icd['TEXT'], False)
filtered_df_icd['CLINICAL_TEXT'] = clean_data(filtered_df_icd['CLINICAL_TEXT'], True)


# Label Encode the Predictors

In [44]:
le = preprocessing.LabelEncoder()
filtered_df['CPT_CD'] = le.fit_transform(filtered_df['CPT_CD'])

# Select CPT sections to Run

In [221]:
sections = list(set(filtered_df_cpt['SECTIONHEADER']))
print(sections)
sections = []
print(sections)

['Medicine', 'Pathology and laboratory', 'Radiology', 'Surgery', 'Evaluation and management', 'Anesthesia']
[]


# Split the Data - CPT

In [216]:
tt_dict = {} 

def split_stratify_df(df, sections, combine_others=False, all_=False):
    
    # Train test split for each section selected
    if all_:
        sections.clear()
    else:
        for i in sections:
            
            # Test and training
            df_x = df[df['SECTIONHEADER'] == i]['TEXT'].values
            df_y = df[df['SECTIONHEADER'] == i]['CPT_CD']
            tt_dict['X_train_' + i], tt_dict['X_test_' + i], tt_dict['y_train_' + i], tt_dict['y_test_' + i], \
            tt_dict['index_train_' + i], tt_dict['index_test_' + i] = \
            train_test_split(df_x, df_y, range(len(df_y)), test_size = .3, random_state = 42, shuffle=True)
            
            # Validation
            tt_dict['X_test_' + i], tt_dict['X_val_' + i], tt_dict['y_test_' + i], tt_dict['y_val_' + i], \
            tt_dict['index_test_' + i], tt_dict['index_val_' + i] = \
            train_test_split(tt_dict['X_test_' + i], tt_dict['y_test_' + i], range(len(tt_dict['y_test_' + i])), \
                             test_size = .05, random_state = 42, shuffle=True)
            
    # Group other sections not included in selection
    if combine_others:
        i = 'other'
        other_sections = list(set(df['SECTIONHEADER']).difference(set(sections)))
        sections.append(i)
        df_x = df[df['SECTIONHEADER'].isin(other_sections)]['TEXT'].values
        df_y = df[df['SECTIONHEADER'].isin(other_sections)]['CPT_CD']
        
        # Test and training
        tt_dict['X_train_' + i], tt_dict['X_test_' + i], tt_dict['y_train_' + i], tt_dict['y_test_' + i], \
        tt_dict['index_train_' + i], tt_dict['index_test_' + i] = \
        train_test_split(df_x, df_y, range(len(df_y)), test_size = .3, random_state = 42, shuffle=True)
        
        # Validation
        tt_dict['X_test_' + i], tt_dict['X_val_' + i], tt_dict['y_test_' + i], tt_dict['y_val_' + i], \
        tt_dict['index_test_' + i], tt_dict['index_val_' + i] = \
        train_test_split(tt_dict['X_test_' + i], tt_dict['y_test_' + i], range(len(tt_dict['y_test_' + i])), \
                         test_size = .05, random_state = 42, shuffle=True)
        
split_stratify_df(filtered_df_cpt, sections)

# Select ICD sections to run

In [222]:
sections.append('icd')
print(sections)

['icd']


In [259]:
filtered_df_icd['TEXT'] = (filtered_df_icd['TEXT'] + ' ' + filtered_df_icd['CLINICAL_TEXT'])

In [261]:
filtered_df_icd['TEXT'][0]

'unspecified septicemia         discharge date              sex   m   medicine   admitted from rehabilitation for hypotension  systolic blood pressure to the   s  and decreased urine output    the patient is a    year old male who had been hospitalized at the   from   through   of   after undergoing a left femoral at bypass graft and was subsequently discharged to a rehabilitation facility        coronary artery disease with diffuse   vessel disease  right dominant  status post proximal left circumflex stent in   with occlusion of the distal left circumflex  status post right coronary artery stent on    no percutaneous coronary intervention to     diagonal left circumflex      small proximal left anterior descending artery  or     small distal left anterior descending artery        congestive heart failure  with an ejection fraction of     to            type   diabetes with neuropathy       hypertension       diverticulosis  found on colonoscopy in          alzheimer s dementia       h

# Split the Data - ICD

In [262]:
# Training and test
tt_dict['X_train_icd'], tt_dict['X_test_icd'] , tt_dict['y_train_icd'], tt_dict['y_test_icd'] = \
train_test_split(filtered_df_icd['TEXT'].values, filtered_df_icd['ICD9_CODE'], test_size = .3, random_state = 42, shuffle=True)

# Test and Validation
tt_dict['X_test_icd'], tt_dict['X_val_icd'] , tt_dict['y_test_icd'], tt_dict['y_val_icd'] = \
train_test_split(tt_dict['X_test_icd'], tt_dict['y_test_icd'], test_size = .05, random_state = 42, shuffle=True)


# Balance the data

In [263]:
def oversample_df(X_train, y_train, percentile):
            
    # Recombine the training dataset
    x = pd.Series(X_train).reset_index(drop=True)
    y = pd.Series(y_train).reset_index(drop=True)
    training_df = pd.concat([x,y], axis=1, ignore_index=True)

    # Check counts
    df_cts = training_df.iloc[:,1].value_counts()
    record_ct = round(np.percentile(df_cts, percentile))
    print('New Balanced Record Count per feature: {}'.format(record_ct))
    
    # Create a list of CPT values
    df = list(df_cts.index.values)

    # Resample
    minority_df = []
    for i in df:
        test_resampled = resample(training_df[training_df.iloc[:,1] == i], replace=True, n_samples=record_ct, random_state=123)
        minority_df.append(test_resampled)
    
    # Create final dataframe
    new_df = pd.concat(minority_df)
    
    return new_df

def balance(tt_dict, sections):
    for section in sections:
        print(section)
        
        # Running balance function
        training_balanced = oversample_df(tt_dict['X_train_' + section], tt_dict['y_train_' + section] , 99)
        
        # Reassign balanced data
        tt_dict['X_train_' + section] = np.array(training_balanced.iloc[:,0].values)
        tt_dict['y_train_' + section] = np.array(training_balanced.iloc[:,1].values)

# Run the functions
balance(tt_dict, sections)

icd
New Balanced Record Count per feature: 2372


# Tokenize the data

In [264]:
# Define stop words
my_stop_words = list(set(stopwords.words('english'))) \
                + ['admission', 'date', 'sex'] \
                + ['needed', 'every', 'seen', 'weeks', 'please', 'ml', 'unit', 'small', 'year', 'old', 'cm', 'non', 'mm', 'however']
                # Got the above from my top 100 most predictive words that I wanted to remove

# Set Dictionary for vectorized words
vectorized_words = {}
    
def vectorize_df(train_test_dict, sections):

    for section in sections:
        
        # Import TfidfVectorizer
        vectorized_words['tfidf_vectorizer_' + section] = TfidfVectorizer(stop_words=my_stop_words, max_df=.7, min_df = 2, sublinear_tf = True, ngram_range = (1, 2))

        # Transform the training data
        tfidf_train = vectorized_words['tfidf_vectorizer_' + section].fit_transform(train_test_dict['X_train_' + section])

        # Transform the test data
        tfidf_test = vectorized_words['tfidf_vectorizer_' + section].transform(train_test_dict['X_test_' + section])

        # Add results to dictionary

        vectorized_words['tfidf_train_' + section] = tfidf_train
        vectorized_words['tfidf_test_' + section] = tfidf_test

vectorize_df(tt_dict, sections)   


# Logistic Regression

In [265]:
from sklearn.linear_model import LogisticRegression

models_lr = {}

def run_clf(train_test_dict, sections):

    # Fit and check accuracy
    for section in sections:
        
        # Use Naive Bayes model
        clf = LogisticRegression(random_state=123, C=1, max_iter=25, solver='sag', n_jobs=-1)
        
        # Fit model
        clf.fit(vectorized_words['tfidf_train_' + section], train_test_dict['y_train_' + section])
        
        # Save in dictionary
        models_lr[section] = clf
    
run_clf(tt_dict, sections)

# Evaluation

In [266]:
# models_ls = [models_nb, models_rf, models_xg, models_knn, models_lr, models_tree]
models_ls = [models_lr]

# Check accuracy
def predictions(tt_dict, models, section=[]):
    for key, model in models.items():
        if len(section) == 0 or key == section:
            pred = model.predict(vectorized_words['tfidf_test_' + key])
            print(key)
            print(metrics.accuracy_score(tt_dict['y_test_' + key], pred))

predictions(tt_dict, models_lr)

icd
0.9605072463768116


# Classification Report

In [267]:
# Create classification report taken from here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
print('Test')
section = 'icd'
pred = models_lr[section].predict(vectorized_words['tfidf_test_' + section])
print(classification_report(tt_dict['y_test_' + section], pred))

print('Training')
pred_x = models_lr[section].predict(vectorized_words['tfidf_train_' + section])
print(classification_report(tt_dict['y_train_' + section], pred_x))

Test
              precision    recall  f1-score   support

        0389       0.97      0.99      0.98       573
       41071       0.91      0.94      0.92       480
       41401       0.98      0.93      0.96      1012
        4241       0.92      0.98      0.95       320
       V3001       1.00      1.00      1.00       375

    accuracy                           0.96      2760
   macro avg       0.96      0.97      0.96      2760
weighted avg       0.96      0.96      0.96      2760

Training
              precision    recall  f1-score   support

        0389       1.00      1.00      1.00      2372
       41071       1.00      1.00      1.00      2372
       41401       1.00      0.99      1.00      2372
        4241       1.00      1.00      1.00      2372
       V3001       1.00      1.00      1.00      2372

    accuracy                           1.00     11860
   macro avg       1.00      1.00      1.00     11860
weighted avg       1.00      1.00      1.00     11860



# Make Predictions

In [291]:
def predict_cpt(text, section):
    input_text_clean = clean_data(pd.Series(text), False)
    print(input_text_clean)
    tfidf_input_test = vectorized_words['tfidf_vectorizer_' + section].transform(input_text_clean)
    clf = models_lr[section]
    
    clf.predict(tfidf_input_test)
    print(clf.predict(tfidf_input_test), str(round(max(clf.predict_proba(tfidf_input_test)[0]) * 100,2)) + '%')
#     print(clf.predict_proba(tfidf_input_test)[0])

# Set Variables
txt_record_no = 0
section = 'icd'
# section_text = 'Anesthesia'

# Run Function
predict_cpt(x, section)

# print(np.array(tt_dict['y_val_' + section_text])[txt_record_no])

0                     discharge date               ...
dtype: object
['41071'] 74.95%


In [290]:
row = 7000
# filtered_df_icd = filtered_df_icd.reset_index()
print(filtered_df_icd['ICD9_CODE'][row])
x = filtered_df_icd['CLINICAL_TEXT'][row]
x

41071


'                 discharge date                      sex    m  medicine  penicillins   iodine  iodine containing   chief complaint  nstemi  cardiac catheterization  diagnostic  tooth extraction        mr    is a    y o male with a h o rheumatic heart disase  severe as  htn  and hyperlipidemia who presented initially to an osh ed with severe sob and was then transferred to the   ed when there was a concern for possible stemi  of note  the pt recently had left carotid stenting on    post procedure  he was instructed to keep his bp          taking sudafed at home if his bp decreased  about     days after his procedure  he noted increasing sob and doe  he denied any cp  he presented to   ed with the above complaints  he was started on a nitro gtt and given combivent nebs  lasix    iv x    solumedrol     mg iv x    ativan   mg iv x    morphine   mg iv x    asa     mg po  and lopressor   mg iv x    because of his respiratory distress  he was placed on bipap       a non contrast ct scan was 

In [279]:
filtered_df_icd

Unnamed: 0,SUBJECT_ID,HADM_ID,CLINICAL_TEXT,ICD9_CODE,TEXT
0,3,145834.0,discharge date sex m ...,0389,unspecified septicemia discharge date ...
1,33,176176.0,discharge date patient is an ...,0389,unspecified septicemia discharge date ...
2,85,112077.0,discharge date ...,0389,unspecified septicemia discha...
3,111,155897.0,discharge date ...,0389,unspecified septicemia discha...
4,112,173177.0,discharge date medicine...,0389,unspecified septicemia discha...
...,...,...,...,...,...
15082,32588,161808.0,admission date discharge date date o...,V3001,single liveborn born in hospital delivered b...
15083,32596,112009.0,discharge date sex f...,V3001,single liveborn born in hospital delivered b...
15084,32603,182063.0,admission date discharge date date o...,V3001,single liveborn born in hospital delivered b...
15085,32641,168336.0,discharge date sex f...,V3001,single liveborn born in hospital delivered b...


# Look at the Most/Least Predictive Features

In [234]:
def get_features(sections):

    # Initialize dataframe list
    df_ls = []
    
    for section in sections: 
    
        # Loop through for each class
        for index, class_ in enumerate(models_lr[section].classes_):
            
            print('Class ' + str(index) + ' & Section: ' + section)

            # Get the feature names
            feature_names = vectorized_words['tfidf_vectorizer_' + section].get_feature_names()

            # Get the probabilities
            # Source: # https://sebastiansauer.github.io/convert_logit2prob/ for converting odds to log odds
            probs = [np.exp(x)/(1 + np.exp(x)) for x in models_lr[section].coef_[index]]

            # Zip together the first CPT weights with feature names
            feat_with_weights =  sorted(zip(probs, feature_names))
            feat_with_weights_r = feat_with_weights[::-1]

            # Bottom 100 dataframe
            bottom_100 = pd.DataFrame(feat_with_weights[:100], columns = ['Prob','Features'])
            bottom_100['Class'] = models_lr[section].classes_[index]
            bottom_100['Direction'] = 'Bottom'
            bottom_100['Section'] = section

            # Top 100 dataframe
            top_100 = pd.DataFrame(feat_with_weights_r[:100], columns = ['Prob','Features'])
            top_100['Class'] = models_lr[section].classes_[index]
            top_100['Direction'] = 'Top'
            top_100['Section'] = section

            # Add dataframes to list
            df_ls.append(bottom_100)
            df_ls.append(top_100)

    return pd.concat(df_ls)

features_df = get_features(sections)

Class 0 & Section: Evaluation and management
Class 1 & Section: Evaluation and management
Class 2 & Section: Evaluation and management
Class 3 & Section: Evaluation and management
Class 4 & Section: Evaluation and management
Class 0 & Section: Surgery
Class 1 & Section: Surgery
Class 2 & Section: Surgery
Class 3 & Section: Surgery
Class 4 & Section: Surgery
Class 0 & Section: Medicine
Class 1 & Section: Medicine
Class 2 & Section: Medicine
Class 3 & Section: Medicine
Class 4 & Section: Medicine
Class 0 & Section: Radiology
Class 1 & Section: Radiology
Class 2 & Section: Radiology
Class 3 & Section: Radiology
Class 4 & Section: Radiology
Class 0 & Section: other
Class 1 & Section: other
Class 2 & Section: other
Class 3 & Section: other
Class 0 & Section: icd
Class 1 & Section: icd
Class 2 & Section: icd
Class 3 & Section: icd
Class 4 & Section: icd


In [244]:
# Save the features
features_df.to_csv('features_df.csv')

# Save the Models

In [153]:
# Taken from here: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
# save the model to disk

for key, model in models_lr.items():
    filename = 'finalized_model_' + key + '.sav'
    pickle.dump(model, open(filename, 'wb'))

#     # some time later...

#     # load the model from disk
#     loaded_model = pickle.load(open(filename, 'rb'))
#     result = loaded_model.score(X_test, Y_test)
#     print(result)

# Save the Fitted Vectorizers

In [239]:
for key, vectorizer in vectorized_words.items():
    if 'vectorizer' in key:
        filename = 'finalized_vectorizer_' + key + '.sav'
        pickle.dump(vectorizer, open(filename, 'wb'))

# Save the Validation Files

In [248]:
for key, value in tt_dict.items():
    if 'X_val_' in key or 'y_val_' in key:
        pd.Series(value).to_csv(key + '.csv')