# Changes from v10
* Using sentence tokenize as an initial pass in order to try and filter to the most pertinent information


# Import the MIMIC data

In [38]:
import gzip
import pandas as pd
import glob

dataset_dictionary = {}

for file_path in glob.glob('.\\Data\\MIMIC Files\*'):
    file_name = file_path.split('\\')[3].split('.')[0]
    with gzip.open(file_path, mode='r') as file:
        dataset_dictionary[file_name] = pd.read_csv(file)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Join the tables

In [39]:
# Create dataset to join together -----

# Create note_events table -----

# Combine text for each subject and encounter
note_events_base = dataset_dictionary['NOTEEVENTS'][dataset_dictionary['NOTEEVENTS'].loc[:,'CATEGORY'] == 'Discharge summary']
note_events = note_events_base.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False)['TEXT'].agg(sum)

# Create CPT table -----

cpt_events_base = dataset_dictionary['CPTEVENTS']
cpt_events_base = cpt_events_base[cpt_events_base['TICKET_ID_SEQ'] == 1]
cpt_events_base = cpt_events_base.loc[:, ['SUBJECT_ID','HADM_ID', 'CPT_CD']]
cpt_events = cpt_events_base.drop_duplicates()
cpt_events

# Join the datasets -----

note_cpt = note_events.merge(cpt_events, on = ['SUBJECT_ID','HADM_ID'])
# print(note_cpt.shape, note_events.shape, cpt_events.shape) # (223,150, 4) (52,726, 3) (227,510, 3)

# Filter the data to CPT over 200 samples + Resample

In [153]:
import numpy as np
from sklearn.utils import resample

# Value Counts
print(note_cpt['CPT_CD'].astype(str).value_counts())

# Filter to CPT with over 200 notes
df = note_cpt['CPT_CD'].astype(str).value_counts()
top_200 = list((df[df > 200]).index.values)
note_cpt_4 = note_cpt[note_cpt['CPT_CD'].astype(str).isin(top_200)]

# Resample minority groups -----

# Remove largest group
top_200.remove('99291')
'99291' in top_200

# minority_ls = top_200
minority_ls = ['99223','99222','99254']

minority_df = []
for i in minority_ls:
    test_resampled = resample(note_cpt[note_cpt['CPT_CD'].astype(str) == i], replace=True, n_samples=7860, random_state=123)
    minority_df.append(test_resampled)

minority_df.append(note_cpt[note_cpt['CPT_CD'].astype(str) == '99291'])
new_df = pd.concat(minority_df)

new_df['CPT_CD'] = new_df['CPT_CD'].astype(str)

99291    7860
99223    2851
99222    1736
99254    1242
99255     882
         ... 
62272       1
50547       1
53215       1
39561       1
63082       1
Name: CPT_CD, Length: 707, dtype: int64


# Check for Imbalance

In [154]:
# import matplotlib.pyplot as plt

# plt.hist(note_cpt['CPT_CD'].astype(str))
# plt.show()

new_df['CPT_CD'].value_counts()

99291    7860
99222    7860
99254    7860
99223    7860
Name: CPT_CD, dtype: int64

# Sentence tokenizer to restructure dataframe

In [155]:
from nltk.tokenize import sent_tokenize

# pd.concat([test['column 1'].str.split(',', expand=True), test['predictive column']], axis=1).melt(id_vars='predictive column').drop('variable', axis=1)

# Source: https://stackoverflow.com/questions/33098040/how-to-use-word-tokenize-in-data-frame

def expand_dataframe_by_sent(dataframe, text_col_name, pred_col_name):
    
    # # Step 1: Remove all commas
    dataframe[text_col_name] = dataframe[text_col_name].str.replace(',',' ')

    # # Step 2: Sentence tokenize and convert into a large string
    dataframe[text_col_name] = dataframe.apply(lambda row: sent_tokenize(row[text_col_name]), axis=1)

    # # Step 3 + 4: Split into columns & concatenate with original data
    updated_df = pd.concat([dataframe[text_col_name].astype(str).str.split(',', expand=True), dataframe[pred_col_name]], axis=1).melt(id_vars=pred_col_name, value_name=text_col_name).drop('variable', axis=1)
    
    updated_df.dropna(inplace=True)
    
    return updated_df

new_df_1 = expand_dataframe_by_sent(new_df, 'TEXT', 'CPT_CD')
new_df_1

Unnamed: 0,CPT_CD,TEXT
0,99223,['Admission Date: [**2155-4-15**] ...
1,99223,['Admission Date: [**2173-5-7**] ...
2,99223,['Admission Date: [**2176-9-22**] ...
3,99223,['Admission Date: [**2173-5-18**] ...
4,99223,['Admission Date: [**2116-5-14**] ...
...,...,...
19076261,99291,'The office number is [**Telephone/Fax (1) 49...
19107701,99291,'2.'
19139141,99291,'Follow up appointment with Dr. [**First Name...
19170581,99291,'Office phone number is [**Telephone/Fax (1) ...


# Filter the data

In [158]:
import string

def clean_data(text_series):
    
    # Replace \n 
    text_series = text_series.str.replace('\\n',' ', regex=True)    

    # Remove dates and locations
    text_series = text_series.str.replace('\[\*\*(.*?)\*\*\]', ' ', regex=True)
    
#     # Remove topics
#     data = text_series.str.split('([A-Z\s]+:)')
#     for row_num, value in enumerate(data):
#         text_chunks = [x.strip().replace(':','').replace('\n', '') for x in value]
#         for i, x in enumerate(text_chunks):
#             if 'MEDICATION' in x or 'SOCIAL HISTORY' in x or 'FAMILY HISTORY' in x:
#                 text_chunks[i] = ' '
#                 try:
#                     text_chunks[i + 1] = ' '
#                 except:
#                     continue

#         text_series.iloc[row_num] = ' '.join(text_chunks)
    
    # Replace punctuation
    text_series = text_series.str.replace('[' + string.punctuation + ']', ' ', regex=True)
    
    # Convert to lowercase 
    text_series = text_series.str.lower()
    
    # Remove all digits
    text_series = text_series.str.replace('\d',' ', regex=True)
    
    # Replace plurals, endings with ing, endings with ed, endings with ly
    text_series = text_series.str.replace('s(?=\s)', ' ', regex=True)
    text_series = text_series.str.replace('ing(?=\s)', ' ', regex=True)
    text_series = text_series.str.replace('ed(?=\s)', ' ', regex=True)
    text_series = text_series.str.replace('ly(?=\s)', ' ', regex=True)
    
    return text_series

# Update Text Column

new_df_1.loc[:, 'TEXT'] = clean_data(new_df_1['TEXT']).values

# Shuffle the Data

In [160]:
new_df_1 = new_df_1.sample(n = len(new_df_1), random_state = 42)

# Split the Data

In [200]:
# Import Packages -----

from nltk.corpus import stopwords

my_stop_words = list(set(stopwords.words('english'))) \
                + ['admission', 'date', 'sex'] \
                + ['needed', 'every', 'seen', 'weeks', 'please', 'ml', 'unit', 'small', 'year', 'old', 'cm', 'non', 'mm', 'however']
                # Got the above from my top 100 most predictive words that I wanted to remove

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the data -----

X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(new_df_1['TEXT'].values, new_df_1['CPT_CD'].astype(str), range(len(new_df_1['CPT_CD'])), test_size = .8, random_state = 42)

# Tokenize the data

In [201]:
# Tokenize the data -----

# Import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=my_stop_words, min_df = 3, max_df = .7, sublinear_tf=True)

# Transform the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
tfidf_test = tfidf_vectorizer.transform(X_test)

# Run Naive Bayes

In [202]:
# Use Naive Bayes model -----

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_classifier = MultinomialNB(alpha=.7)

# Fit and check accuracy
nb_classifier.fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

# Remove Rows That Didn't Predict Correctly

In [215]:
misclassified = []

# Make a list of indices in y_test that contain the misclassified images
for index, value in enumerate(pred):
    if value != y_test.values[index]:
        misclassified.append(index)




new_df_2 = new_df_1.copy(deep=True)


In [207]:
misclassified

[0,
 1,
 3,
 4,
 7,
 9,
 10,
 11,
 14,
 15,
 17,
 18,
 22,
 24,
 25,
 27,
 29,
 30,
 31,
 32,
 33,
 35,
 36,
 38,
 40,
 41,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 54,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 69,
 71,
 74,
 75,
 77,
 80,
 81,
 82,
 83,
 85,
 86,
 87,
 89,
 93,
 95,
 96,
 97,
 98,
 99,
 101,
 102,
 103,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 113,
 114,
 119,
 121,
 122,
 123,
 124,
 125,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 140,
 141,
 144,
 145,
 146,
 147,
 148,
 149,
 152,
 153,
 154,
 156,
 157,
 159,
 161,
 162,
 163,
 164,
 166,
 167,
 169,
 172,
 173,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 184,
 185,
 187,
 189,
 190,
 194,
 195,
 196,
 197,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 220,
 221,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 233,
 234,
 235,
 237,
 239,
 240,
 241,
 243,
 244,
 246,
 247,
 249,
 250,
 251,
 255,
 256,
 259,
 26

In [None]:
# # misclassified
drop_list = []

for i in misclassified:
    drop_list.append(index_test[i])

# new_df_3 = new_df_2.reindex(index_test)

drop_list

count = 0
# new_df_2.drop([2903707], axis=0)
for i in drop_list:
    try:
        new_df_1.drop(i, axis=0)
        count += 1
    except:
        continue
count

In [None]:
pd.DataFrame

In [191]:
new_df_2.reindex([3829675, 555380])

Unnamed: 0,CPT_CD,TEXT
3829675,,
555380,99254.0,hypercholesterolemia


# Tune NB Model

In [18]:
import numpy as np


def hyperparam_tuning(tfidf_train, y_train, tfidf_test, y_test, nb_classifier):
    for i in np.arange(0,1.1,.1):
        nb_classifier = MultinomialNB()
        nb_classifier.fit(tfidf_train, y_train)
        pred = nb_classifier.predict(tfidf_test)
        print(i)
        print(metrics.accuracy_score(y_test, pred))

hyperparam_tuning(tfidf_train, y_train, tfidf_test, y_test, nb_classifier)  

# Looks like .6-.7 are the best alpha

0.0
0.421076621541547
0.1
0.421076621541547
0.2
0.421076621541547
0.30000000000000004
0.421076621541547
0.4
0.421076621541547
0.5
0.421076621541547
0.6000000000000001
0.421076621541547
0.7000000000000001
0.421076621541547
0.8
0.421076621541547
0.9
0.421076621541547
1.0
0.421076621541547


# Run Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

clf_logist = LogisticRegression(C=.001, random_state = 42, multi_class = 'multinomial', penalty='l2')
clf_logist.fit(tfidf_train, y_train)
logist_pred = clf_logist.predict(tfidf_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Looking at Feature Names and Weights

In [167]:
# Notes
# sum([np.exp(1)** x for x in nb_classifier.coef_[0]]) # The probability of all the words equals one
# # Taken from here: * https://stackoverflow.com/questions/61586946/how-to-calculate-feature-log-prob-in-the-naive-bayes-multinomialnb

# ------------------------------------------

import numpy as np

def get_feature_rank(tfidf_vectorizer, y_no, nb_classifier):
    
    # Get the feature names
    feature_names = tfidf_vectorizer.get_feature_names()

    # Zip together the first CPT weights with feature names
    feat_with_weights =  sorted(zip(nb_classifier.coef_[y_no], feature_names))
    
    # Print words most responsible for the prediction
#     print('Top 100 \n\n\n\n')
#     top_100_ls = []
    for i in range(100):
        x = feat_with_weights[-i-1]
#         top_100_ls.append(x[1])
#         print(nb_classifier.classes_[y_no], i, round((np.exp(1) ** x[0]),4), x[1])

#     print('\n\n\n\n Bottom 100 \n\n\n\n')
    for i in range(100):
        x = feat_with_weights[i]
#         print(nb_classifier.classes_[y_no], i, round((np.exp(1) ** x[0]),4), x[1])
    
#     min_weight = min([i[0] for i in feat_with_weights])
    
    x = [i[0] for i in feat_with_weights]
    
    median_pred = np.median(x)
          
    return [i[1] for i in feat_with_weights if i[0] <= median_pred] # Minimum weight words
#     return top_100_ls

# Find the least predictive words
def least_pred_words(nb_classifier, tfidf_vectorizer):
    low_wt_stop_ls = []

    for i in range(len(nb_classifier.classes_)):
        low_wt_stop_ls += get_feature_rank(tfidf_vectorizer, i, nb_classifier)

    low_wt_stop_ls = list(set(low_wt_stop_ls))
    return low_wt_stop_ls
    
low_wt_stop_ls = least_pred_words(nb_classifier, tfidf_vectorizer)

# Find top 100 words - doesn't seem to improve the model
def highest_pred_words(nb_classifier, tfidf_vectorizer):
    top_100_ls = []
    for i in range(len(nb_classifier.classes_)):
        top_100_ls += get_feature_rank(tfidf_vectorizer, i, nb_classifier)

    top_100_ls = list(set(top_100_ls))
    return top_100_ls

['citizenship',
 'inutbag',
 'reamin',
 'taspe',
 'traycan',
 'eleveat',
 'interscapular',
 'stenossis',
 'spp',
 'tct',
 'resectable',
 'throacotomy',
 'evisiting',
 'nannulu',
 'tplt',
 'arthrotec',
 'asberger',
 'nnor',
 'sttw',
 'inconssitent',
 'contemplate',
 'whofelt',
 'nreporttemporal',
 'femaile',
 'nbengay',
 'synoviti',
 'flie',
 'controls',
 'navalide',
 'nlipomatosi',
 'endeavour',
 'tractor',
 'expectorated',
 'tegretal',
 'nvalproate',
 'walnut',
 'ncarie',
 'fulminan',
 'nrole',
 'shariro',
 'duckbill',
 'nviridan',
 'supression',
 'mwt',
 'pseudobulbar',
 'propecia',
 'nqpm',
 'kindey',
 'narrowed',
 'maex',
 'inheritable',
 'speculum',
 'intracran',
 'otsc',
 'chornical',
 'nvetriculography',
 'nhycodan',
 'abcces',
 'hypotenion',
 'blank',
 'nfile',
 'herniorrhaphie',
 'nklonipin',
 'ntelapivir',
 'sfebrile',
 'nafld',
 'nkiss',
 'falumouth',
 'hematoemesi',
 'wrote',
 'nspecgr',
 'npostoop',
 'repond',
 'nnwh',
 'reliever',
 'nresultant',
 'nmeasures',
 'digibind',

# Update stop words and tokenize again

In [96]:
my_stop_words += low_wt_stop_ls

tfidf_vectorizer = TfidfVectorizer(stop_words=my_stop_words, min_df = 3, max_df = .7, sublinear_tf=True)

# Transform the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
tfidf_test = tfidf_vectorizer.transform(X_test)


# Create Vocab with top words and tokenize again

In [None]:
# It reduced test accuracy back to 43% and training went from 50% to 44%

tfidf_vectorizer = TfidfVectorizer(vocabulary=top_100_ls, stop_words=my_stop_words, min_df = 3, max_df = .7, sublinear_tf=True)

# Transform the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
tfidf_test = tfidf_vectorizer.transform(X_test)


# Run Naive Bayes again

In [97]:
# Use Naive Bayes model -----

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_classifier = MultinomialNB(alpha=.7)

# Fit and check accuracy
nb_classifier.fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

# Classification Report

In [164]:
# Create classification report taken from here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
from sklearn.metrics import classification_report

print('Test')
class_labels = nb_classifier.classes_
print(classification_report(y_test, pred,target_names=class_labels))

print('Training')
pred_x = nb_classifier.predict(tfidf_train)
print(classification_report(y_train, pred_x,target_names=class_labels))


Test


KeyboardInterrupt: 

In [203]:
metrics.accuracy_score(y_test, pred)

# """
# V1 NLP Model Accuracy: 0.117
# Wow, I've got a long way to go to improve accuracy
# V2 NLP Model Accuracy: 0.14
# V3 NLP Model Accuracy: .40
# """

# Confusion matrix 
# confusion_mtrx = metrics.confusion_matrix(y_test.astype(str), pred) # 1380, 1380
# confusion_mtrx

0.3441613031558106

In [39]:
# Logistical Model accuracy
metrics.accuracy_score(y_test, logist_pred)
# .39



0.3560782168740599

# Vectorize Test

In [84]:
from sklearn.feature_extraction.text import CountVectorizer

vocab = ['love', 'happy', 'run']
count_vectorizer = CountVectorizer(vocabulary = vocab)
x = count_vectorizer.fit_transform(['happy', 'run', 'run', 'run'])
print(x)

  (0, 1)	1
  (1, 2)	1
  (2, 2)	1
  (3, 2)	1


# Splitting out a list in a DataFrame

In [198]:
import pandas as pd

test = pd.DataFrame({'column 1':['The other day, I saw a bear, A great big bear',2,3], 'predictive column':[1,2,3]} )
test['column 1'].str.split(',', expand=True).drop(labels=[1,2], axis=0)
# pd.concat([test['column 1'].str.split(',', expand=True), test['predictive column']], axis=1).melt(id_vars='predictive column').drop('variable', axis=1)

Unnamed: 0,0,1,2
0,The other day,I saw a bear,A great big bear


In [197]:
?pd.DataFrame.drop