# Changes from v2

* Subset to just the 4 CPT codes that the model was predicting:   99291, 99232, 94003, 99233 
* Ideas for next improvement:
    * Check for class imbalance in v2
    * Check for class imbalance in v3
    * Text cleaning
    * Try count vectors
   

# Import the MIMIC data

In [1]:
import gzip
import pandas as pd
import glob

dataset_dictionary = {}

for file_path in glob.glob('.\\Data\\MIMIC Files\*'):
    file_name = file_path.split('\\')[3].split('.')[0]
    with gzip.open(file_path, mode='r') as file:
        dataset_dictionary[file_name] = pd.read_csv(file)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Join the tables

In [2]:
# Create dataset to join together -----

# Create note_events table -----

# Combine text for each subject and encounter
note_events_base = dataset_dictionary['NOTEEVENTS'][dataset_dictionary['NOTEEVENTS'].loc[:,'CATEGORY'] == 'Discharge summary']
note_events = note_events_base.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False)['TEXT'].agg(sum)

# Create CPT table -----

cpt_events_base = dataset_dictionary['CPTEVENTS'].loc[:, ['SUBJECT_ID','HADM_ID', 'CPT_CD']]
cpt_events = cpt_events_base.drop_duplicates()

# Join the datasets -----

note_cpt = note_events.merge(cpt_events, on = ['SUBJECT_ID','HADM_ID'])
# print(note_cpt.shape, note_events.shape, cpt_events.shape) # (223,150, 4) (52,726, 3) (227,510, 3)

# Filter the data to CPT with over 1000 instances

In [3]:
# Find CPT codes occurring over 1000 times and put into a list
cpt_1000 = note_cpt['CPT_CD'].astype(str).value_counts(ascending=False) >= 1000
cpt_1000_ls = list(cpt_1000[cpt_1000].index)

note_cpt_1000 = note_cpt[note_cpt['CPT_CD'].astype(str).isin(cpt_1000_ls)]

In [45]:
note_cpt_1000[note_cpt_1000['HADM_ID'] == 145834]
# .loc[0,:]

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,CPT_CD
0,3,145834.0,Admission Date: [**2101-10-20**] Discharg...,94002
1,3,145834.0,Admission Date: [**2101-10-20**] Discharg...,94003


# Filter the data to just 4 CPT codes: 99291, 99232, 94003, 99233

In [90]:
note_cpt_4 = note_cpt[note_cpt['CPT_CD'].astype(str).isin(['99291', '99232', '94003'])]

# Split the Data

In [91]:
# Import Packages -----

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the data -----

X_train, X_test, y_train, y_test = train_test_split(note_cpt_4['TEXT'], note_cpt_4['CPT_CD'].astype(str), test_size = .33, random_state = 42)


# Tokenize the data

In [92]:
# Tokenize the data -----

# Import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df = .7)

# Transform the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
tfidf_test = tfidf_vectorizer.transform(X_test)

# print(tfidf_vectorizer.get_feature_names()[:10])

# Run Naive Bayes

In [93]:
# Use Naive Bayes model -----

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_classifier = MultinomialNB()

# Fit and check accuracy
nb_classifier.fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

# Classification Report

In [96]:
# Create classification report taken from here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
print(classification_report(y_test, pred,target_names=class_labels))


              precision    recall  f1-score   support

       94003       0.76      0.03      0.06      5456
       99232       0.33      0.15      0.21      7937
       99291       0.41      0.86      0.56      8546

    accuracy                           0.40     21939
   macro avg       0.50      0.35      0.27     21939
weighted avg       0.47      0.40      0.31     21939



# Looking at Feature Names and Weights

In [95]:
class_labels = nb_classifier.classes_
# len(class_labels) # 24

# len(nb_classifier.coef_) # 24
# (nb_classifier.coef_[0] ==  nb_classifier.coef_[1]) # They are unique to each CPT code, but are for the same features

# len(feature_names) # 207,838

# Zip together the first CPT weights with feature names
# feat_with_weights =  sorted(zip(nb_classifier.coef_[0], feature_names))

# # Class label with weights
# print(class_labels[0])

# for i in range(100):
#     print(i, feat_with_weights[-i])

# # Print dataframe # ran into memory error
# # (tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
# # tfidf_df.head()
# import numpy as np

# np.exp(1)**-8.5
# # Taken from here: * https://stackoverflow.com/questions/61586946/how-to-calculate-feature-log-prob-in-the-naive-bayes-multinomialnb

In [94]:
metrics.accuracy_score(y_test, pred)

# """
# V1 NLP Model Accuracy: 0.117
# Wow, I've got a long way to go to improve accuracy
# V2 NLP Model Accuracy: 0.14
# V3 NLP Model Accuracy: .40
# """

# Confusion matrix 
# confusion_mtrx = metrics.confusion_matrix(y_test.astype(str), pred) # 1380, 1380
# confusion_mtrx

0.3991977756506678