# Intact Medical Data ML model using Naive Bayes Classification Model

### By: Daniyal, Hibah, Abhishek and Adam

In our Data Hackathon project, we were given medical transcription data by Intact to 

### Step 1: Import libraries and read in the data

We'll add more libraries, as we move on.

In [71]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import _stop_words
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

df = pd.read_csv("new_train.csv", index_col=0)
print("Test size with duplicates: ", len(df))
# Get rid of duplicate transcriptions
df = df.drop_duplicates(subset=['transcription'])
print("Test size without duplicates: ", len(df))
df

Test size with duplicates:  3969
Test size without duplicates:  2255


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/daniyalmohammed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/daniyalmohammed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/daniyalmohammed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/daniyalmohammed/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,medical_specialty,transcription,labels
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0
...,...,...,...
3957,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Right hip osteoarthr...",6
3959,Surgery,"PREOPERATIVE DIAGNOSIS: , Left knee medial fem...",1
3975,Surgery,"DELIVERY NOTE: , The patient is a very pleasan...",1
3977,Urology,"PREOPERATIVE DX: , Stress urinary incontinence...",20


In [72]:
# # Further process the dataframe

# # Get rid of duplicate transcriptions
# # df = df.drop_duplicates(subset=['transcription'])
# print("Test size without duplicates: ", len(df))

# start_patterns_soci = ["SOCIAL HISTORY:,", "SOCIAL HISTORY: ,", "SOCIAL HISTORY:  ,", "SOCIAL HISTORY:   ,", "SOCIAL HISTORY,", "SOCIAL HISTORY ,", "SOCIAL HISTORY:"]
# start_patterns_fam = ["FAMILY HISTORY:,", "FAMILY HISTORY: ,", "FAMILY HISTORY:  ,", "FAMILY HISTORY:   ,", "FAMILY HISTORY,", "FAMILY HISTORY ,", "FAMILY HISTORY:"]
# start_patterns = [start_patterns_soci, start_patterns_fam]

# def remove_hist(transcription):
#     for start_patterns_list in start_patterns:
#         # Find the start position of the block of text
#         start = -1
#         dummy = 0
#         for pattern in start_patterns_list:
#             start = transcription.find(pattern)
#             if start != -1:
#                 dummy = len(pattern)
#                 break
    
#         # Check if the start position is valid
#         if start != -1:
#             # Find the end position of the block of text
#             end = transcription.find(",", start+dummy)
#             if end == -1:
#                 end = len(transcription)
        

#             # Extract the parts of the string that come before and after the block of text
#             before = transcription[:start]
#             after = transcription[end+1:]
            
#             # Join the remaining parts of the string
#             new_string = before + after
#             transcription = new_string
#         else:
#             continue
#     return transcription

# count = 0
# for row in df.iloc():
#     count += 1
#     # print(row['transcription'], '\n')
#     transcription = remove_hist(row['transcription'])
#     print(transcription, '\n')
# print("Test size: ", count)

### Step 2: Pre-process our data

I think this is the most important step here, the ML model is only as good as its dataset, so we gotta make sure it's squeaky clean.

All of the basic pre-processing is done by the CountVectorizer, these tasks include:
- Tokenize (divide words individually)
- Remove stop-words (remove "the, and, to, or, ..."; other special characters)
- Lemmatize (convert similar words into its base root; eating, eats, ate => eat)

In [73]:
# Create labels/target values
y = df.labels
print("Label size: ", len(y))
y

Label size:  2255


0        0
1        1
2        1
3        2
4        0
        ..
3957     6
3959     1
3975     1
3977    20
3999     1
Name: labels, Length: 2255, dtype: int64

In [74]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["transcription"], y, test_size=0.2, random_state=42)

# X_train: training data of features
print("X_train size: ", len(X_train))
# y_train: training data of label
print("y_train size: ", len(y_train))

# X_test: test data of features
print("X_test size: ", len(X_test))
# y_test: test data of label
print("y_test size: ", len(y_test))

# X_train
# y_train[:50]
# X_test
# y_test

X_train size:  1804
y_train size:  1804
X_test size:  451
y_test size:  451


In [75]:
# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# we could try stemming as well

with open("words_alpha.txt") as word_file:
    english_words = set(word.strip().lower() for word in word_file)
with open("medical_terms.txt") as word_file:
    medical_words = set(word.strip().lower() for word in word_file)
def is_english_word(word):
    return ((word.lower() in english_words) or (word.lower() in medical_words))

# Custom pre-processing function
def preprocess_data(text):
    text = text.lower()
    text = re.sub(r'\d|_', '', text) # removes words with digits and '_'
    text = wordnet_lemmatizer.lemmatize(text)
    return text

# , preprocessor=preprocess_data
# Initialize a CountVectorizer object
count_vectorizer = CountVectorizer(stop_words="english", preprocessor=preprocess_data, max_df=0.3, min_df=21, ngram_range=(1, 2))

print(type(count_vectorizer))

print(wordnet_lemmatizer.lemmatize("strawberries"))

<class 'sklearn.feature_extraction.text.CountVectorizer'>
strawberry


### Step 3: Fit and Transform the Data

Specifically, we must fit AND transform the feature training data and only transform the feature test data.
This is a preliminary step.

In fit_transform(), what happens is that we calculate the mean and variance of the training data and standardize the entire dataset (hence, transform). We only need transform() for the test data because we are using the mean and variance of the training data to standardize the test data.

In [76]:
# Fit and transform the TRAINING data using only the 'transciption' column values
count_train = count_vectorizer.fit_transform(X_train.values)
# Transform the TEST data using only the 'transciption' column values
count_test = count_vectorizer.transform(X_test.values)


# Print number of words processing
print("Number of words: ", len(count_vectorizer.get_feature_names_out())) # number of test data from split
# Print the features (individual tokens) of the count_vectorizer
print(count_vectorizer.get_feature_names_out()[:500])



Number of words:  3713
['abc' 'abcd' 'abcd general' 'abdomen' 'abdomen pelvis' 'abdomen prepped'
 'abdomen soft' 'abdominal' 'abdominal cavity' 'abdominal pain'
 'abdominal wall' 'ability' 'able' 'abnormal' 'abnormalities'
 'abnormality' 'abscess' 'absent' 'abuse' 'ac' 'access' 'accessory'
 'accident' 'accommodation' 'accompanied' 'accomplished' 'according' 'ace'
 'achieve' 'achieved' 'acid' 'active' 'activities' 'activity' 'actually'
 'acute' 'acute distress' 'adaptic' 'add' 'added' 'addition' 'additional'
 'additionally' 'addressed' 'adenocarcinoma' 'adenoidectomy' 'adenopathy'
 'adequate' 'adequate general' 'adequately' 'adherent' 'adhesions'
 'adjacent' 'administered' 'administered patient' 'administration'
 'admission' 'admit' 'admits' 'admitted' 'admitted hospital' 'admitting'
 'adnexa' 'adnexal' 'adrenal' 'adult' 'advair' 'advanced' 'advised'
 'afebrile' 'affect' 'aforementioned' 'african' 'african american'
 'afternoon' 'age' 'aggressive' 'ago' 'agree' 'agreed' 'ahead' 'aid' 'a

### Step 4: Train our models here

We used the Multinomial Naive Bayes to classify our labels

In [77]:
# Instantiate a Multinomial Naive Bayes classifier
nb_clf = MultinomialNB(alpha=0.4)
# Fit the classifier to the training data
nb_clf.fit(count_train, y_train)
# Create the predicted tags
pred = nb_clf.predict(count_test)

# Print the predictions for each row of the dataset (1001 rows)
print("Number of predictions: ", len(pred)) # Equal to the number of test data (when it got split)
print(pred)

Number of predictions:  451
[20 16  1  1 28  7 15 16 20  5 25 10 27 22  1  6 16  0 19 11 16  1 13  7
 16 13 10 34  1 20  7 16 29  5 13 10 11 16 13 15 21  2 15  6  1  9 10 15
 13 34 20  2  1  6  6  1  0  6  4  0 23 10  7 19  6  2  1 16 25 27  6 19
  7  1  1  7  2  4 27  6 16  2  7  0  6 18 18 21 16 10 20  9 19 27 10 16
 13  4  4 29 16 27 16  5  1  1  4 16  5  2 27  1 15 27 30  0  9 31 16 20
  6  7  1 25 20 19 16 19 10 18  7  6  7  0 27 10  1 16  6  1  7 15 21 16
  1  7 16 16  1  1  1  9  5  7  2  3  9 16 30 19  6  1  6 16 15 22  6  7
 29 29  2 18 27  5 10  5  5 16  6 16  2 21 13 25  6 34  7 15 19 19  1  1
 13 19 21  1 37  3 10  6 19  1 35 21  6  1 39  2 16  8  2  2  1  1 27 10
 16 16  2 34 19  9  6  1  0 21 16  7  6 11 16 16  1  6  5 21 10  1 21 10
 13 32  3 22  1  1  6 34  1  4 27 34  7 14 37 11  1 19  5  5  2  2 34 18
 13  2  2 16  2  6 16  1 16  9 11  8  1  1 26  2  2 13  9 13  1 13 10 10
  7 16  9  3 16 27  5 16 37 16  0  9  0 16 28 13  3  2  4  1 10  1 13 21
  4 34 21 16  7  2 19  

### Step 4: Evaluate the model

We will create an accuracy score and also a confusion matrix.

Precision = TP/(TP + FP)

Recall = TP/(TP+FN)

F1 Score = 2*(Recall * Precision) / (Recall + Precision)

In [80]:
# Instantiate a Multinomial Naive Bayes classifier
nb_clf = MultinomialNB(alpha=0.45) # best: 0.4
# Fit the classifier to the training data
nb_clf.fit(count_train, y_train)
# Create the predicted tags
pred = nb_clf.predict(count_test)

# Print the predictions for each row of the dataset (1001 rows)
#print("Number of predictions: ", len(pred)) # Equal to the number of test data (when it got split)
#print(pred)


# Calculate the accuracy score
score = metrics.accuracy_score(y_test, pred)
# Calculate the confusion matrix
# conf_matrix = metrics.confusion_matrix(y_test, pred)


#print(score)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.33      0.50      0.40         6
           1       0.46      0.27      0.34        93
           2       0.40      0.44      0.42        27
           3       0.25      0.40      0.31         5
           4       0.50      0.35      0.41        17
           5       0.43      0.45      0.44        20
           6       0.51      0.49      0.50        39
           7       0.48      0.35      0.41        40
           8       0.00      0.00      0.00         5
           9       0.69      0.82      0.75        11
          10       0.00      0.00      0.00         7
          11       0.29      0.25      0.27         8
          12       0.00      0.00      0.00         5
          13       0.21      0.31      0.25        13
          14       0.00      0.00      0.00         0
          15       0.50      0.88      0.64         8
          16       0.45      0.40      0.42        62
          18       0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
