# Intact Medical Specialty Classification Model using NLP

### By: Daniyal, Hibah, Abhishek and Adam

### Step 1: Import libraries and read in the data

We'll add more libraries, as we move on.

In [162]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import _stop_words
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import spacy
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

df = pd.read_csv("new_train.csv", index_col=0)
print("Test size with duplicates: ", len(df))
df = df.drop_duplicates(subset=['transcription'])
print("Test size without duplicates: ", len(df))
df

Test size with duplicates:  3969
Test size without duplicates:  2255


[nltk_data] Downloading package wordnet to /Users/adamyeo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/adamyeo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/adamyeo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adamyeo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,medical_specialty,transcription,labels
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0
...,...,...,...
3957,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Right hip osteoarthr...",6
3959,Surgery,"PREOPERATIVE DIAGNOSIS: , Left knee medial fem...",1
3975,Surgery,"DELIVERY NOTE: , The patient is a very pleasan...",1
3977,Urology,"PREOPERATIVE DX: , Stress urinary incontinence...",20


### Step 2: Pre-process our data

I think this is the most important step here, the ML model is only as good as its dataset, so we gotta make sure it's squeaky clean.

All of the basic pre-processing is done by the CountVectorizer, these tasks include:
- Tokenize (divide words individually)
- Remove stop-words (remove "the, and, to, or, ..."; other special characters)
- Lemmatize (convert similar words into its base root; eating, eats, ate => eat)

In [103]:
# Create labels/target values
y = df.labels
print("Label size: ", len(y))
y

Label size:  2255


0        0
1        1
2        1
3        2
4        0
        ..
3957     6
3959     1
3975     1
3977    20
3999     1
Name: labels, Length: 2255, dtype: int64

In [155]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["transcription"], y, test_size=0.1, random_state=53)

# X_train: training data of features
print("X_train size: ", len(X_train))
# y_train: training data of label
print("y_train size: ", len(y_train))

# X_test: test data of features
print("X_test size: ", len(X_test))
# y_test: test data of label
print("y_test size: ", len(y_test))

# X_train
# y_train[:50]
# X_test
# y_test

X_train size:  2029
y_train size:  2029
X_test size:  226
y_test size:  226


In [156]:
# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# we could try stemming as well

with open("words_alpha.txt") as word_file:
    english_words = set(word.strip().lower() for word in word_file)
with open("medical_terms.txt") as word_file:
    medical_words = set(word.strip().lower() for word in word_file)
def is_english_word(word):
    return ((word.lower() in english_words) or (word.lower() in medical_words))

# Custom pre-processing function
def preprocess_data(text):
    text = text.lower()
    text = re.sub(r'\d|_', '', text) # removes words with digits and '_'
    text = wordnet_lemmatizer.lemmatize(text)
    return text


# Initialize a CountVectorizer object
count_vectorizer = CountVectorizer(stop_words="english", preprocessor=preprocess_data) # work on more pre-processing

print(type(count_vectorizer))

print(wordnet_lemmatizer.lemmatize("strawberries"))

<class 'sklearn.feature_extraction.text.CountVectorizer'>
strawberry


### Step 3: Fit and Transform the Data

Specifically, we must fit AND transform the feature training data and only transform the feature test data.
This is a preliminary step.

In fit_transform(), what happens is that we calculate the mean and variance of the training data and standardize the entire dataset (hence, transform). We only need transform() for the test data because we are using the mean and variance of the training data to standardize the test data.

In [157]:
# Fit and transform the TRAINING data using only the 'transciption' column values
count_train = count_vectorizer.fit_transform(X_train.values)
# Transform the TEST data using only the 'transciption' column values
count_test = count_vectorizer.transform(X_test.values)


# Print number of words processing
print("Number of words: ", len(count_vectorizer.get_feature_names_out())) # number of test data from split
# Print the features (individual tokens) of the count_vectorizer
print(count_vectorizer.get_feature_names_out()[:500])

Number of words:  19582
['aa' 'ab' 'abadeedleedlebadle' 'abandoned' 'abandonment' 'abated'
 'abbreviated' 'abc' 'abcd' 'abcg' 'abciximab' 'abd' 'abdomen' 'abdominal'
 'abdominally' 'abdominis' 'abdominoplasty' 'abdominosacrocolpopexy'
 'abdominus' 'abds' 'abduct' 'abducted' 'abduction' 'abducto' 'abductor'
 'abductors' 'abductovalgus' 'abductus' 'aberrant' 'aberration' 'abf'
 'abg' 'abgs' 'abilify' 'abilities' 'ability' 'ablate' 'ablated'
 'ablation' 'ablative' 'able' 'abnormal' 'abnormalities' 'abnormality'
 'abnormally' 'abnormities' 'abo' 'abolish' 'abort' 'aborted' 'abortion'
 'abortions' 'abortive' 'abovementioned' 'abraded' 'abrading' 'abrasion'
 'abrasions' 'abraxane' 'abreast' 'abrogated' 'abrogation' 'abrupt'
 'abruptio' 'abruption' 'abruptly' 'abs' 'abscess' 'abscesses' 'absence'
 'absent' 'absolute' 'absolutely' 'absorb' 'absorbable' 'absorbables'
 'absorbing' 'absorption' 'abstain' 'abstinence' 'abstraction'
 'abstractions' 'abundant' 'abundantly' 'abuse' 'abused' 'abuser'


### Step 4: Train our models here

We used the Multinomial Naive Bayes to classify our labels

In [160]:
# Instantiate a Multinomial Naive Bayes classifier
nb_clf = MultinomialNB()
# Fit the classifier to the training data
nb_clf.fit(count_train, y_train)
# Create the predicted tags
pred = nb_clf.predict(count_test)

# Print the predictions for each row of the dataset (1001 rows)
print("Number of predictions: ", len(pred)) # Equal to the number of test data (when it got split)
print(pred)

Number of predictions:  226
[ 6  9 16  1  1 16 16  1 16 16  1 16  6  1 16 15  1 16  7  2  7  6  1 34
  2  1  1 16  6 16 16 16  1  7  1  1 14  6  1  1  1  4 16  5 16  1  2  1
  2  1  1 16  1 16  4  7 16  5  4  6 16  7 16 16  1 16 16 16 20  4 16  1
 18  1  6 16 16  4 16 16  1 16 16 19  4  1  1  7  6 16 16  2 16 15  1  1
  5 16  1 16 19 16  1  1 21 16 16  1  4 16  1  1  6 16  2 16  6  1 16  5
  1 20  7  7  6  1  1  2  1  6  6  1 16 16 16  6  2  1  1 16 15  7  5  2
  1 16  1  5  7  1 16  7 27 16  1  1  7 16 16  1  6  2 21  2 27  1  1  1
  7 21  1 16 16  6  1  1 16 16  1 16  2  1  1 16  1  1  9 16  1 16  7  1
 27  1  5  1  6  2  6  1 16  1  1 16  1 16 16 19  1 34 16 16  1 27 16 16
  6 21 16  1 16  1  1 16  5 16]


### Step 4: Evaluate the model

We will create an accuracy score and also a confusion matrix.

Precision = TP/(TP + FP)

Recall = TP/(TP+FN)

F1 Score = 2*(Recall * Precision) / (Recall + Precision)

In [159]:
# Calculate the accuracy score
score = metrics.accuracy_score(y_test, pred)
# Calculate the confusion matrix
# conf_matrix = metrics.confusion_matrix(y_test, pred)


print(score)
print(conf_matrix)
print(classification_report(y_test, pred))

0.35398230088495575
[[  0   0   0 ...   0   0   0]
 [  0 156   0 ...   0   0   0]
 [  0   5  16 ...   0   0   0]
 ...
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.53      0.63      0.58        60
           2       0.31      0.25      0.28        16
           3       0.00      0.00      0.00         2
           4       0.29      0.20      0.24        10
           5       0.50      0.50      0.50         8
           6       0.32      0.32      0.32        19
           7       0.50      0.41      0.45        17
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         9
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         1
          13       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
