<a href="https://colab.research.google.com/github/aminrabinia/Text_Classification_test/blob/main/Text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from tensorflow import keras
import pandas as pd

In [None]:
# Read the 'MLAL Challenge' sheet of the take-home workbook into a DataFrame.
xlsx_path = "/content/NLP Take-Home Data (2).xlsx"
# Open the workbook in a context manager so its file handle is released
# as soon as the sheet has been parsed.
with pd.ExcelFile(xlsx_path) as data:
    df = data.parse('MLAL Challenge')
df.head()

Unnamed: 0,chf cmplnt,A/P,icd10encounterdiagcode,icd10encounterdiagdescr
0,Right Hip pain,"Normal hip, slight valgus alignment both hips",M25.551,M25.551: Pain in right hip
1,F/U RT foot pain and stiffness/ 18 weeks post ...,RIGHT Bunion 2nd HT + Plantar plate tear (MTP)...,M20.5X1,M20.5X1: Other deformities of toe(s) (acquired...
2,"Bilateral Hand pain, Bilateral Shoulder pain",Bilateral Carpal Tunnel Syndrome Right Subacro...,M75.41,M75.41: Impingement syndrome of right shoulder
3,Finger pain left thumb,Left thumb ulnar collateral ligament injury Pl...,S53.32XA,S53.32XA: Traumatic rupture of left ulnar coll...
4,MRI results LT foot,Lt peroneal tendon tear,S93.602A,"S93.602A: Unspecified sprain of left foot, ini..."


In [None]:
# Reduce each ICD-10 code to its first three characters; that prefix is the class label.
# The [:-1] slice leaves out the final row of the column
# (NOTE(review): presumably a trailing non-data row — confirm).
labels = [str(code[:3]) for code in df['icd10encounterdiagcode'][:-1]]
print(labels[:10])

['M25', 'M20', 'M75', 'S53', 'S93', 'S92', 'M16', 'M75', 'S93', 'M06']


In [None]:
# Map every distinct label to an integer class id.
# The distinct labels are sorted so the id assigned to each label is the same on
# every kernel restart (iteration order of a set of strings is not stable across
# Python sessions).
cat_labels = sorted(set(labels))
# Dictionary lookup replaces a list.index() scan for every sample.
label_to_index = {label: position for position, label in enumerate(cat_labels)}
categorized_labels = [label_to_index[label] for label in labels]
print(categorized_labels[:10])

[121, 87, 90, 31, 11, 29, 130, 90, 11, 110]


In [None]:
# Build one input string per row: the A/P note followed by the diagnosis
# description with its first three characters removed. The last row is skipped.
# NOTE(review): the description column appears to be derived from the same ICD-10
# code that the label comes from, so including it likely leaks the target into
# the features — confirm this is intended.
texts = [
    df['A/P'][row] + " " + df['icd10encounterdiagdescr'][row][3:]
    for row in range(len(df['A/P']) - 1)
]
print(texts[:10])

['Normal hip, slight valgus alignment both hips .551: Pain in right hip', 'RIGHT Bunion 2nd HT + Plantar plate tear (MTP) [>> Rt distal bunion w/ 2nd CPR reconstruction] .5X1: Other deformities of toe(s) (acquired), right foot', 'Bilateral Carpal Tunnel Syndrome Right Subacromial Impingement Plan Right endoscopic carpal tunnel release Continue vimovo Will call with treatment plan regarding shoulder once we receive results of the MRI from Banner RTC 2 wks post op for right endoscopic carpal tunnel release .41: Impingement syndrome of right shoulder', 'Left thumb ulnar collateral ligament injury Plan Continue left thumb spica splint at all times even at night Light duty at work with 2lb lifting restriction RTC in 4 wks for exam .32XA: Traumatic rupture of left ulnar collateral ligament, initial encounter', 'Lt peroneal tendon tear .602A: Unspecified sprain of left foot, initial encounter', '22yoM with right foot nailgun injury and open 1st-3rd metatarsal fxs and retained foreign body s/p

In [None]:
# Assemble the modelling frame: one row per sample with its text and integer class id.
trainDF = pandas.DataFrame({'text': texts, 'label': categorized_labels})

In [None]:
# Split into training and validation sets; the fixed seed makes the split
# (and every number reported downstream) reproducible across runs.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], random_state=42
)

# Fit the encoder ONCE, on the full label column, then only transform each split.
# Refitting on the validation labels would build a second, different mapping, so
# the same integer would stand for different classes in train and validation.
# Fitting on all labels also guarantees that every class seen in either split
# is known to the encoder.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)
print('Length of trian', len(train_y))
print('Length of valid', len(valid_y))

Length of trian 3914
Length of valid 1305


In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

In [None]:
!unzip wiki-news-300d-1M.vec.zip

In [None]:
# load the pre-trained word-embedding vectors into a {word: vector} lookup
embeddings_index = {}
# The context manager closes the file after reading; the encoding is explicit so
# the result does not depend on the platform's default locale.
with open('wiki-news-300d-1M.vec', encoding='utf-8', newline='\n', errors='ignore') as vec_file:
    # The first line of a fastText .vec file is a "<vocab size> <dimension>" header,
    # not a word vector, so it is skipped.
    next(vec_file, None)
    for line in vec_file:
        values = line.rstrip().split(' ')
        embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer over the whole corpus
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping: row i holds the pre-trained vector of the word
# whose tokenizer index is i; words without a pre-trained vector stay all-zero
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False,
                valid_label=None, epochs=10):
    """Fit a classifier and return its accuracy on the validation set.

    Args:
        classifier: Object exposing ``fit`` and ``predict``.
        feature_vector_train: Training features passed to ``fit``.
        label: Training labels passed to ``fit``.
        feature_vector_valid: Validation features passed to ``predict``.
        is_neural_net: When True, ``epochs`` is forwarded to ``fit`` and the
            predictions are reduced to class indices with ``argmax`` over the
            last axis.
        valid_label: True labels for the validation set. When omitted, the
            notebook-level ``valid_y`` is used, as before.
        epochs: Number of training epochs; only used when ``is_neural_net``.

    Returns:
        The value of ``metrics.accuracy_score`` for the validation predictions.
    """
    if valid_label is None:
        # Backward-compatible fallback to the notebook global.
        valid_label = valid_y

    # fit the training dataset on the classifier; `epochs` is only passed for
    # neural nets so that estimators whose fit() has no such argument still work
    if is_neural_net:
        classifier.fit(feature_vector_train, label, epochs=epochs)
    else:
        classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        # turn per-class scores into a single predicted class index per sample
        predictions = predictions.argmax(axis=-1)

    # accuracy_score takes (y_true, y_pred)
    return metrics.accuracy_score(valid_label, predictions)

In [None]:
def create_rnn_lstm(num_classes=None):
    """Build and compile an LSTM classifier on top of the frozen pre-trained embeddings.

    Args:
        num_classes: Size of the softmax output layer. When omitted, it is
            ``len(cat_labels)``, the number of distinct labels in the notebook.

    Returns:
        A compiled Keras model that takes padded token sequences of length 70
        and outputs one probability per class.
    """
    if num_classes is None:
        num_classes = len(cat_labels)

    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer (weights come from embedding_matrix and are not trained)
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    # One unit per class with softmax: the target is a multi-class integer label,
    # and a single sigmoid unit would make argmax over the output always return 0.
    output_layer2 = layers.Dense(num_classes, activation="softmax")(output_layer1)

    # Compile the model; the sparse loss accepts integer class ids directly
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=keras.optimizers.Adam(), loss='sparse_categorical_crossentropy')

    return model

# Build the LSTM model, train it on the padded training sequences and report
# its accuracy on the validation sequences.
classifier = create_rnn_lstm()
accuracy = train_model(
    classifier,
    train_seq_x,
    train_y,
    valid_seq_x,
    is_neural_net=True,
)
print("RNN-LSTM, Word Embeddings", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
RNN-LSTM, Word Embeddings 0.007662835249042145


In [None]:
# Predict the ICD-10 category for a single free-text note.
mytext = "Plan --okay for activity as tolerated --continue gout management --fu PRN"
# Tokenize and pad the note exactly like the training sequences.
seq_x = sequence.pad_sequences(token.texts_to_sequences([mytext]), maxlen=70)
output = classifier.predict(seq_x)
# Take the index of the highest-scoring class. Casting the raw model output to
# int would truncate a probability rather than select a class.
predicted_index = int(numpy.argmax(output, axis=-1)[0])
# NOTE(review): assumes the model's class indices line up with positions in
# cat_labels — confirm against how the training labels were encoded.
mylabel = cat_labels[predicted_index]
print("Predicted ICD10:", mylabel)

Predicted ICD10: M11
