In [1]:
#Download and unzip files
!wget https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/p3jkppwr29-1.zip
!unzip p3jkppwr29-1.zip
!unzip UVigoMED.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: UVigoMED/single_label/train/17860.json  
  inflating: UVigoMED/single_label/train/17953.json  
  inflating: UVigoMED/single_label/train/18281.json  
  inflating: UVigoMED/single_label/train/18284.json  
  inflating: UVigoMED/single_label/train/18309.json  
  inflating: UVigoMED/single_label/train/18360.json  
  inflating: UVigoMED/single_label/train/18654.json  
  inflating: UVigoMED/single_label/train/18931.json  
  inflating: UVigoMED/single_label/train/88890.json  
  inflating: UVigoMED/single_label/train/84429.json  
  inflating: UVigoMED/single_label/train/89061.json  
  inflating: UVigoMED/single_label/train/84568.json  
  inflating: UVigoMED/single_label/train/89181.json  
  inflating: UVigoMED/single_label/train/89190.json  
  inflating: UVigoMED/single_label/train/89197.json  
  inflating: UVigoMED/single_label/train/84732.json  
  inflating: UVigoMED/single_label/train/84743.json  
  inflating: UVig

In [2]:
#import all required packages
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from glob import glob
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
#Read Training data and convert it to pandas dataframe
train_file_list = glob('/content/UVigoMED/single_label/train/*.json')
df_train = pd.DataFrame()
dfs = []
for file in train_file_list:
  data = pd.read_json(file, lines=True)
  dfs.append(data)

df_train = pd.concat(dfs, ignore_index=True)

In [4]:
#Read test data
test_file_list = glob('/content/UVigoMED/single_label/test/*.json')
df_test = pd.DataFrame()
dfs_test = []
for file in test_file_list:
  data_test = pd.read_json(file, lines=True)
  dfs_test.append(data_test)

df_test = pd.concat(dfs_test, ignore_index=True)

In [5]:
df_train.shape

(43972, 8)

In [6]:
df_test.shape

(10873, 8)

In [7]:
df_train = df_train[['abstract', 'categories']]
df_test = df_test[['abstract', 'categories']]

In [8]:
df_data = df_train.append(df_test, ignore_index=True)

In [9]:
df_data.shape

(54845, 2)

In [10]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(df_data['abstract'])):
    review = re.sub('[^a-zA-Z]', ' ', df_data['abstract'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    short_word = ' '.join(word for word in review if len(word)>1)
    long_word = ' '.join(word for word in short_word.split() if len(word)<15)
    #review = ' '.join(review)
    corpus.append(long_word)

In [11]:
### Vocabulary size
voc_size=15000

In [12]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
#onehot_repr

In [13]:
sent_length=200
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[    0     0     0 ... 12170  5872  1961]
 [    0     0     0 ...  2103  2906  8328]
 [    0     0     0 ...  1610 11102 13973]
 ...
 [    0     0     0 ...  3404 13777  4096]
 [    0     0     0 ... 11824 12466  8719]
 [    0     0     0 ... 14678 11019   128]]


In [None]:
print(set(df_train['categories']))

{'Otorhinolaryngologic Diseases', 'Digestive System Diseases', 'Endocrine System Diseases', 'Cardiovascular Diseases', 'Wounds and Injuries', 'Neoplasms', 'Congenital Hereditary and Neonatal Diseases and Abnormalities', 'Occupational Diseases', 'Hemic and Lymphatic Diseases', 'Animal Diseases', 'Respiratory Tract Diseases', 'Immune System Diseases', 'Chemically-Induced Disorders', 'Parasitic Diseases', 'Skin and Connective Tissue Diseases', 'Nutritional and Metabolic Diseases', 'Musculoskeletal Diseases', 'Bacterial Infections and Mycoses', 'Eye Diseases', 'Disorders of Environmental Origin', 'Pathological Conditions and Signs and Symptoms', 'Nervous System Diseases', 'Female Urogenital Diseases and Pregnancy Complications', 'Virus Diseases', 'Male Urogenital Diseases', 'Stomatognathic Diseases'}


In [14]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = le.fit_transform(df_data['categories']).reshape(-1,1)

In [21]:
## Creating model
embedding_vector_features=50
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(200))
model.add(Dropout(0.3))
model.add(Dense(26,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 50)           750000    
_________________________________________________________________
dropout_4 (Dropout)          (None, 200, 50)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               200800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 26)                5226      
Total params: 956,026
Trainable params: 956,026
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
len(embedded_docs),y.shape

(54845, (54845, 1))

In [16]:

X_final=np.array(embedded_docs)


In [17]:
X_train = X_final[:43972]
X_test = X_final[43972:]
y_train = y[:43972]
y_test = y[43972:]

In [22]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fbe124c7fd0>

We are getting almost 72 % validation accuracy with 93 % accuracy on training data using LSTM. Although this can be enhance after tuning the model.
