In [133]:
#Import Necessary Libraries
import string
from collections import Counter
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torchtext
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [124]:
#read data
df = pd.read_csv("/kaggle/input/symptom2disease/Symptom2Disease.csv")
df.drop("Unnamed: 0",inplace=True,axis=1)
df

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...
1195,diabetes,I'm shaking and trembling all over. I've lost ...
1196,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,diabetes,I regularly experience these intense urges and...
1198,diabetes,"I have trouble breathing, especially outside. ..."


In [125]:
# set of English stopwords we will remove from our text data
stop_words = set(stopwords.words('english'))

In [126]:
def clean_text(sent):
    #remove punctuations
    sent = sent.translate(str.maketrans('','',string.punctuation)).strip()
    
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sent)
    words = [word for word in words if word not in stop_words]
    
    return " ".join(words).lower()

In [127]:
# clean text rows in dataframe
df["text"] = df["text"].apply(clean_text)

In [128]:
# get list of diseases in our dataset
diseases = df["label"].unique()

# helper dictionaries to convert diseases to index and vice versa
idx2dis = {k:v for k,v in enumerate(diseases)}
dis2idx = {v:k for k,v in idx2dis.items()}

In [129]:
# convert disease name to index (label encoding)
df["label"] = df["label"].apply(lambda x: dis2idx[x])

In [130]:
# Split the data into train,test set
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

In [131]:
# pytorch dataset object use index to return item, so need to reset non-continuoues index of divided dataset
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [132]:
# max number of words in symptoms descriptions (cleaned version)
max_words = X_train.apply(lambda x:x.split()).apply(len).max()
max_words

31

In [134]:
# create vocabulart using torchtext vocab class
counter = Counter()
for text in X_train:
    counter.update(text.split())

vocab = torchtext.vocab.vocab(counter,specials=['<unk>', '<pad>'])

In [135]:
# set default index as unknown token
vocab.set_default_index(vocab['<unk>'])

In [136]:
# Create a PyTorch dataset`
class DiseaseDataset(torch.utils.data.Dataset):
    def __init__(self, symptoms,labels):
        self.symptoms = symptoms
        self.labels= torch.tensor(labels.to_numpy())
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.symptoms[idx]
        label = self.labels[idx]

        # Convert the text to a sequence of word indices
        text_indices = [vocab[word] for word in text.split()]
        
        # padding for same length sequence
        if len(text_indices)<max_words:
            text_indices = text_indices + [1]*(max_words - len(text_indices))
        
        return torch.tensor(text_indices), label

In [137]:
# instantiate dataset objects
train_dataset = DiseaseDataset(X_train, y_train)
val_dataset = DiseaseDataset(X_test, y_test)

In [156]:
# choose batch size, will start from smaller values as we got smaller dataset
batch_size = 8

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [165]:
# Define the RNN model
class RNNModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,num_classes,drop_prob,num_layers=1,bidir=False,seq="lstm"):
        super(RNNModel, self).__init__()
        self.seq = seq
        self.bidir_f = 2 if bidir else 0
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        if seq=="lstm":
            self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim,
                                     num_layers=num_layers,
                                     batch_first=True,
                                     bidirectional=bidir)
        else:
            self.rnn = torch.nn.GRU(embedding_dim, hidden_dim,
                                 num_layers=num_layers,
                                 batch_first=True,
                                bidirectional=bidir)
        
        self.dropout = torch.nn.Dropout(drop_prob) #dropout layer
        self.fc = torch.nn.Linear(hidden_dim*self.bidir_f, num_classes) # fully connected layer

    def forward(self, text_indices):
        # Embed the text indices
        embedded_text = self.embedding(text_indices)
#         print("EMB SHAPE: ",embedded_text.shape)

        # Pass the embedded text through the RNN
        rnn_output,hidden_states = self.rnn(embedded_text)
        # Take the last output of the RNN
        last_rnn_output = rnn_output[:, -1, :]
        x = self.dropout(last_rnn_output)
        # Pass the last output of the RNN through the fully connected layer
        x = self.fc(x)

        # Return the final output
        return x

In [158]:
def train(model,num_epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    #choose device for training
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.cuda()
    print("IS CUDA: ",next(model.parameters()).is_cuda)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for data in train_loader:
            inputs,labels = data 
            inputs,labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            acc = (labels == outputs.argmax(dim=-1)).float().mean().item()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            correct = 0
            total = 0
            for inputs, labels in val_loader:
                inputs,labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = outputs.argmax(-1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = (labels == outputs.argmax(dim=-1)).float().mean().item()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {val_loss}, Train Accuracy: {acc:.2f}  Val Accuracy: {accuracy:.2f}')

In [159]:
num_classes = len(np.unique(y_train))
vocab_size = len(vocab)
emb_dim = 256
hidden_dim = 128
drop_prob = 0.4

In [160]:
model_lstm = RNNModel(vocab_size,emb_dim,hidden_dim,num_classes,drop_prob,num_layers=3,bidir=True, seq="lstm")

In [161]:
train(model_lstm,35)

IS CUDA:  True
Epoch [1/35], Loss: 93.29036211967468, Train Accuracy: 0.38  Val Accuracy: 0.00
Epoch [2/35], Loss: 76.39439451694489, Train Accuracy: 0.00  Val Accuracy: 0.25
Epoch [3/35], Loss: 66.39456450939178, Train Accuracy: 0.12  Val Accuracy: 0.50
Epoch [4/35], Loss: 65.37175726890564, Train Accuracy: 0.12  Val Accuracy: 0.38
Epoch [5/35], Loss: 51.4245787858963, Train Accuracy: 0.38  Val Accuracy: 0.62
Epoch [6/35], Loss: 47.295558512210846, Train Accuracy: 0.62  Val Accuracy: 0.75
Epoch [7/35], Loss: 42.337356209754944, Train Accuracy: 0.50  Val Accuracy: 0.62
Epoch [8/35], Loss: 38.51057821512222, Train Accuracy: 0.75  Val Accuracy: 0.75
Epoch [9/35], Loss: 32.002619467675686, Train Accuracy: 0.88  Val Accuracy: 0.75
Epoch [10/35], Loss: 25.300126127898693, Train Accuracy: 0.88  Val Accuracy: 0.75
Epoch [11/35], Loss: 30.557545766234398, Train Accuracy: 0.88  Val Accuracy: 0.88
Epoch [12/35], Loss: 24.620021793991327, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [13/35], Lo

In [162]:
model_gru = RNNModel(vocab_size,emb_dim,hidden_dim,num_classes,drop_prob,num_layers=1,bidir=True,seq="gru")

In [163]:
train(model_gru,20)

IS CUDA:  True
Epoch [1/20], Loss: 89.13558173179626, Train Accuracy: 0.25  Val Accuracy: 0.25
Epoch [2/20], Loss: 62.976693868637085, Train Accuracy: 0.50  Val Accuracy: 0.75
Epoch [3/20], Loss: 37.7839440703392, Train Accuracy: 0.88  Val Accuracy: 0.88
Epoch [4/20], Loss: 21.78274303674698, Train Accuracy: 1.00  Val Accuracy: 0.88
Epoch [5/20], Loss: 15.550272047519684, Train Accuracy: 1.00  Val Accuracy: 0.88
Epoch [6/20], Loss: 11.757794301956892, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [7/20], Loss: 11.023748081177473, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [8/20], Loss: 10.226082149893045, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [9/20], Loss: 9.451268069446087, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [10/20], Loss: 9.404732123017311, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [11/20], Loss: 9.367515539750457, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [12/20], Loss: 8.969753273762763, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [13/20], Los

In [215]:
def make_pred(model,text):
    text = clean_text(text)
    # Convert the text to a sequence of word indices
    text_indices = [vocab[word] for word in text.split()]
        
    # padding for same length sequence
    if len(text_indices)<max_words:
        text_indices = text_indices + [1]*(max_words - len(text_indices))
    text_indices = torch.tensor(text_indices).cuda()
    pred = model(text_indices.unsqueeze(0))

    print(idx2dis[pred.argmax(1).item()])

In [216]:
symp2 = "I've been itching a lot, and it's been accompanied with a rash that looks to be getting worse over time. \
There are also some patches of skin that are different colours from the rest of the skin,\
as well as some lumps that resemble little nodes."

make_pred(model_lstm, symp2)

Fungal infection


Though perfect evaluation score, not reliable due to very small dataset.  

  
**TODO**
- Hyperparameter Tuning/Search
- Test on real data
- Experiment with/without stopwords removal
- Use other methods of vectorization (Glove/Word2Vec embeddings)