In [2]:

import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/biomedical-text-publication-classification/alldata_1_for_kaggle.csv


In [3]:
import torch
import torch.optim as opt
import torch.nn as nn
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import train_test_split

In [4]:
df=pd.read_csv('/kaggle/input/biomedical-text-publication-classification/alldata_1_for_kaggle.csv',encoding='latin1')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,a
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


In [6]:
df['0'].unique()

array(['Thyroid_Cancer', 'Colon_Cancer', 'Lung_Cancer'], dtype=object)

In [7]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [8]:
df.rename(columns={'0':'class','a':'text'},inplace=True)

In [9]:
df.shape

(7570, 2)

In [10]:
df.isnull().sum()

class    0
text     0
dtype: int64

In [11]:
def tokenize(text):
    return text.lower().split()

vocab=Counter()

for sentence in df['text']:
    vocab.update(tokenize(sentence))
vocab={word:idx+1 for idx,(word,_) in enumerate(vocab.most_common())}

In [12]:
def encoded_sentences(sentence,vocab):
    return [vocab.get(word,0) for word in tokenize(sentence)]

encoded_sentences=[encoded_sentences(sentence,vocab) for sentence in df['text']]

max_len=max(len(seq) for seq in encoded_sentences)
encoded_sentences=[seq+[0] * (max_len-len(seq)) for seq in encoded_sentences]

le = LabelEncoder()
encoded_labels = le.fit_transform(df['class'])
encoded_labels = np.array(encoded_labels)
encoded_labels = encoded_labels.astype(np.float32)


In [13]:
x_train,x_test,y_train,y_test=train_test_split(encoded_sentences,encoded_labels,test_size=0.2,random_state=42)

len(x_train),len(x_test)

(6056, 1514)

In [14]:
def get_default_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)


class DeviceDataLoader():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        for b in self.dl: 
            yield to_device(b, self.device)
    def __len__(self):
        return len(self.dl)

In [15]:
class SampleDataset(Dataset):
    def __init__(self,sentences,labels):
        self.sentences=torch.tensor(sentences,dtype=torch.long)
        self.labels=torch.tensor(labels,dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self,idx):
        return self.sentences[idx],self.labels[idx]


train_data=SampleDataset(x_train,y_train)
test_data=SampleDataset(x_test,y_test)

train_loader=DataLoader(train_data,batch_size=2,shuffle=True)
test_loader=DataLoader(test_data,batch_size=2)

In [16]:
device=get_default_device()
device

device(type='cuda')

In [17]:
train_loader=DeviceDataLoader(train_loader,device)
teat_loader=DeviceDataLoader(test_loader,device)


In [18]:
class CancerClassifierRNN(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim):
        super(CancerClassifierRNN,self).__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        self.rnn=nn.RNN(embedding_dim,hidden_dim,batch_first=True)
        self.fc=nn.Linear(hidden_dim,output_dim)

    def forward(self,x):
        embadded=self.embedding(x)
        _,hidden=self.rnn(embadded)
        output=self.fc(hidden.squeeze(0))
        return torch.sigmoid(output)

In [19]:
vocab_size=len(vocab)+1
embedding_dim=50
hidden_dim=64
output_dim=3

model=CancerClassifierRNN(vocab_size,embedding_dim,hidden_dim)
criterion=nn.CrossEntropyLoss()
optimizer=opt.Adam(model.parameters(),lr=0.001)

In [20]:
model=to_device(model,device)

In [21]:
from tqdm.notebook import tqdm

n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    epoch_correct = 0
    total_samples = 0

    for sentences, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(sentences)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()


    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

  0%|          | 0/3028 [00:00<?, ?it/s]

Epoch 1, Loss: 1.0971


  0%|          | 0/3028 [00:00<?, ?it/s]

Epoch 2, Loss: 1.0957


  0%|          | 0/3028 [00:00<?, ?it/s]

Epoch 3, Loss: 1.0970


  0%|          | 0/3028 [00:00<?, ?it/s]

Epoch 4, Loss: 1.0954


  0%|          | 0/3028 [00:00<?, ?it/s]

Epoch 5, Loss: 1.0919


In [64]:
model.eval()
all_preds = []
all_labels = []
predicted_classes=[]
true_labels= []
with torch.no_grad():
    for sentences, labels in test_loader:
        sentences = sentences.to(device)
        labels = labels.to(device)
        # print(labels)
        # break

        outputs = model(sentences)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        class_labels = le.inverse_transform(preds.cpu().numpy())
        predicted_classes.extend(class_labels)
        labels = le.inverse_transform(labels.cpu().numpy())

        true_labels.extend(labels)

In [65]:
torch.save(model.state_dict(), "cancerclassifier.pth")


In [66]:
df1=pd.DataFrame({'predicted_classes':predicted_classes,'original_class':true_labels})
    

In [67]:
df1

Unnamed: 0,predicted_classes,original_class
0,Colon_Cancer,Colon_Cancer
1,Colon_Cancer,Colon_Cancer
2,Thyroid_Cancer,Thyroid_Cancer
3,Colon_Cancer,Colon_Cancer
4,Colon_Cancer,Lung_Cancer
...,...,...
1509,Thyroid_Cancer,Thyroid_Cancer
1510,Colon_Cancer,Colon_Cancer
1511,Thyroid_Cancer,Colon_Cancer
1512,Thyroid_Cancer,Colon_Cancer


In [80]:
# torch.tensor(torch.sum(df1['predicted_classes']==df1['original_class']).item()/len(df1['predicted_classes']))

In [79]:
sum(df1['predicted_classes']==df1['original_class'])/len(df1['predicted_classes'])

0.3725231175693527