In [2]:
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
import torch
import logging
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

logging.basicConfig(level=logging.ERROR)

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [4]:
data1= pd.read_csv('/kaggle/input/train-dataaa/train.csv')
data2= pd.read_csv('/kaggle/input/dataset1/Distill_Try.csv')

In [5]:
data1.head()

Unnamed: 0,Id,Title,Abstract,Categories
0,9707,Axiomatic Aspects of Default Inference,This paper studies axioms for nonmonotonic con...,['cs.LO']
1,24198,On extensions of group with infinite conjugacy...,We characterize the group property of being wi...,['math.GR']
2,35766,An Analysis of Complex-Valued CNNs for RF Data...,Recent deep neural network-based device classi...,"['cs.LG', 'cs.IT', 'eess.SP', 'math.IT']"
3,14322,On the reconstruction of the drift of a diffus...,The problem of reconstructing the drift of a d...,"['math.PR', 'math.ST', 'stat.TH']"
4,709,Three classes of propagation rules for GRS and...,"In this paper, we study the Hermitian hulls of...","['cs.IT', 'math.IT']"


In [6]:
data2.head()

Unnamed: 0,Id,Text,Tag,Tags
0,9707,axiomatic aspects default inference \n axioms ...,['cs.LO'],[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0...
1,24198,"extensions group infinite conjugacy classes, i...",['math.GR'],[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
2,35766,analysis complex-valued cnns rf data-driven wi...,"['cs.LG', 'cs.IT', 'eess.SP', 'math.IT']",[0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0...
3,14322,reconstruction drift diffusion from transition...,"['math.PR', 'math.ST', 'stat.TH']",[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
4,709,three classes propagation rules grs egrs codes...,"['cs.IT', 'math.IT']",[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0...


In [7]:
data = pd.DataFrame()
data['text'] = data1['Title'] + ". " + data1['Abstract']


In [12]:
data['text'] = data['text'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [18]:
data['labels'] = data2['Tags']

In [33]:
import ast

In [37]:
def string_to_array(label_string):
    cleaned_string = label_string.strip('[]').split()
    return np.array([int(num) for num in cleaned_string])

In [38]:
data['labels'] = data['labels'].apply(string_to_array)

In [39]:
type(data['labels'][0])

numpy.ndarray

In [40]:
# Your data preprocessing steps...

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)


In [41]:
class MultiLabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        ids = inputs['input_ids'].squeeze()
        mask = inputs['attention_mask'].squeeze()
        token_type_ids = inputs["token_type_ids"].squeeze()

        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


In [42]:
# Splitting data into train and test
train_size = 0.8
train_data, test_data = train_test_split(data, test_size=1-train_size, random_state=200)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


In [43]:
train_data

Unnamed: 0,text,labels
0,deployable reinforcement learning with variabl...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,biphasic face photo-sketch synthesis via seman...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,sequential information guided sensing. we stud...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,feature hashing for large scale multitask lear...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,a schlichtness theorem for envelopes of holomo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
40963,orthogonal layers of parallelism in large-scal...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
40964,private set intersection: a multi-message symm...,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ..."
40965,compilation of hpsg to tag. we present an impl...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40966,global well-posedness for kdv in sobolev space...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [44]:
MAX_LEN = 350
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 25
LEARNING_RATE = 1e-05

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [45]:
testing_loader

<torch.utils.data.dataloader.DataLoader at 0x7fe72f1bfbb0>

In [46]:
class RoBERTaClass(torch.nn.Module):
    def __init__(self):
        super(RoBERTaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.linear1 = torch.nn.Linear(768, 512)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear2 = torch.nn.Linear(512, 256)
        self.leaky_relu = torch.nn.LeakyReLU()
        self.linear3 = torch.nn.Linear(256, 64)
        self.tanh = torch.nn.Tanh()
        self.classifier = torch.nn.Linear(64, 57)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1.last_hidden_state
        pooler = hidden_state[:, 0]

        linear1_output = self.linear1(pooler)
        linear1_output = self.dropout(linear1_output)

        linear2_output = self.linear2(linear1_output)
        linear2_output = self.leaky_relu(linear2_output)

        linear3_output = self.linear3(linear2_output)
        linear3_output = self.leaky_relu(linear3_output)

        output = self.classifier(linear3_output)
        return output

model = RoBERTaClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [47]:

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


In [48]:
def train(epoch):
    model.train()
    total_loss = 0
    
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        total_loss += loss.item()

        if _ % 500 == 0:
            print(f'Epoch: {epoch}, Iteration: {_}, Loss: {loss.item()}')

        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Average Loss: {total_loss / len(training_loader)}')

    # Save the model after every epoch
    torch.save(model.state_dict(), f'roberta_model_epoch_{epoch+1}.pt')


In [None]:
# Training loop
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Iteration: 0, Loss: 0.6957668662071228


36it [00:38,  1.05s/it]