In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification

In [2]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [3]:
train_df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
texts = train_df["TITLE"].values+": "+train_df["ABSTRACT"].values
labels = train_df.iloc[:,3:].values

In [5]:
texts[0]

"Reconstructing Subject-Specific Effect Maps:   Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that can be used

In [6]:
labels[0]

array([1, 0, 0, 0, 0, 0])

In [7]:
label_names = train_df.iloc[:,3:].columns
label_names

Index(['Computer Science', 'Physics', 'Mathematics', 'Statistics',
       'Quantitative Biology', 'Quantitative Finance'],
      dtype='object')

In [8]:
# Step 1: Prepare the data
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        labels = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(labels)
        }

In [9]:
# Step 2: Initialize BERT model
num_labels = len(label_names)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=num_labels,
                                                      problem_type="multi_label_classification")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Step 3: Create DataLoader
dataset = MultiLabelDataset(texts, labels, tokenizer, max_len=128)
data_loader = DataLoader(dataset, batch_size=16, num_workers=2)

In [11]:
dataset[1]

{'text': 'Rotation Invariance Neural Network:   Rotation invariance and translation invariance have great values in image\nrecognition tasks. In this paper, we bring a new architecture in convolutional\nneural network (CNN) named cyclic convolutional layer to achieve rotation\ninvariance in 2-D symbol recognition. We can also get the position and\norientation of the 2-D symbol by the network to achieve detection purpose for\nmultiple non-overlap target. Last but not least, this architecture can achieve\none-shot learning in some cases using those invariance.\n',
 'input_ids': tensor([  101,  9963,  1999, 10755, 28335, 15756,  2897,  1024,  9963,  1999,
         10755, 28335,  1998,  5449,  1999, 10755, 28335,  2031,  2307,  5300,
          1999,  3746,  5038,  8518,  1012,  1999,  2023,  3259,  1010,  2057,
          3288,  1037,  2047,  4294,  1999,  9530,  6767,  7630,  3508,  2389,
         15756,  2897,  1006, 13229,  1007,  2315, 23750,  9530,  6767,  7630,
          3508,  2389, 

In [None]:
# Step 4: Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

Epoch 1/3, Loss: 0.1801159530878067


In [1]:
# Step 5: Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)S
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.sigmoid(logits).cpu().numpy()
        predictions.extend(predicted_labels > 0.5)
        true_labels.extend(labels.numpy())

f1 = metrics.f1_score(true_labels, predictions, average='micro')
print(f'Micro F1 Score: {f1}')

NameError: name 'model' is not defined

In [None]:
# Step 5: Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.sigmoid(logits).cpu().numpy()
        predictions.extend(predicted_labels > 0.5)
        true_labels.extend(labels.numpy())

f1 = metrics.f1_score(true_labels, predictions, average='micro')
print(f'Micro F1 Score: {f1}')

In [None]:
torch.save(model, "./data/model.pth")