In [1]:
import os
import shutil
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
# Location of the training CSV. Set the TRAIN_CSV_PATH environment variable to
# point at your own copy of the file; when it is unset the original author's
# local path is used, so existing behaviour is unchanged.
train_path = os.environ.get(
    "TRAIN_CSV_PATH",
    "C:\\Sowmya\\Personal\\PYTORCH\\Pytorch_stuffs\\BERT\\Multi Label Text Classification using BERT PyTorch\\train.csv",
)

In [3]:
# Read the training CSV at `train_path` into a DataFrame and preview the first rows.
df = pd.read_csv(train_path)
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['ID', 'TITLE', 'ABSTRACT', 'Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance'],
      dtype='object')

In [5]:
# Look at one raw abstract (row label 1) to see what the text looks like.
df.loc[1, 'ABSTRACT']

'  Rotation invariance and translation invariance have great values in image\nrecognition tasks. In this paper, we bring a new architecture in convolutional\nneural network (CNN) named cyclic convolutional layer to achieve rotation\ninvariance in 2-D symbol recognition. We can also get the position and\norientation of the 2-D symbol by the network to achieve detection purpose for\nmultiple non-overlap target. Last but not least, this architecture can achieve\none-shot learning in some cases using those invariance.\n'

In [6]:
# Merge title and abstract into one CONTEXT text column and drop the columns
# that are no longer needed. The guard keeps this cell safe to re-run: once
# TITLE/ABSTRACT have been dropped there is nothing left to merge, so the frame
# is left as it is instead of raising a KeyError.
if {'TITLE', 'ABSTRACT'}.issubset(df.columns):
    df = (
        df
        .assign(CONTEXT=df['TITLE'] + ". " + df['ABSTRACT'])
        .drop(columns=['ID', 'TITLE', 'ABSTRACT'])
    )
df.head(2)

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,CONTEXT
0,1,0,0,0,0,0,Reconstructing Subject-Specific Effect Maps. ...
1,1,0,0,0,0,0,Rotation Invariance Neural Network. Rotation...


In [7]:
# Put the text column first, followed by the six label columns.
column_order = [
    'CONTEXT',
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
df = df.loc[:, column_order]
df.head()

Unnamed: 0,CONTEXT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps. ...,1,0,0,0,0,0
1,Rotation Invariance Neural Network. Rotation...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0
3,A finite element approximation for the stochas...,0,0,1,0,0,0
4,Comparative study of Discrete Wavelet Transfor...,1,0,0,1,0,0


In [8]:
# Names of the label columns, in the order they appear in the frame.
target_list = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

In [9]:
# Hyper-parameters used by the cells below.
MAX_LEN = 512            # token length every example is padded/truncated to
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 2
# NOTE(review): the optimizer cell in this notebook passes lr=2e-5 literally and
# does not read LEARNING_RATE -- confirm which value is intended.
LEARNING_RATE = 1e-5

In [10]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Load the tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint
# name the classifier below loads its encoder from).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [12]:
class CustomDataset(Dataset):
    """Map-style dataset yielding tokenized text and multi-label targets.

    Args:
        df: DataFrame with a 'CONTEXT' text column and one column per label.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded example is padded/truncated to.
        target_cols: Optional list of label column names. When omitted, the
            notebook-level ``target_list`` is used, as before.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        label_columns = target_list if target_cols is None else list(target_cols)
        self.tweet = self.df['CONTEXT'].values
        self.targets = self.df[label_columns].values

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the text, encoded tensors and float targets for row ``item``."""
        # Collapse every run of whitespace (including newlines) to one space.
        tweet = " ".join(self.tweet[item].split())

        # Use the tokenizer handed to the constructor, not a notebook global,
        # so the dataset honours whatever tokenizer the caller supplied.
        encodings = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            "tweet": tweet,
            # return_tensors='pt' adds a leading batch axis; flatten() removes it
            # so the DataLoader can stack examples itself.
            "input_ids": encodings["input_ids"].flatten(),
            "attention_mask": encodings["attention_mask"].flatten(),
            "targets": torch.tensor(self.targets[item], dtype=torch.float)
        }



In [13]:
# 80% train; the remaining 20% is split evenly into validation and test.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

In [14]:
# Row counts of the train / validation / test splits.
tuple(len(split) for split in (train_df, val_df, test_df))

(16777, 2097, 2098)

In [15]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a DataFrame in a ``CustomDataset`` and return a ``DataLoader`` over it.

    Args:
        df: Frame passed to ``CustomDataset``.
        tokenizer: Tokenizer passed to ``CustomDataset``.
        max_len: Maximum sequence length passed to ``CustomDataset``.
        batch_size: Number of examples per batch.
        shuffle: Forwarded to ``DataLoader``. Defaults to False, which keeps
            the previous behaviour (batches in frame order); pass True for a
            training loader that should be reshuffled every epoch.

    Returns:
        A ``DataLoader`` that loads in the main process (``num_workers=0``).
    """
    dataset = CustomDataset(df, tokenizer, max_len)

    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0
    )

In [16]:
# One loader per split. The test split is only ever evaluated, so it uses the
# evaluation batch size like the validation split rather than the training one.
train_dataloader = create_data_loader(train_df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)
val_dataloader = create_data_loader(val_df, tokenizer, MAX_LEN, VALID_BATCH_SIZE)
test_dataloader = create_data_loader(test_df, tokenizer, MAX_LEN, VALID_BATCH_SIZE)

In [17]:
# Number of batches in each loader.
tuple(len(loader) for loader in (train_dataloader, val_dataloader, test_dataloader))

(1049, 132, 132)

In [18]:
# Number of label columns; used as the output width of the classifier head.
n_target_list = len(target_list)
n_target_list

6

In [19]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
# `torch` itself is a module and cannot be called; `torch.device(...)` is the
# constructor, so this also works on machines where CUDA is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

'cpu'

In [20]:
class TweetClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The head emits one raw logit per label (``n_target_list`` outputs); no
    sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=0.3)
        # Read the encoder width from its config instead of hard-coding 768 so
        # the head always matches the loaded checkpoint.
        self.linear = nn.Linear(self.bert_model.config.hidden_size, n_target_list)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the encoder's pooled output.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
        """
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is the element the original code read as outputs[1].
        pooled = self.dropout(outputs.pooler_output)
        return self.linear(pooled)

In [21]:
# Build the classifier and move its parameters to the selected device
# (Module.to returns the module itself, so the call can be chained).
model = TweetClassifier().to(device)
model

TweetClassifier(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [22]:
# One optimizer step is taken per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
total_steps = EPOCHS * len(train_dataloader)

# Loss that takes raw logits and float targets.
loss_fn = nn.BCEWithLogitsLoss()
loss_fn = loss_fn.to(device)

# NOTE(review): lr is the literal 2e-5 here; the LEARNING_RATE constant defined
# earlier in the notebook is not used -- confirm which value is intended.
# NOTE(review): this AdamW is the one imported from `transformers`
# (it accepts `correct_bias`); confirm it is still available in the installed version.
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

# Linear schedule with no warm-up over the whole run.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [27]:
def model_train(model, data_loader, loss_fn, optimizer, scheduler, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches with "input_ids",
            "attention_mask" and "targets" tensors.
        loss_fn: Callable invoked as ``loss_fn(predictions, targets)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values, or NaN if the loader yields no batches.
    """
    model = model.train()

    losses = []

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        outputs = model(input_ids, attention_mask)

        # Model outputs (logits) go first, ground-truth labels second: this is
        # the (input, target) order nn.BCEWithLogitsLoss expects.
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    if not losses:
        return float("nan")
    return np.mean(losses)

In [28]:
def model_eval(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` and return the mean batch loss.

    The model is put in eval mode and gradients are disabled for the pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches with "input_ids",
            "attention_mask" and "targets" tensors.
        loss_fn: Callable invoked as ``loss_fn(predictions, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values, or NaN if the loader yields no batches.
    """
    model = model.eval()

    losses = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            outputs = model(input_ids, attention_mask)

            # Model outputs (logits) go first, ground-truth labels second: this
            # is the (input, target) order nn.BCEWithLogitsLoss expects.
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

    if not losses:
        return float("nan")
    return np.mean(losses)
            

In [29]:
from collections import defaultdict

In [30]:
# Per-epoch train/validation losses, keyed by metric name.
history = defaultdict(list)
# Lowest validation loss seen so far; starts at +inf so the first epoch always
# produces a checkpoint.
loss = np.inf

# Epochs are numbered 1..EPOCHS inclusive. The upper bound is EPOCHS + 1 so the
# loop really runs EPOCHS passes, matching both the "epoch/EPOCHS" progress line
# and the scheduler, whose total step count is len(train_dataloader) * EPOCHS.
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    print("*" * 10)

    train_loss = model_train(model,
                             train_dataloader,
                             loss_fn,
                             optimizer,
                             scheduler,
                             device)

    print(f"Train loss {train_loss}")

    val_loss = model_eval(model,
                          val_dataloader,
                          loss_fn,
                          device)

    print(f"Validation loss {val_loss}")

    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)

    # Keep only the weights of the epoch with the best validation loss.
    if val_loss < loss:
        torch.save(model.state_dict(), "best_model_state.bin")
        loss = val_loss

Epoch 1/2
**********
