In [74]:
import pandas as pd
import numpy as np
import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [75]:
data = pd.read_csv(r'C:\Users\Dell\Downloads\News (1)\ireland-news-headlines.csv')
data['headline_category'] = data.apply(lambda x: x['headline_category'].split('.')[0], axis = 1)
data = data.dropna()
data = data.sample(frac = 0.05)

targets = list(data['headline_category'].unique())
max_len = max([len(i) for i in list(data['headline_text'])])

one_hot = pd.get_dummies(data['headline_category'])
data = data.join(one_hot)
data = data.drop('headline_category', axis = 1)

In [76]:
train_index = data.sample(int(data.shape[0]*0.6)).index
train_df = data.loc[train_index]
data = data.drop(train_index).reset_index(drop = True)
validate_index = data.sample(int(data.shape[0]*0.5)).index
validate_df = data.loc[validate_index]
test_df = data.drop(validate_index)
del data, validate_index, train_index, one_hot

In [77]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [78]:
class CustomDataHandler(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = self.df[targets].values
        self.headlines = list(self.df.headline_text)
        
    def __len__(self):
        return len(self.headlines)
    
    def __getitem__(self, index):
        headline = str(self.headlines[index])
        headline = " ".join(headline.split())
        inputs = self.tokenizer.encode_plus(
            headline,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            return_token_type_ids = True,
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        return {
            'input_ids':inputs['input_ids'].flatten(),
            'attention_mask':inputs['attention_mask'].flatten(),
            'token_type_ids':inputs['token_type_ids'].flatten(),
            'targets':torch.FloatTensor(self.targets[index])
        }

In [79]:
train_dataset = CustomDataHandler(train_df,tokenizer, max_len)
validate_dataset = CustomDataHandler(validate_df,tokenizer, max_len)

In [80]:
trainloader = DataLoader(train_dataset, shuffle = True, batch_size =256)
validateloader = DataLoader(validate_dataset, shuffle = False, batch_size =32)

In [81]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [82]:
class pretrained_BERT(nn.Module):
    def __init__(self,):
        super(pretrained_BERT,self).__init__()
        self.bert_mod = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(768,6)
    
    def forward(self, inputs_ids, attention_mast, token_type_ids):
        output = self.bert_mod(inputs_ids, attention_mast, token_type_ids)
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        
        return output

In [83]:
model = pretrained_BERT()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


pretrained_BERT(
  (bert_mod): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [84]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitLoss()(outputs, targets)

optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-05)

In [87]:
def train(epochs, trainloader, validateloader, model, optimizer):
    train_loss = []
    validate_loss = []
    for e in range(1,epochs+1):
        print(e)
        model.train()
        for ind, batch in enumerate(trainloader):
            
            input_ids = batch['input_ids'].to(device, dtype = torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.float)
            output = model(input_ids, attention_mask, token_type_ids)
            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        
        model.eval()
        with torch.no_grad():
            for index, batch in enumerate(validateloader):
                input_ids = batch['input_ids'].to(device, dtype = torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype = torch.long)
                token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
                targets = batch['targets'].to(device, dtype = torch.float)
                output = model(input_ids, attention_mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                validate_loss.append(loss.item())
    #plot graphs
            
    return model

In [88]:
trained_model = train(1, trainloader, validateloader, model, optimizer)

TypeError: 'module' object is not callable