In [69]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [70]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [71]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [72]:
# Load the tab-separated news corpus. Column names are supplied explicitly;
# only the headline and its category code are used downstream, so `usecols`
# avoids parsing and holding the other six columns in memory.
df = pd.read_csv(
    '../input/newsCorpora.csv',
    sep='\t',
    names=['ID','TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
    usecols=['TITLE', 'CATEGORY'],
)

# Fix the column order and take an independent copy, so the column assignments
# in later cells modify this frame itself rather than a derived slice.
df = df[['TITLE', 'CATEGORY']].copy()
df.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [73]:
# Single-letter codes found in the raw CATEGORY column and the readable
# names that replace them.
my_dict = dict(
    e='Entertainment',
    b='Business',
    t='Science',
    m='Health',
)


def update_cat(x):
    """Return the readable category name for the code ``x``.

    Raises ``KeyError`` when ``x`` is not a key of ``my_dict``.
    """
    label = my_dict[x]
    return label

# Replace every category code with its readable name.
df['CATEGORY'] = df['CATEGORY'].apply(update_cat)

# Category name -> integer id, filled in order of first appearance.
encode_dict = {}


def encode_cat(x):
    """Return the integer id of category ``x``.

    A category seen for the first time is registered in ``encode_dict`` with
    the next unused id (the current size of the dictionary).
    """
    return encode_dict.setdefault(x, len(encode_dict))

# Integer class id for every row, derived from the readable category name.
df['ENCODE_CAT'] = df['CATEGORY'].apply(encode_cat)

In [74]:
# Preview the first rows to check the renamed categories and their integer ids.
df.head(3)

Unnamed: 0,TITLE,CATEGORY,ENCODE_CAT
0,"Fed official says weak data caused by weather,...",Business,0
1,Fed's Charles Plosser sees high bar for change...,Business,0
2,US open: Stocks fall after Fed official hints ...,Business,0


In [75]:
# Number of rows per encoded category, to check the class balance.
df["ENCODE_CAT"].value_counts()

2    152469
0    115967
1    108344
3     45639
Name: ENCODE_CAT, dtype: int64

In [76]:
# Every title is padded or truncated to MAX_LEN tokens by the Triage dataset.
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4  # batch size of the training DataLoader
VALID_BATCH_SIZE = 2  # batch size of the held-out DataLoader
EPOCHS = 1  # number of passes made by the training loop
LEARNING_RATE = 1e-05  # learning rate handed to the optimizer
# Pretrained tokenizer for the same "bert-base-uncased" checkpoint the model loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [77]:
class Triage(Dataset):
    """Dataset that turns rows of a headline DataFrame into model-ready tensors.

    Args:
        dataframe: frame with a ``TITLE`` column (text) and an ``ENCODE_CAT``
            column (integer class id).
        tokenizer: object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded title is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.len

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``) and ``targets`` (scalar long tensor).
        """
        # Positional access: samplers hand out 0..len-1, which matches the index
        # labels only when the frame's index has been reset. `.iloc` makes the
        # lookup correct for any index.
        title = str(self.data["TITLE"].iloc[index])
        # Collapse runs of whitespace (including newlines/tabs) into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated `pad_to_max_length=True` flag.
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        label = int(self.data["ENCODE_CAT"].iloc[index])

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(label, dtype=torch.long),
        }

In [78]:
# Fraction of the rows that goes into the training split.
train_size = 0.8
# A fixed random_state makes the sampled training rows the same on every run.
train_dataset = df.sample(frac=train_size, random_state=200)
# Order matters: the held-out rows are found by dropping the sampled index labels
# from df, so this must run before train_dataset's index is reset on the next line.
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"full dataset: {df.shape}")
print(f"train dataset: {train_dataset.shape}")
print(f"test dataset: {test_dataset.shape}")

# Wrap both splits so they can be fed to a DataLoader.
train_set = Triage(train_dataset, tokenizer, MAX_LEN)
test_set = Triage(test_dataset, tokenizer, MAX_LEN)

full dataset: (422419, 3)
train dataset: (337935, 3)
test dataset: (84484, 3)


In [79]:
# Training batches are reshuffled every epoch.
train_param = {
    "batch_size": TRAIN_BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0,
}

# Evaluation does not benefit from shuffling; a fixed order keeps the
# intermediate validation logs reproducible between runs.
test_param = {
    "batch_size": VALID_BATCH_SIZE,
    "shuffle": False,
    "num_workers": 0,
}

train_loader = DataLoader(train_set, **train_param)
test_loader = DataLoader(test_set, **test_param)
# The loader was originally bound to the misspelled name `test_loadeer`;
# keep it as an alias so existing references continue to resolve.
test_loadeer = test_loader

In [80]:
class BERTClass(nn.Module):
    """Pretrained BERT encoder followed by dropout and a 4-way linear classifier."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = nn.Dropout(0.3)
        # 768 encoder features in, one score per news category out.
        self.dense = nn.Linear(768, 4)

    def forward(self, ids, mask, token_type_ids):
        """Return unnormalised class scores for a batch.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask passed as ``attention_mask``.
            token_type_ids: segment ids passed to the encoder.

        Returns:
            Tensor produced by the final linear layer (4 scores per example).
        """
        encoder_out = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        # Take the second element (the pooled output) by position. This works
        # for a plain tuple and for a ModelOutput-style result, whereas tuple
        # unpacking of the latter iterates over its field names instead of its
        # tensors.
        pooled = encoder_out[1]
        dropped = self.drop(pooled)
        return self.dense(dropped)

# Build the classifier and move its parameters to the selected device.
model = BERTClass().to(device)
model

BERTClass(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [81]:
# Multi-class objective applied to the raw scores returned by the model.
loss_function = nn.CrossEntropyLoss()
# Adam updates every parameter of the model, encoder and linear head alike.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [82]:
def calculate_accuracy(big_idx, targets):
    """Count the positions where ``big_idx`` equals ``targets``.

    Despite the name, the result is the number of matches as a Python
    number, not a ratio; callers accumulate it and divide later.
    """
    matches = torch.eq(big_idx, targets)
    return matches.sum().item()

In [83]:
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Relies on the notebook-level ``model``, ``loss_function``, ``optimizer``
    and ``device``. Prints the running mean loss and accuracy every 5000
    batches and the totals for the epoch once the pass is finished.

    Args:
        epoch: epoch number, used only in the printed summary.
    """
    train_loss = 0
    num_correct = 0
    train_steps = 0
    train_examples = 0
    tensor_keys = ("ids", "mask", "token_type_ids", "targets")

    model.train()
    for step, batch in enumerate(train_loader):
        # Move the four tensors of the batch onto the compute device.
        ids, mask, token_type_ids, targets = (
            batch[key].to(device, dtype=torch.long) for key in tensor_keys
        )

        logits = model(ids, mask, token_type_ids)
        loss = loss_function(logits, targets)
        train_loss += loss.item()

        # torch.max along dim=1 yields (values, indices); the indices are the
        # predicted class ids.
        predicted = torch.max(logits.data, dim=1)[1]
        num_correct += calculate_accuracy(predicted, targets)

        train_steps += 1
        train_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = train_loss / train_steps
            accuracy_step = (num_correct*100) / train_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accuracy_step}")

        # Clear stale gradients, back-propagate this batch, apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(num_correct*100) / train_examples}')
    epoch_loss = train_loss / train_steps
    epoch_accuracy = (num_correct*100) / train_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accuracy}")

In [84]:
# Fine-tune for EPOCHS full passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 5000 steps: 1.4588730335235596
Training Accuracy per 5000 steps: 25.0
Training Loss per 5000 steps: 0.3574697999358628
Training Accuracy per 5000 steps: 87.70245950809839


KeyboardInterrupt: 

In [None]:
def valid(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return the accuracy in percent.

    Runs in eval mode with gradient tracking disabled, printing the running
    mean loss and accuracy every 100 batches and the final figures at the end.
    Relies on the notebook-level ``device``, ``loss_function`` and
    ``calculate_accuracy``.

    Args:
        model: callable as ``model(ids, mask, token_type_ids)`` returning one
            score per class for every example.
        test_loader: iterable of batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.

    Returns:
        Percentage of examples whose highest-scoring class equals the target.
    """
    total_loss = 0
    num_correct = 0
    num_steps = 0
    num_examples = 0
    model.eval()
    with torch.no_grad():
        for step, data in enumerate(test_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            # Index of the largest score along dim=1 is the predicted class.
            _, big_idx = torch.max(outputs, dim=1)
            num_correct += calculate_accuracy(big_idx, targets)

            num_steps += 1
            num_examples += targets.size(0)

            if step % 100 == 0:
                loss_step = total_loss / num_steps
                accuracy_step = (num_correct*100) / num_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accuracy_step}")

    # Each accumulated loss value is one batch's result, so the epoch figure is
    # averaged over batches -- the same denominator as the running log above
    # and as the training loop -- not over individual examples.
    epoch_loss = total_loss / num_steps
    epoch_accuracy = (num_correct*100) / num_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accuracy}")

    return epoch_accuracy

In [None]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

# The held-out DataLoader is bound to the name `test_loadeer` (sic) in the
# cell that builds the loaders; no `testing_loader` exists in this notebook.
acc = valid(model, test_loadeer)
print("Accuracy on test data = %0.2f%%" % acc)

In [None]:
# NOTE(review): the file names say "distilbert" although the model loaded above is
# "bert-base-uncased"; the paths are left unchanged in case other code expects them.
output_model_file = './models/pytorch_distilbert_news.bin'
output_vocab_file = './models/vocab_distilbert_news.bin'

# Make sure the target directories exist before anything is written into them.
os.makedirs(os.path.dirname(output_model_file), exist_ok=True)
os.makedirs(os.path.dirname(output_vocab_file), exist_ok=True)

# NOTE(review): this pickles the whole module object, so loading it later needs the
# BERTClass definition to be importable; saving `model.state_dict()` is the more
# portable alternative if consumers can be updated accordingly.
model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')