In [1]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd
import json
import gc
import re
import string

In [4]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
import pandas as pd
from google.colab import files
uploaded=files.upload()

Saving trainingdata_updated_testing.xlsx to trainingdata_updated_testing (5).xlsx


In [6]:
df= pd.read_excel('/content/trainingdata_updated_testing.xlsx')
df.columns=['headline','sentiment']
df.dropna(inplace=True)

possible_labels = df.sentiment.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df['label'] = df.sentiment.replace(label_dict)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 813 entries, 0 to 821
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headline   813 non-null    object
 1   sentiment  813 non-null    object
 2   label      813 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 25.4+ KB


In [7]:
df['label'].unique()

array([0, 1, 2])

In [8]:
df

Unnamed: 0,headline,sentiment,label
0,Apex court stops power gencos from cutting off...,negative,0
1,Price ceiling on power bourses to stay till De...,negative,0
2,BHEL bags 2x660 MW thermal power project from ...,positive,1
3,Centre announces one-time compensation of Rs 2...,positive,1
4,Overseas oil & gas fields' output dips 22% in ...,negative,0
...,...,...,...
817,"Truepill, a digital health unicorn, conducts f...",negative,0
818,"With new cuts, Klarna joins the ranks of compa...",negative,0
819,Better.com planning fourth layoff in less than...,negative,0
820,"TechCrunch+ roundup: SaaS success stats, lever...",negative,0


In [9]:
# Some basic helper functions to clean text by removing urls, emojis, html tags and punctuations.

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Applying helper functions

def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text



df["headline"] = df['headline'].apply(
    lambda x: ' '.join([w for w in str(x).strip().replace("\n", "").replace("\t", "").lower().split()])
).apply(lambda x: remove_URL(x)).apply(lambda x: remove_emoji(x)).apply(lambda x: remove_html(x)).apply(lambda x: remove_punct(x))

In [10]:
from sklearn.model_selection import train_test_split

X = df.headline.values
y = df.label.values

X_train, X_val, y_train, y_val =train_test_split(X, y, test_size=0.1, random_state=2020)

In [11]:
import numpy as np

In [12]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def preprocessing_for_bert(data):
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    #labels=torch.tensor(data.label.values)

    return input_ids, attention_masks#,labels

In [13]:
all_tweets = np.concatenate([df.headline.values])
# Encode our concatenated data
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_tweets]
# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)


Max length:  37


In [14]:
MAX_LEN = 37

# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[0])
print('Token IDs: ', token_ids)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks= preprocessing_for_bert(X_train)
val_inputs, val_masks= preprocessing_for_bert(X_val)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  apex court stops power gencos from cutting off supply to bses
Token IDs:  [101, 13450, 2457, 6762, 2373, 8991, 13186, 2013, 6276, 2125, 4425, 2000, 18667, 2229, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokenizing data...




In [15]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 4

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [16]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    # class BERT_architecture(nn.Module):
  
    # def __init__(self, bert):
        
    #   super(BERT_architecture, self).__init__()
  
    #   self.bert = bert 
        
    #   # dropout layer
    #   self.dropout = nn.Dropout(0.2)
        
    #   # relu activation function
    #   self.relu =  nn.ReLU()
  
    #   # dense layer 1
    #   self.fc1 = nn.Linear(768,512)
        
    #   # dense layer 2 (Output layer)
    #   self.fc2 = nn.Linear(512,2)
  
    #   #softmax activation function
    #   self.softmax = nn.LogSoftmax(dim=1)


    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=True):
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 1024, 512, 3

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained("bert-large-uncased")

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 31.2 ms, sys: 4.96 ms, total: 36.2 ms
Wall time: 37.1 ms


In [17]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=2e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [18]:
def print_classwise(outputs, targets):
    cm = sklearn.metrics.confusion_matrix(targets, outputs)
    # negative:0, neutral:1, positive:2
    ogs = ['negative', 'positive', 'neutral']
    print()
    print('='*50)
    for i in range(cm.shape[0]):
        print(ogs[i], ':', cm[i][i]/sum(cm[i]))
    print('='*50)
    print()

In [19]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:


            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy ,label, pred= evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
            cm = sklearn.metrics.confusion_matrix(label,pred)
    # negative:0, neutral:1, positive:2
            ogs = ['negative', 'positive', 'neutral']
            print()
            print('='*50)
            for i in range(cm.shape[0]):
              print(ogs[i], ':', cm[i][i]/sum(cm[i]))

        print("\n")
    
    print("Training complete!")


In [20]:
from sklearn.metrics import confusion_matrix


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    val_pred=[]
    val_label=[]


    # For each batch in our validation set...
    for batch in val_dataloader:
        
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)


        val_label.extend(b_labels.cpu().detach().numpy())
        val_pred.extend(preds.cpu().detach().numpy())

        # val_pred.append(preds)
        # val_labels.append(b_labels)


    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    #from sklearn.metrics import confusion_matrix

    # confusion = confusion_matrix(val_labels, val_pred)

    # print('Confusion Matrix\n')

    # print(confusion)
    

    return val_loss, val_accuracy,val_label,val_pred

In [21]:
import sklearn
print(sklearn.__version__)

1.0.2


In [22]:
import numpy as np
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=6)
train(bert_classifier, train_dataloader, val_dataloader, epochs=6, evaluation=True)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   1.092620   |     -      |     -     |   8.61   
   1    |   40    |   1.008804   |     -      |     -     |   7.10   
   1    |   60    |   0.766378   |     -      |     -     |   7.17   
   1    |   80    |   0.793921   |     -      |     -     |   7.20   
   1    |   100   |   0.693154   |     -      |     -     |   7.17   
   1    |   120   |   0.705421   |     -      |     -     |   7.19   
   1    |   140   |   0.595431   |     -      |     -     |   7.22   
   1    |   160   |   0.436135   |     -      |     -     |   7.20   
   1    |   180   |   0.721626   |     -      |     -     |   7.21   
   1    |   182   |   0.855837   |     -      |     -     |   0.71   
----------------------------------------------------------------------
   1    |    -    |   0.759968   |  0.629683  |   73.81   |   67.95  