In [13]:
# !pip install datasets transformers[sentencepiece]

In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [8]:
# import necessary libraries 

import torch  
import numpy as np 
import pandas as pd
import torch.nn as nn
from tqdm import tqdm 
from IPython import display 
import matplotlib.pyplot as plt 
display.set_matplotlib_formats('svg')

In [9]:
if torch.cuda.is_available():      # if the GPU(cuda) is available 
    device = torch.device('cuda')  # It assign all the varialbes and function to the GPU
    
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else: 
    print("NO GPU available. So, switched to CPU")
    device = torch.device('cpu')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [17]:
data = pd.read_csv('/content/drive/MyDrive/TRUE FOUNDRY INTERNSHIP/airline_sentiment_analysis.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [18]:
dic = {'negative': 0, 'positive': 1}

data['airline_sentiment'] = data['airline_sentiment'].map(dic)

In [19]:
data = data[['text', 'airline_sentiment']]
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica plus you've added commercials t...,1
1,@VirginAmerica it's really aggressive to blast...,0
2,@VirginAmerica and it's a really big bad thing...,0
3,@VirginAmerica seriously would pay $30 a fligh...,0
4,"@VirginAmerica yes, nearly every time I fly VX...",1


In [20]:
data.shape

(11541, 2)

In [21]:
# Let's import transformer model
from transformers import BertTokenizer 
from transformers import BertForSequenceClassification, AdamW, BertConfig 

# Get the tokenzier and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',                  # model name
    num_labels = 3,                       # total number of labels
    output_attentions = False,            # Whether the model returns attention weights  
    output_hidden_states = False)         # Whether the model returns all hidden-state
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:
sentences = data.text.values 
labels = data.airline_sentiment.values 

# store the input_ids and attention_masks 
input_ids = []
attention_masks = []

for sent in sentences: 
    encoded_dict = tokenizer.encode_plus(
    sent,                               # taking each sentence and process
    add_special_tokens = True,          # adding [CLS] + sentences + [SEP] 
    max_length = 75,                    # maximum length of the sentences
    pad_to_max_length = True,           
    return_attention_mask = True,       # Getting attention mask [0,0,1,1]
    return_tensors = 'pt')              # It will return the output as pytorch format

    input_ids.append(encoded_dict['input_ids'])     # appending 
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
# Conver the input_ids, attention_masks, and labels to tensor! 
input_ids = torch.cat(input_ids, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [25]:
# Let's check the file type! 
print(type(input_ids))

print(type(attention_masks))

print(type(labels))

print(np.unique(labels))

print('\nOriginal: ', sentences[1])
print('\nToken IDs: ', input_ids[1])
print('\n Label: ', labels[1])

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
[0 1]

Original:  @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse

Token IDs:  tensor([  101,  1030,  6261, 14074, 14735,  2009,  1005,  1055,  2428,  9376,
         2000,  8479, 27885,  3630, 25171,  1000,  4024,  1000,  1999,  2115,
         6368,  1005,  5344,  1004, 23713,  1025,  2027,  2031,  2210, 28667,
        22957,  2063,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])

 Label:  tensor(0)


In [26]:
# Seperate Training and Validation split! 
from torch.utils.data import TensorDataset, random_split 

# combine the all inputs to tensor dataset 
dataset = TensorDataset(input_ids, attention_masks, labels)

# determine the range of split 
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size 

# divide the split 
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [27]:
# Let's create a DataLoader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 

## Dataset needs the batch size for training, recomended batch size is 16 or 32 
batch_size = 32 

## create a dataloader for our training and validation split 
train_dataloader = DataLoader(train_dataset,   
                             sampler = RandomSampler(train_dataset),  # we need to randomize the training data 
                             batch_size = batch_size )
validation_dataloader = DataLoader(val_dataset, 
                                  sampler = SequentialSampler(val_dataset),  # we need to infer the test data sequentially! 
                                  batch_size = batch_size)

In [28]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [29]:
optimizer = AdamW(model.parameters(), 
                 lr = 5e-5, 
                 eps = 1e-8)  # epsilion rate

from transformers import get_linear_schedule_with_warmup 
# Number of training epochs. The BERT authors recommend between 2 and 4. 
epochs = 2

# The Total Number of training steps is [number of batches] * [number of epochs]
total_steps = len(train_dataloader) * epochs

# create a learning rate scheduler! 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)



In [30]:
def flat_accuracy(preds, labels): 
    pred_flat = np.argmax(preds, axis =1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


#  create a helper function to get the time 
import time
import random
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [31]:
SEED_VAL = 66 
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []


# total training time! 
total_t0 = time.time()

for epoch_i in tqdm(range(0, epochs)): 
    t0 = time.time()  # start time 
    
    # reset the total loss for this epoch 
    total_train_loss = 0 
    
    model.train()  # make our model to train mode 
    
    for step, batch in enumerate(train_dataloader): 
        """
        batch[0] -> input_ids
        batch[1] -> attention_mask
        batch[2] -> labels 
        """
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Before starting training we need to make the gradinet as zero 
        model.zero_grad()
        
        output = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask, labels = b_labels)
        loss = output[0]
        logits = output[1]
        
        total_train_loss += loss.item()
        loss.backward()
        # clip the norm of the gradients to 1.0 
        # This is to help prevent the "exploading gradients" problem. 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    
    # validation 
    t0 = time.time()
    model.eval()  # make model to evaluvation 
    
    # tracking variables 
    total_eval_accuracy = 0 
    total_eval_loss = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader: 
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad(): 
            output = model(b_input_ids, attention_mask = b_input_mask, labels = b_labels, token_type_ids = None)
            loss = output[0]
            logits = output[1]
            
        total_eval_loss += loss.item()
        
        logits = logits.detach().cpu().numpy()   # move variable GPU to CPU 
        label_ids = b_labels.to('cpu').numpy()
        
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
        
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

100%|██████████| 2/2 [04:30<00:00, 135.24s/it]


Training complete!
Total training took 0:04:30 (h:mm:ss)





In [32]:
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.21,0.14,0.95,0:02:08,0:00:05
2,0.07,0.18,0.95,0:02:12,0:00:05
