In [1]:
#pytorch imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#HuggingFace imports
from transformers import BertModel, BertTokenizer
from transformers.file_utils import PaddingStrategy
from transformers import AdamW, get_linear_schedule_with_warmup

#data science imports
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tqdm import tqdm

#cs224u imports
import sst, vsm, utils

#python standard libraries
from collections import defaultdict
import time
import os

%load_ext autoreload
%autoreload 2

device = "cpu" if not torch.cuda.is_available() else 'cuda:0'
#device = "cpu"
device

'cuda:0'

In [2]:
!nvidia-smi

Fri Feb 11 18:06:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000001:00:00.0 Off |                  Off |
| N/A   29C    P0    33W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Assemble Data(sets)

In [3]:
train_df_path = '/home/americanthinker/notebooks/pytorch/cs224u/data/sentiment/sst3-train.csv'
dev_df_path = '/home/americanthinker/notebooks/pytorch/cs224u/data/sentiment/sst3-dev.csv'
SST_HOME = os.path.join('data', 'sentiment')

In [4]:
sst_train = sst.train_reader(SST_HOME, dedup=True)
sst_train.info()
sst_dev = sst.dev_reader(SST_HOME, dedup=True)
sst_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8534 entries, 0 to 8533
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   example_id  8534 non-null   object
 1   sentence    8534 non-null   object
 2   label       8534 non-null   object
 3   is_subtree  8534 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 266.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   example_id  1100 non-null   object
 1   sentence    1100 non-null   object
 2   label       1100 non-null   object
 3   is_subtree  1100 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 34.5+ KB


In [5]:
#import non-subtree version of training data
train_df = pd.read_csv(train_df_path, usecols=['sentence', 'label', 'is_subtree'])
train_df = train_df[train_df['is_subtree'] == 0]
train_df.drop('is_subtree', axis=1, inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8544 entries, 0 to 318573
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  8544 non-null   object
 1   label     8544 non-null   object
dtypes: object(2)
memory usage: 200.2+ KB


In [6]:
dev_df = pd.read_csv(dev_df_path, usecols=['sentence', 'label', 'is_subtree'])
dev_df = dev_df[dev_df['is_subtree'] == 0]
dev_df.drop('is_subtree', axis=1, inplace=True)
dev_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1101 entries, 0 to 1100
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1101 non-null   object
 1   label     1101 non-null   object
dtypes: object(2)
memory usage: 25.8+ KB


In [7]:
#convert text/labels to model consumable input
def create_dataset(df: pd.DataFrame) -> np.array:
    label_types = sorted(df.label.unique().tolist())
    assert ['negative', 'neutral', 'positive'] == label_types
    label_map = {label:index for index, label in enumerate(label_types)}
    
    text = df.sentence.values.tolist()
    labels = df.label.apply(lambda x: label_map[x]).values.tolist()
    assert len(text) == len(labels)
    
    return text, labels

train_X, train_y = create_dataset(train_df)
dev_X, dev_y = create_dataset(dev_df)

In [8]:
#instantiate bert tokenizer
weights_name = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(weights_name)

In [9]:
#design dataset class for use with DataLoader

class getSentences(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __repr__(self):
        return f'Sentences: {len(self.sentences)}     Labels: {len(self.labels)}'
    
    def __len__(self):
        return (len(self.sentences))
    
    def __getitem__(self,index):
        sentence = self.sentences[index]
        encoding = self.tokenizer.encode_plus(
                          sentence,
                          add_special_tokens=True,
                          max_length=self.max_len,
                          truncation=True,
                          return_token_type_ids=False,
                          padding=PaddingStrategy.MAX_LENGTH,
                          return_attention_mask=True,
                          return_tensors='pt')
        
        return {'text' : sentence,
                'input_id': encoding['input_ids'].flatten(),
                'attention_mask':encoding['attention_mask'].flatten(),
                'labels': torch.tensor(self.labels[index], dtype = torch.long)
               }

In [10]:
#instantiate Dataset and build DataLoader

BATCH_SIZE = 16
MAX_LEN = 300
NUM_TRAIN_SAMPLES = len(train_X)
NUM_VAL_SAMPLES = len(dev_X)

training_data = getSentences(
                    sentences = train_X,
                    labels = train_y,
                    tokenizer = bert_tokenizer,
                    max_len = MAX_LEN)

val_data = getSentences(
                sentences = dev_X,
                labels = dev_y,
                tokenizer = bert_tokenizer,
                max_len = MAX_LEN)

train_loader = DataLoader(training_data, BATCH_SIZE, shuffle = False)
val_loader = DataLoader(val_data, BATCH_SIZE, shuffle = False)

#### Smaller datasets for debugging

In [47]:
train_subset = train_X[:BATCH_SIZE * 4]
dev_subset = dev_X[:BATCH_SIZE * 4]
train_y_subset = train_y[:BATCH_SIZE * 4]
dev_y_subset = dev_y[:BATCH_SIZE * 4]

NUM_TRAIN_SUBSET = len(train_subset)
NUM_VAL_SUBSET = len(dev_subset)

train_subset_data = getSentences(
                    sentences = train_subset,
                    labels = train_y_subset,
                    tokenizer = bert_tokenizer,
                    max_len = MAX_LEN)

val_subset_data = getSentences(
                sentences = dev_subset,
                labels = dev_y_subset,
                tokenizer = bert_tokenizer,
                max_len = MAX_LEN)

train_subset_loader = DataLoader(train_subset_data, BATCH_SIZE, shuffle = False)
val_subset_loader = DataLoader(val_subset_data, BATCH_SIZE, shuffle = False)

#### Test sample batch

In [11]:
#down and dirty sample batch check to ensure everything loaded correctly
sample_batch = next(iter(train_loader))
assert (sample_batch['input_id'].shape[0], sample_batch['input_id'].shape[1]) == (BATCH_SIZE, MAX_LEN)

#view of the data dimensions
sample_batch.keys()
sample_batch['labels']
sample_batch['input_id'].shape, sample_batch['attention_mask'].shape, sample_batch['labels'].shape


dict_keys(['text', 'input_id', 'attention_mask', 'labels'])

tensor([2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2])

(torch.Size([16, 300]), torch.Size([16, 300]), torch.Size([16]))

## Instantiate Model

In [12]:
# input to the model --> encoded_ids and attention mask

class BertClassifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super(BertClassifier,self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p = 0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size,num_classes)
        self.softmax = nn.Softmax(dim = 1)
        
    def forward(self,input_ids, attention_mask):
        temp = self.bert(input_ids, attention_mask)  # Here we have added one linear layer on top of 
        pooled_output = temp[1]                             # BERT-base with number of output = 3 
        out = self.dropout(pooled_output)          
        out = self.linear(out)
        return out

In [13]:
num_classes = 3
model = BertClassifier(weights_name, 3)
model = model.to(device)
model.bert.device

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device(type='cuda', index=0)

#### Test Forward Pass

In [15]:
#test forward pass to ensure correct output dims
input_ids = sample_batch['input_id']
mask = sample_batch['attention_mask']

sample_linear_output = model(input_ids,mask)
soft = nn.Softmax(dim=1)
final_out = soft(sample_linear_output)

assert (final_out.shape[0], final_out.shape[1]) == (BATCH_SIZE, num_classes)

final_out
_, pred_labels = torch.max(final_out, dim=1)
pred_labels

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking arugment for argument index in method wrapper_index_select)

## Initiate Model Training and Evaluation

In [14]:
learning_rate = 1e-5
EPOCHS = 10

steps = len(train_loader) * EPOCHS
loss_fn = torch.nn.CrossEntropyLoss().to(device)

optim = AdamW(params = model.parameters(),lr = learning_rate, correct_bias= False)
scheduler = get_linear_schedule_with_warmup(optimizer = optim,
                                                        num_warmup_steps = 0,
                                                        num_training_steps = steps)

In [63]:
def train_model(model, data_loader=train_loader, loss_function=loss_fn, optimizer=optim, scheduler=scheduler, n_examples=NUM_TRAIN_SAMPLES):
    '''
    Model training function that represents one pass through the data.
    Returns accuracy and mean total loss.  This function is meant to be 
    paired with an "eval model" function.
    '''
    
    model.train()
    batches = len(data_loader)
    batch_count = 0
    train_loss = []
    correct_predictions = 0
   
    
    for d in tqdm(data_loader):
        
        #grab data in batches and move to GPU
        input_ids = d['input_id'].to(device)
        masks = d['attention_mask'].to(device)
        labels = d['labels'].to(device)
        batch_count += 1
#         if batch_count % 100 == 0:
#             print(f'Batch "{batch_count}" of {batches} total batches')
        
        #forward propagation
        predictions = model(input_ids, masks)
        loss = loss_function(predictions, labels)
        _, pred_classes = torch.max(predictions, dim=1)
        
        #back propagation
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        #collect loss and acc measures
        train_loss.append(loss.item())
        correct_predictions += torch.sum(pred_classes==labels)
    
    return (correct_predictions/n_examples).cpu().numpy(), np.mean(train_loss)

In [64]:
def eval_model(model, data_loader=val_loader, loss_function=loss_fn, n_examples=NUM_VAL_SAMPLES):
    '''
    Model evaluation function that represents one pass through the data.
    Returns accuracy and mean total loss.  This function is meant to be 
    paired with the "train_model" function.
    '''
    
    model.eval()
    eval_loss = []
    correct_predictions = 0
    all_predictions = []
    all_labels = [] 
    
    with torch.no_grad():
        for d in tqdm(data_loader):
            
            input_ids = d['input_id'].to(device)
            masks = d['attention_mask'].to(device)
            labels = d['labels'].to(device)
            
            #forward prop for inference
            predictions = model(input_ids, masks)
            loss = loss_function(predictions, labels)
            _,pred_classes = torch.max(predictions, dim=1)
            
            #collect preds/labels for class_report
            all_predictions.extend(pred_classes.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            
            #collect loss and acc measures
            eval_loss.append(loss.item())
            correct_predictions += torch.sum(pred_classes==labels)
    
    report = classification_report(all_labels, 
                                   all_predictions, 
                                   labels=[0,1,2], 
                                   target_names=['negative', 'neutral', 'positive'])
    
    return (correct_predictions / n_examples).cpu().numpy(), np.mean(eval_loss), report
    

In [65]:
def run(model, epochs: int=20):
    
    tracking = defaultdict(list)
    best_accuracy = 0
    best_report = None
    
    EPOCHS = epochs
    
    start = time.perf_counter()
    
    for epoch in range(EPOCHS):
        print(f'epoch : {epoch+1}/{EPOCHS}')
        
        train_acc, train_loss = train_model(model, data_loader=train_loader, n_examples=NUM_TRAIN_SAMPLES)

        val_acc , val_loss, report = eval_model(model, data_loader=val_loader, n_examples=NUM_VAL_SAMPLES)

        tracking['train_acc'].append(train_acc)
        tracking['train_loss'].append(train_loss)
        tracking['val_acc'].append(val_acc)
        tracking['val_loss'].append(val_loss)
        
        scores = np.round([train_loss, train_acc, val_loss, val_acc],3)
        print(f'train_loss: {scores[0]}, train_acc: {scores[1]}\nval_loss: {scores[2]}, val_acc: {scores[3]}')
            
        if val_acc > best_accuracy:
            #best_model_name = f'drive/MyDrive/Bert Sentiment/models/best_model_state_{val_acc}.bin'
            #torch.save(model.state_dict(), best_model_name)
            best_accuracy = val_acc
            best_report = report
    
    end = time.perf_counter() - start
    print(f'Total time for {EPOCHS} epochs: {np.round(end/60, 1)} minutes')
    print(best_report)

In [66]:
run(model=model, epochs=EPOCHS)

epoch : 1/10


100%|██████████| 534/534 [04:34<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.66it/s]


train_loss: 0.456, train_acc: 0.822
val_loss: 0.759, val_acc: 0.706
epoch : 2/10


100%|██████████| 534/534 [04:34<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.66it/s]


train_loss: 0.333, train_acc: 0.881
val_loss: 0.925, val_acc: 0.717
epoch : 3/10


100%|██████████| 534/534 [04:34<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.66it/s]


train_loss: 0.237, train_acc: 0.922
val_loss: 1.036, val_acc: 0.712
epoch : 4/10


100%|██████████| 534/534 [04:34<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.66it/s]


train_loss: 0.187, train_acc: 0.944
val_loss: 1.242, val_acc: 0.707
epoch : 5/10


100%|██████████| 534/534 [04:34<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.66it/s]


train_loss: 0.165, train_acc: 0.952
val_loss: 1.393, val_acc: 0.694
epoch : 6/10


100%|██████████| 534/534 [04:33<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.65it/s]


train_loss: 0.145, train_acc: 0.961
val_loss: 1.576, val_acc: 0.687
epoch : 7/10


100%|██████████| 534/534 [04:33<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.65it/s]


train_loss: 0.147, train_acc: 0.963
val_loss: 1.633, val_acc: 0.681
epoch : 8/10


100%|██████████| 534/534 [04:33<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.65it/s]


train_loss: 0.184, train_acc: 0.95
val_loss: 1.683, val_acc: 0.678
epoch : 9/10


100%|██████████| 534/534 [04:34<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.66it/s]


train_loss: 0.614, train_acc: 0.846
val_loss: 1.811, val_acc: 0.679
epoch : 10/10


100%|██████████| 534/534 [04:33<00:00,  1.95it/s]
100%|██████████| 69/69 [00:12<00:00,  5.65it/s]

train_loss: 0.522, train_acc: 0.872
val_loss: 1.811, val_acc: 0.679
Total time for 10 epochs: 47.7 minutes
              precision    recall  f1-score   support

    negative       0.80      0.78      0.79       428
     neutral       0.43      0.33      0.37       229
    positive       0.75      0.86      0.80       444

    accuracy                           0.72      1101
   macro avg       0.66      0.65      0.65      1101
weighted avg       0.70      0.72      0.71      1101






In [70]:
!nvidia-smi

Fri Feb 11 21:54:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000001:00:00.0 Off |                  Off |
| N/A   30C    P0    33W / 250W |   2807MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [69]:
torch.cuda.empty_cache()