# Load data

In [7]:
import torch
import pandas as pd

# load the data
X_train = pd.read_csv('../data/train.csv')
X_test = pd.read_csv('../data/test.csv')

# load the labels
y_train = pd.read_csv('../data/train_labels.csv')
y_test = pd.read_csv('../data/test_labels.csv')

In [20]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class TextDataset(Dataset):
    def __init__(self, texts, targets, max_length=180, tokenizer_name='bert-base-uncased'):
        self.texts = texts
        self.targets = targets
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        
        return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'targets': torch.as_tensor(target, dtype=torch.long),
            'text': text
        }

In [26]:
train_dataset = TextDataset(texts=X_train['headline'], targets=y_train['is_sarcastic'])
test_dataset = TextDataset(texts=X_test['headline'], targets=y_test['is_sarcastic'])

In [37]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32
torch.manual_seed(1702)
train_loader = DataLoader(train_dataset, 
                          batch_size=BATCH_SIZE,
                          shuffle=True)
test_loader = DataLoader(test_dataset,
                         batch_size=len(test_dataset))
next(iter(train_loader))

{'input_ids': tensor([[  101,  3763,  2050,  ...,     0,     0,     0],
         [  101,  2129,  8398,  ...,     0,     0,     0],
         [  101,  5747,  8069,  ...,     0,     0,     0],
         ...,
         [  101,  4485,  5581,  ...,     0,     0,     0],
         [  101, 21442,  2615,  ...,     0,     0,     0],
         [  101,  2167,  4420,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'targets': tensor([0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
         1, 0, 1, 1, 0, 1, 1, 0]),
 'text': ['latina business owner faces death threats for appearing onstage at donald trump rally',
  'how trump university relied heavily on the craft of con men',
  "bush hopes recession doesn't affect sales of his memoirs",
  'uber pulls a u-tu

# Build model

In [None]:
from torch import cuda
torch.cuda.empty_cache()
device = 'cuda' if cuda.is_available() else 'cpu'
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")


In [39]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
print('device:', device)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device: mps


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [40]:
EPOCHS = 3

optimizer = AdamW(model.parameters(),
    lr = 2e-5, 
    eps = 1e-8
)



# training model

In [41]:
import time
import datetime
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [42]:
training_stats = []
epoch_loss_train = []
total_t0 = time.time()

# TRAINING
for epoch in range(1, EPOCHS + 1):
    model.train()
    t0 = time.time()
    print("")
    print("================ Epoch {:} / {:} ================".format(epoch, EPOCHS))
    train_all_predictions = []
    train_all_true_labels = []
    for step, data in enumerate(train_loader):
        if step % 40 == 0 and not step == 0:
            elapsed = int(round(time.time() - t0))
            elapsed = str(datetime.timedelta(seconds=elapsed))
            print(
                "  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.".format(
                    step, len(train_loader), elapsed
                )
            )

        targets = data["targets"].to(device)
        mask = data["attention_mask"].to(device)
        ids = data["input_ids"].to(device)

        model.zero_grad()

        loss, logits = model(
            ids, token_type_ids=None, attention_mask=mask, labels=targets
        ).to_tuple()
        epoch_loss_train.append(loss.item())

        cpu_logits = logits.cpu().detach().numpy()
        train_all_predictions.extend(np.argmax(cpu_logits, axis=1).flatten())
        train_all_true_labels.extend(targets.cpu().numpy())

        loss.backward()
        optimizer.step()
    train_accuracy = accuracy_score(train_all_true_labels, train_all_predictions)
    train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
        train_all_true_labels, train_all_predictions, average="binary"
    )
    print("")
    print('---TRAIN METRICS---')
    print(f"Loss: {np.mean(epoch_loss_train):.4f}")
    print(f"Accuracy: {train_accuracy:.4f}")
    print(f"Precision: {train_precision:.4f}")
    print(f"Recall: {train_recall:.4f}")
    print(f"F1-Score: {train_f1:.4f}")
    print("")
    
    # VALIDATION
    print("Running validation ...")
    print("")
    model.eval()
    epoch_loss_test = []
    test_all_predictions = []
    test_all_true_labels = []
    for data in test_loader:
        targets = data["targets"].to(device)
        mask = data["attention_mask"].to(device)
        ids = data["input_ids"].to(device)
        
        with torch.no_grad():
            loss, logits = model(ids, token_type_ids=None, attention_mask=mask, labels=targets).to_tuple()
            
        epoch_loss_test.append(loss.item())
        cpu_logits = logits.cpu().detach().numpy()
        test_all_predictions.extend(np.argmax(cpu_logits, axis=1).flatten())
        test_all_true_labels.extend(targets.cpu().numpy())
    test_accuracy = accuracy_score(test_all_true_labels, test_all_predictions)
    test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
        test_all_true_labels, test_all_predictions, average="binary"
    )
    print("")
    print('---TEST METRICS---')
    print(f"Loss: {np.mean(epoch_loss_test):.4f}")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"Precision: {test_precision:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1-Score: {test_f1:.4f}")
    
    training_stats.append(
            {
            'epoch': epoch,
            'Training Loss': np.mean(epoch_loss_train),
            'Training Accuracy': train_accuracy,
            'Training Precision': train_precision,
            'Training Recall': train_recall,
            'Training F1': train_f1,
            'Validation Loss': np.mean(epoch_loss_test),
            'Validation Accuracy': test_accuracy,
            'Validation Precision': test_precision,
            'Validation Recall': test_recall,
            'Validation F1': test_f1
        }
    )


  Batch    40  of    624.    Elapsed: 0:01:05.
  Batch    80  of    624.    Elapsed: 0:01:49.
  Batch   120  of    624.    Elapsed: 0:02:33.
  Batch   160  of    624.    Elapsed: 0:03:17.
  Batch   200  of    624.    Elapsed: 0:04:01.
  Batch   240  of    624.    Elapsed: 0:04:44.
  Batch   280  of    624.    Elapsed: 0:05:28.
  Batch   320  of    624.    Elapsed: 0:06:13.
  Batch   360  of    624.    Elapsed: 0:06:57.
  Batch   400  of    624.    Elapsed: 0:07:41.
  Batch   440  of    624.    Elapsed: 0:08:25.
  Batch   480  of    624.    Elapsed: 0:09:09.
  Batch   520  of    624.    Elapsed: 0:09:53.
  Batch   560  of    624.    Elapsed: 0:10:37.
  Batch   600  of    624.    Elapsed: 0:11:21.

---TRAIN METRICS---
Loss: 0.2819
Accuracy: 0.8805
Precision: 0.8693
Recall: 0.8833
F1-Score: 0.8762

Running validation ...



RuntimeError: MPS backend out of memory (MPS allocated: 44.85 GB, other allocations: 425.69 MB, max allowed: 45.90 GB). Tried to allocate 4.40 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
df_statistics = pd.DataFrame(data=training_stats)
df_statistics = df_statistics.set_index('epoch')
df_statistics

In [None]:
import matplotlib.pyplot as plt

plt.plot(df_statistics['Training Loss'], 'b-o', label='Training')
plt.plot(df_statistics['Validation Loss'], 'g-o', label='Validation')
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3])

plt.show()