In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [27]:
with open('SMSSpamCollection') as f:
    data = []
    for line in f.readlines():
        split = line.split('\t')
        data.append({'label': 1 if split[0] == 'spam' else 0,
                     'text': split[1]},)
data[0:3]

[{'label': 0,
  'text': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n'},
 {'label': 0, 'text': 'Ok lar... Joking wif u oni...\n'},
 {'label': 1,
  'text': "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n"}]

In [31]:
df = pd.DataFrame(data)
df

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will Ã¼ b going to esplanade fr home?\n
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [32]:
text = df.text.values
text

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n',
       'Ok lar... Joking wif u oni...\n',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n",
       ..., 'Pity, * was in mood for that. So...any other suggestions?\n',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free\n",
       'Rofl. Its true to its name\n'], dtype=object)

In [33]:
labels = df.label.values
labels

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [85]:
# We need to preprocess the text source before feeding it to BERT. To do so, we download the BertTokenizer:

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [78]:
rand_index = random.randint(0, len(text)-1) # to select one random SMS index
print(rand_index)
sms_tokens = tokenizer.tokenize(text[rand_index])
print(sms_tokens)
sms_ids = tokenizer.convert_tokens_to_ids(sms_tokens)
print(sms_ids)

3636
['it', "'", 's', 'not', 'that', 'you', 'make', 'me', 'cry', '.', 'it', "'", 's', 'just', 'that', 'when', 'all', 'our', 'stuff', 'happens', 'on', 'top', 'of', 'everything', 'else', ',', 'it', 'pushes', 'me', 'over', 'the', 'edge', '.', 'you', 'don', "'", 't', 'under', '##dt', '##and', 'how', 'often', 'i', 'cry', 'over', 'my', 'sorry', ',', 'sorry', 'life', '.']
[2009, 1005, 1055, 2025, 2008, 2017, 2191, 2033, 5390, 1012, 2009, 1005, 1055, 2074, 2008, 2043, 2035, 2256, 4933, 6433, 2006, 2327, 1997, 2673, 2842, 1010, 2009, 13956, 2033, 2058, 1996, 3341, 1012, 2017, 2123, 1005, 1056, 2104, 11927, 5685, 2129, 2411, 1045, 5390, 2058, 2026, 3374, 1010, 3374, 2166, 1012]


In [84]:
def print_rand_sentence():
    '''Displays the tokens and respective IDs of a random text sample'''
    index = random.randint(0, len(text)-1) 
    table = np.array([tokenizer.tokenize(text[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ hi       │        7632 │
├──────────┼─────────────┤
│ happy    │        3407 │
├──────────┼─────────────┤
│ birthday │        5798 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ hi       │        7632 │
├──────────┼─────────────┤
│ hi       │        7632 │
├──────────┼─────────────┤
│ hi       │        7632 │
├──────────┼─────────────┤
│ hi       │        7632 │
├──────────┼─────────────┤
│ hi       │        7632 │
├──────────┼─────────────┤
│ hi       │        7632 │
├──────────┼─────────────┤
│ hi       │        7632 │
╘══════════╧═════════════╛


In [106]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    '''
    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
    '''
    # the encode_plus method from the tokenizer object is commonly used in transformers for Natural Language Processing tasks.
    return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

preprocessing(text[1], tokenizer)


{'input_ids': tensor([[  101,  7929,  2474,  2099,  1012,  1012,  1012, 16644, 15536,  2546,
          1057,  2006,  2072,  1012,  1012,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}

In [107]:
for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])

# We can observe the token IDs for a text sample and recognize the presence of the special tokens 101 [CLS] and 102 [SEP], as well as the padding 0 [PAD] up to the desired max_length:
    
print(type(token_id[0]), type(attention_masks[0])) # lists of tensors generated
token_id[1]

<class 'torch.Tensor'> <class 'torch.Tensor'>


tensor([[  101,  7929,  2474,  2099,  1012,  1012,  1012, 16644, 15536,  2546,
          1057,  2006,  2072,  1012,  1012,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

In [109]:
 # This line concatenates the list of tensors in token_id along the first dimension (dimension 0). The result is a single tensor.
    
token_id = torch.cat(token_id, dim = 0)
print(type(token_id))
token_id


<class 'torch.Tensor'>


tensor([[  101,  2175,  2127,  ..., 28194,  1012,   102],
        [  101,  7929,  2474,  ...,     0,     0,     0],
        [  101,  2489,  4443,  ...,  2000,  4374,   102],
        ...,
        [  101, 12063,  1010,  ...,     0,     0,     0],
        [  101,  1996,  3124,  ...,  2489,   102,     0],
        [  101, 20996, 10258,  ...,     0,     0,     0]])

In [110]:
# same as above: converting the 'attention_mask' list of tensors to a  single tensor
attention_masks = torch.cat(attention_masks, dim = 0)


In [111]:
# This line converts the list of labels into a PyTorch tensor.
labels = torch.tensor(labels)
labels

  labels = torch.tensor(labels)


tensor([0, 0, 1,  ..., 0, 0, 0])

In [118]:
print(token_id[1])
print(tokenizer.decode(token_id[1]))
print(tokenizer.tokenize(tokenizer.decode(token_id[1])))
print(attention_masks[1])

tensor([  101,  7929,  2474,  2099,  1012,  1012,  1012, 16644, 15536,  2546,
         1057,  2006,  2072,  1012,  1012,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])
[CLS] ok lar... joking wif u oni... [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['[CLS]', 'ok', 'la', '##r', '.', '.', '.', 'joking', 'wi', '##f', 'u', 'on', '##i', '.', '.', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


In [122]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    index = random.randint(0, len(text) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ [CLS]    │         101 │                1 │
├──────────┼─────────────┼──────────────────┤
│ he       │        2002 │                1 │
├──────────┼─────────────┼──────────────────┤
│ says     │        2758 │                1 │
├──────────┼─────────────┼──────────────────┤
│ hi       │        7632 │                1 │
├──────────┼─────────────┼──────────────────┤
│ and      │        1998 │                1 │
├──────────┼─────────────┼──────────────────┤
│ to       │        2000 │                1 │
├──────────┼─────────────┼──────────────────┤
│ get      │        2131 │                1 │
├──────────┼─────────────┼──────────────────┤
│ your     │        2115 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ass      │        4632 │                1 │
├──────────┼─────────────┼──────────────────┤
│ back     │        2067 │        

In [126]:
# We split the dataset into train (80%) and validation (20%) sets, and wrap them around a torch.utils.data.DataLoader object. With its intuitive syntax, DataLoader provides an iterable over the given dataset.

val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

print(train_idx.shape, val_idx.shape)


(4459,) (1115,)


In [127]:
# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

train_set

<torch.utils.data.dataset.TensorDataset at 0x1dd170a7880>

In [128]:
# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x1dd5b0f95a0>

In [133]:
# Loop over the DataLoader
for batch in train_dataloader:
    b_input_ids, b_input_mask, b_labels = batch
    print("Input IDs:", b_input_ids)
    print("Attention Masks:", b_input_mask)
    print("Labels:", b_labels)
    
    # Break after the first batch for brevity
    break

Input IDs: tensor([[  101,  8840,  2140,  2085,  1045,  1005,  1049,  2044,  2008,  2980,
          2250, 13212,   999,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  2305,  2305,  1010,  2156,  2017,  4826,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101, 13433, 19903, 23778, 20689,  1010, 12849, 20051,  6979, 19636,
         14852,  1010,  4895, 25787,  3126,  2695,  1010,  9413, 10244,  4487,
          2015,  1010,  1004,  8318,  1025,  1001,  1004, 14181,  1025,  1012,
           102,     0],
        [  101,  2129,  2003,  2115,  6134,  2279,  2733,  1029,  1045,  2572,
          2041,  1997,  2237,  2023,  5353,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     

In [129]:
def b_tp(preds, labels):
    '''Returns True Positives (TP): count of correct predictions of actual class 1'''
    return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
    '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
    return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
    '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
    return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
    '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
    return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
    '''
    Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
    '''
    preds = np.argmax(preds, axis = 1).flatten()
    labels = labels.flatten()
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)
    b_accuracy = (tp + tn) / len(labels)
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity

In [130]:
# Load the BertForSequenceClassification model
# This code snippet is initializing the BertForSequenceClassification model with the bert-base-uncased pre-trained weights.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
'''
model.parameters(): This is passing all of the model’s parameters to the optimizer, so they can be updated during training.
lr = 5e-5: This is the learning rate, which controls how much to change the model in response to the estimated error each time the model weights are updated.
eps = 1e-08: This is a very small number to prevent any division by zero in the implementation (also known as epsilon).
'''
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [131]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
'''
Training and validation
'''

In [134]:

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

# trange: This is a function from the tqdm module, which is a fast, extensible progress bar for Python. It’s used here to display a progress bar for the epochs in the console.
for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    # model.train(): This sets the model to training mode. This is important because some layers like dropout and batch normalization behave differently during training and evaluation.
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad() # This clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls)
        # Forward pass
        # train_output = model(...): This performs a forward pass through the model, which computes predictions based on the input ids, attention masks, and labels.
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward() # This computes the gradient of the loss with respect to the model parameters.
        optimizer.step() #  This updates the model parameters based on the current gradient.
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # By requiring explicit calls to these functions, PyTorch gives you more control over the training process, which can be very helpful for implementing more complex training regimes.
        
    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader: # This starts a loop over the validation data. Each batch is a tuple containing input ids, attention masks, and labels for a batch of examples.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad(): # This context manager tells PyTorch not to construct the computation graph during this block, which reduces memory consumption and speeds up computation. The computation graph is used for automatic differentiation, which is essential for backpropagation during training. However, during evaluation, we’re not updating the model’s parameters, so we don’t need to compute gradients.
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy() # detach the logits from the computation graph, move them to the CPU, and convert them to numpy arrays. The reason we move logits and labels to the CPU is because many operations in PyTorch, especially those that involve converting to numpy or native Python types, are not supported on the GPU.
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Epoch:  50%|████████████████████████████████████████████████████████████▌                                                            | 1/2 [00:28<00:28, 28.06s/it]


	 - Train loss: 0.0953
	 - Validation Accuracy: 0.9732
	 - Validation Precision: 0.8709
	 - Validation Recall: 0.9627
	 - Validation Specificity: 0.9724



Epoch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:55<00:00, 27.84s/it]


	 - Train loss: 0.0322
	 - Validation Accuracy: 0.9893
	 - Validation Precision: 0.9874
	 - Validation Recall: 0.9279
	 - Validation Specificity: 0.9980






In [None]:
'''
Predicting
'''

In [148]:
# new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

new_sentence = 'call me at 123343453 to give you your prize of one million euro'


# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)
# These lines are concatenating the lists of input IDs and attention masks into PyTorch tensors. The dim = 0 argument means that the tensors are concatenated along the first dimension (i.e., they are stacked vertically).
# After this snippet, test_ids and test_attention_mask should be PyTorch tensors ready to be fed into your model.

# Forward pass, calculate logit predictions
with torch.no_grad():
    output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)
print('-------------')
print(output)
print(output.logits)
print(output.logits.cpu().numpy())
print(np.argmax(output.logits.cpu().numpy()))
print(np.argmax(output.logits.cpu().numpy()).flatten())
print(np.argmax(output.logits.cpu().numpy()).flatten().item())


Input Sentence:  call me at 123343453 to give you your prize of one million euro
Predicted Class:  Spam
-------------
SequenceClassifierOutput(loss=None, logits=tensor([[-1.8679,  2.1930]], device='cuda:0'), hidden_states=None, attentions=None)
tensor([[-1.8679,  2.1930]], device='cuda:0')
[[-1.8678647  2.1930404]]
1
[1]
1
