In [1]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup, DataCollatorWithPadding
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

2024-04-13 10:54:30.839496: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-13 10:54:30.839596: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-13 10:54:30.965093: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# !pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [3]:
df = pd.read_csv("/kaggle/input/big-reddit/mbti_full_pull.csv", encoding='latin1')

df.describe()

Unnamed: 0,author_flair_text,body,subreddit
count,1794016,1793949,1794016
unique,8702,1746609,520
top,INTP,Yes.,INTP
freq,365646,677,419700


In [4]:
# drop rows with author_flair_text that does not contain any of the 16 personalities
df = df[df['author_flair_text'].str.contains('ISTJ|ISFJ|INFJ|INTJ|ISTP|ISFP|INFP|INTP|ESTP|ESFP|ENFP|ENTP|ESTJ|ESFJ|ENFJ|ENTJ', na=False)]

# remove extra characters other than 16 personality types from author_flair_text
df['author_flair_text'] = df['author_flair_text'].str.extract(r'(ISTJ|ISFJ|INFJ|INTJ|ISTP|ISFP|INFP|INTP|ESTP|ESFP|ENFP|ENTP|ESTJ|ESFJ|ENFJ|ENTJ)')


# drop subreddit column 
df = df.drop('subreddit', axis=1)
# rename author_flair_text to label
df = df.rename(columns={'author_flair_text': 'label'})

# make histogram of frequency of each personality type
# df['label'].value_counts().plot(kind='bar')

In [5]:
# df.body.str.split().str.len().hist()
# print('average sentence length: ', df.body.str.split().str.len().mean())
# print('stdev sentence length: ', df.body.str.split().str.len().std())

In [6]:
df.describe()

Unnamed: 0,label,body
count,1662463,1662399
unique,16,1619505
top,INTP,Yes.
freq,455852,641


In [7]:
df['IE'] = df['label'].apply(lambda x: x[0])
df['NS'] = df['label'].apply(lambda x: x[1])
df['TF'] = df['label'].apply(lambda x: x[2])
df['JP'] = df['label'].apply(lambda x: x[3])
# apply label encoding to the 4 columns
label_encoders = {}
for col in ['IE', 'NS', 'TF', 'JP']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print(f"Count of {label_encoders['IE'].inverse_transform([1])} in IE: ", df['IE'].sum())
print(f"Count of {label_encoders['IE'].inverse_transform([0])} in IE: ", df['IE'].eq(0).sum())
print("")
print(f"Count of {label_encoders['NS'].inverse_transform([1])} in NS: ", df['NS'].sum())
print(f"Count of {label_encoders['NS'].inverse_transform([0])} in NS: ", df['NS'].eq(0).sum())
print("")
print(f"Count of {label_encoders['TF'].inverse_transform([1])} in TF: ", df['TF'].sum())
print(f"Count of {label_encoders['TF'].inverse_transform([0])} in TF: ", df['TF'].eq(0).sum())
print("")
print(f"Count of {label_encoders['JP'].inverse_transform([1])} in JP: ", df['JP'].sum())
print(f"Count of {label_encoders['JP'].inverse_transform([0])} in JP: ", df['JP'].eq(0).sum())
df.columns

Count of ['I'] in IE:  1275956
Count of ['E'] in IE:  386507

Count of ['S'] in NS:  113965
Count of ['N'] in NS:  1548498

Count of ['T'] in TF:  1140515
Count of ['F'] in TF:  521948

Count of ['P'] in JP:  1010869
Count of ['J'] in JP:  651594


Index(['label', 'body', 'IE', 'NS', 'TF', 'JP'], dtype='object')

In [8]:
# drop 500 000 rows having 0 in ns column randomly
df = df.drop(df[(df['IE'] == 1) & (df['NS'] == 0)].sample(n=1100000, random_state=1).index)

print("")
print(f"Count of {label_encoders['IE'].inverse_transform([1])} in IE: ", df['IE'].sum())
print(f"Count of {label_encoders['IE'].inverse_transform([0])} in IE: ", df['IE'].eq(0).sum())
print("")
print(f"Count of {label_encoders['NS'].inverse_transform([1])} in NS: ", df['NS'].sum())
print(f"Count of {label_encoders['NS'].inverse_transform([0])} in NS: ", df['NS'].eq(0).sum())
print("")
print(f"Count of {label_encoders['TF'].inverse_transform([1])} in TF: ", df['TF'].sum())
print(f"Count of {label_encoders['TF'].inverse_transform([0])} in TF: ", df['TF'].eq(0).sum())
print("")
print(f"Count of {label_encoders['JP'].inverse_transform([1])} in JP: ", df['JP'].sum())
print(f"Count of {label_encoders['JP'].inverse_transform([0])} in JP: ", df['JP'].eq(0).sum())


Count of ['I'] in IE:  175956
Count of ['E'] in IE:  386507

Count of ['S'] in NS:  113965
Count of ['N'] in NS:  448498

Count of ['T'] in TF:  386688
Count of ['F'] in TF:  175775

Count of ['P'] in JP:  424982
Count of ['J'] in JP:  137481


In [9]:
types = df['label'].unique()
types = [t.lower() for t in types]
df['body'] = df['body'].astype(str)

# count number of occurences of each type in each row in body column
df['type_count'] = df['body'].apply(lambda x: [x.count(t) for t in types])

# number of rows with 0 count for all types
print('Number of rows with 0 count for all types: ', df['type_count'].apply(lambda x: sum(x) == 0).sum())

# number of rows with 1 count for all types
print('Number of rows with 1 count for all types: ', df['type_count'].apply(lambda x: sum(x) > 0).sum())
df['type_count']

Number of rows with 0 count for all types:  549301
Number of rows with 1 count for all types:  13162


11         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
12         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
13         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
15         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
22         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                                 ...                       
1793995    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1794005    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1794007    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1794008    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1794013    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: type_count, Length: 562463, dtype: object

In [10]:
df.describe()

Unnamed: 0,IE,NS,TF,JP
count,562463.0,562463.0,562463.0,562463.0
mean,0.312831,0.202618,0.687491,0.755573
std,0.463647,0.40195,0.463517,0.429747
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,1.0
75%,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0


In [11]:
# drop rows with more than 0 count for all types
df = df[df['type_count'].apply(lambda x: sum(x) == 0)]
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,label,body,IE,NS,TF,JP,type_count
0,ENTP,They're much easier to understand if you consi...,0,0,1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,ENFP,is this at Terminal 5 by any chance!?,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,ENTJ,"Well consider this, Freud is considered the fa...",0,0,1,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,ENFP,"Affirmation, quality/touch",0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,ENFP,Just the first ones.,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [12]:
df.describe()

Unnamed: 0,IE,NS,TF,JP
count,549301.0,549301.0,549301.0,549301.0
mean,0.31381,0.201904,0.686502,0.754097
std,0.46404,0.401421,0.463915,0.430622
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,1.0
75%,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0


In [13]:
label_cols = ['IE', 'NS', 'TF', 'JP']

df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,label,body,IE,NS,TF,JP,type_count,one_hot_labels
0,ENTP,They're much easier to understand if you consi...,0,0,1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1, 1]"
1,ENFP,is this at Terminal 5 by any chance!?,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1]"
2,ENTJ,"Well consider this, Freud is considered the fa...",0,0,1,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1, 0]"
3,ENFP,"Affirmation, quality/touch",0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1]"
4,ENFP,Just the first ones.,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1]"


In [14]:
labels = list(df.one_hot_labels.values)
text = list(df.body.values)

In [15]:
train_text, test_text, train_labels, test_labels= train_test_split(text, labels, random_state=42, test_size=0.30)
test_text, val_text, test_labels, val_labels = train_test_split(test_text, test_labels, random_state=42, test_size=0.50)
print(len(train_text), len(val_text), len(test_text))

384510 82396 82395


In [16]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        if len(text) >= 512:
            encoding = self.tokenizer(text, return_tensors='pt', truncation=True, max_length = 512, padding='max_length')
        else:
            encoding = self.tokenizer(text, return_tensors='pt', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'labels': torch.tensor(label)}

In [17]:
batch_size = 4
checkpoint = "FacebookAI/roberta-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataset = TextClassificationDataset(train_text, train_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

validation_dataset = TextClassificationDataset(val_text, val_labels, tokenizer)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

test_dataset = TextClassificationDataset(test_text, test_labels, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([4, 512]), 'attention_mask': torch.Size([4, 512]), 'labels': torch.Size([4, 4])}


In [19]:
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).logits

tensor([[-0.1537,  0.0301,  0.0487,  0.1535],
        [-0.1618,  0.0274,  0.0388,  0.1573],
        [-0.1628,  0.0327,  0.0445,  0.1459],
        [-0.1527,  0.0371,  0.0280,  0.1570]], grad_fn=<AddmmBackward0>)

In [21]:
torch.cuda.is_available()

True

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [23]:
epochs = 2
num_labels = 4

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
optimizer = AdamW(model.parameters(), lr = 1e-5)    
total_steps = len(train_dataloader) * epochs
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0.08 * total_steps, num_training_steps = total_steps)



In [24]:
from tqdm.auto import tqdm
from tqdm import trange
import evaluate


train_loss_set = []
val_loss_set = []

progress_bar = tqdm(range(total_steps))

accuracy_metric = evaluate.load("accuracy")
best_val_loss = float('inf')
early_stopping_patience = 2


for epoch_i in trange(epochs, desc="Epoch"):

  # Training
  
  model.train()

  tr_loss = 0 
  nb_tr_examples, nb_tr_steps = 0, 0
  step = 0
  
  for batch in train_dataloader:

    if step % 40 == 0 and not step == 0:            
      print('  Batch {:>5,}  of  {:>5,}. in epoch {:>5,}'.format(step, len(train_dataloader), epoch_i))
    step+=1

    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    optimizer.zero_grad()

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs[0]
    loss = loss_fn(logits.view(-1,num_labels), b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    progress_bar.update(1)
    
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  model.eval()

  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]
  val_loss = 0
  nb_val_steps, nb_val_examples = 0, 0
  eval_steps = 0

  # Predict
  for batch in validation_dataloader:

    if eval_steps % 40 == 0 and not eval_steps == 0:
      print(' Eval Batch {:>5,}  of  {:>5,}.'.format(eval_steps, len(validation_dataloader)))
    eval_steps +=1

    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      loss = loss_fn(outs[0].view(-1,num_labels), b_labels.type_as(outs[0]).view(-1,num_labels)) #convert labels to float for calculation
      val_loss_set.append(loss.item())

      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)
      predictions = (pred_label >= 0.5).int().reshape(-1)
    
      accuracy_metric.add_batch(predictions=predictions, references=b_labels.int().reshape(-1))

      # b_logit_pred = b_logit_pred.detach().cpu().numpy()
      # pred_label = pred_label.to('cpu').numpy()
      # b_labels = b_labels.to('cpu').numpy() 

    val_loss += loss.item()
    nb_val_examples += b_input_ids.size(0)
    nb_val_steps += 1

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)
  
  print("Validation loss: {}".format(val_loss/nb_val_steps))

  # # Flatten outputs
  # pred_labels = [item for sublist in pred_labels for item in sublist]
  # true_labels = [item for sublist in true_labels for item in sublist]

  # # Calculate Accuracy
  # threshold = 0.50
  # pred_bools = [pl>threshold for pl in pred_labels]
  # true_bools = [tl==1 for tl in true_labels]
  # val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  # val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100
  avg_val_loss = val_loss/nb_val_steps
  if avg_val_loss < best_val_loss:
    best_val_loss = avg_val_loss
    epochs_without_improvement = 0
    # Save the best model
    torch.save(model.state_dict(), "/kaggle/working/big-reddit-roberta-model.pth")
  else:
    epochs_without_improvement += 1
    if epochs_without_improvement >= early_stopping_patience:
      print(f'Early stopping triggered after {epoch_i} epochs without improvement.')
      break

  val_flat_accuracy = accuracy_metric.compute()

  # print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Validation Accuracy: ', val_flat_accuracy)

  0%|          | 0/192256 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Epoch:   0%|          | 0/2 [00:00<?, ?it/s][A

  Batch    40  of  96,128. in epoch     0
  Batch    80  of  96,128. in epoch     0
  Batch   120  of  96,128. in epoch     0
  Batch   160  of  96,128. in epoch     0
  Batch   200  of  96,128. in epoch     0
  Batch   240  of  96,128. in epoch     0
  Batch   280  of  96,128. in epoch     0
  Batch   320  of  96,128. in epoch     0
  Batch   360  of  96,128. in epoch     0
  Batch   400  of  96,128. in epoch     0
  Batch   440  of  96,128. in epoch     0
  Batch   480  of  96,128. in epoch     0
  Batch   520  of  96,128. in epoch     0
  Batch   560  of  96,128. in epoch     0
  Batch   600  of  96,128. in epoch     0
  Batch   640  of  96,128. in epoch     0
  Batch   680  of  96,128. in epoch     0
  Batch   720  of  96,128. in epoch     0
  Batch   760  of  96,128. in epoch     0
  Batch   800  of  96,128. in epoch     0
  Batch   840  of  96,128. in epoch     0
  Batch   880  of  96,128. in epoch     0
  Batch   920  of  96,128. in epoch     0
  Batch   960  of  96,128. in epoc


Epoch:  50%|█████     | 1/2 [4:10:31<4:10:31, 15031.71s/it][A

Validation Accuracy:  {'accuracy': 0.770134472547211}
  Batch    40  of  96,128. in epoch     1
  Batch    80  of  96,128. in epoch     1
  Batch   120  of  96,128. in epoch     1
  Batch   160  of  96,128. in epoch     1
  Batch   200  of  96,128. in epoch     1
  Batch   240  of  96,128. in epoch     1
  Batch   280  of  96,128. in epoch     1
  Batch   320  of  96,128. in epoch     1
  Batch   360  of  96,128. in epoch     1
  Batch   400  of  96,128. in epoch     1
  Batch   440  of  96,128. in epoch     1
  Batch   480  of  96,128. in epoch     1
  Batch   520  of  96,128. in epoch     1
  Batch   560  of  96,128. in epoch     1
  Batch   600  of  96,128. in epoch     1
  Batch   640  of  96,128. in epoch     1
  Batch   680  of  96,128. in epoch     1
  Batch   720  of  96,128. in epoch     1
  Batch   760  of  96,128. in epoch     1
  Batch   800  of  96,128. in epoch     1
  Batch   840  of  96,128. in epoch     1
  Batch   880  of  96,128. in epoch     1
  Batch   920  of  96,


Epoch: 100%|██████████| 2/2 [8:22:28<00:00, 15074.09s/it]

Validation Accuracy:  {'accuracy': 0.7731382591387932}





In [25]:
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy_metric = evaluate.load("accuracy")
model.eval()
step = 0
for batch in test_dataloader:
    
    if step % 40 == 0 and not step == 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(test_dataloader)))
    step+=1
    
    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    with torch.no_grad():
        outs = model(b_input_ids, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)
    predictions = (pred_label >= 0.5).int().reshape(-1)
    
    accuracy_metric.add_batch(predictions=predictions, references=b_labels.int().reshape(-1))

accuracy_metric.compute()

  Batch    40  of  20,599.
  Batch    80  of  20,599.
  Batch   120  of  20,599.
  Batch   160  of  20,599.
  Batch   200  of  20,599.
  Batch   240  of  20,599.
  Batch   280  of  20,599.
  Batch   320  of  20,599.
  Batch   360  of  20,599.
  Batch   400  of  20,599.
  Batch   440  of  20,599.
  Batch   480  of  20,599.
  Batch   520  of  20,599.
  Batch   560  of  20,599.
  Batch   600  of  20,599.
  Batch   640  of  20,599.
  Batch   680  of  20,599.
  Batch   720  of  20,599.
  Batch   760  of  20,599.
  Batch   800  of  20,599.
  Batch   840  of  20,599.
  Batch   880  of  20,599.
  Batch   920  of  20,599.
  Batch   960  of  20,599.
  Batch 1,000  of  20,599.
  Batch 1,040  of  20,599.
  Batch 1,080  of  20,599.
  Batch 1,120  of  20,599.
  Batch 1,160  of  20,599.
  Batch 1,200  of  20,599.
  Batch 1,240  of  20,599.
  Batch 1,280  of  20,599.
  Batch 1,320  of  20,599.
  Batch 1,360  of  20,599.
  Batch 1,400  of  20,599.
  Batch 1,440  of  20,599.
  Batch 1,480  of  20,599.
 

{'accuracy': 0.7731870865950604}