# Inference Demo BERT

In [None]:
# Inference Demo Code - Solution C - BERT

# Please install the following libraries before running the code
# !pip install torch
# !pip install transformers
# !pip install pandas

# Mount Google Drive
# The model checkpoint, hyperparameter JSON and test CSV used later in this
# notebook are all read from paths under /content/drive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import re
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

# Passed to the tokenizer as max_length together with padding='max_length' and
# truncation=True, so every encoded input is padded or cut to this length.
MAX_LEN = 128
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Text cleaning
def clean_text(text):
  """Expand common English contractions, mask URLs and normalise whitespace.

  The rewrites are applied strictly in the order listed below. The order
  matters: the specific "can't" / "won't" rules must run before the generic
  "n't" rule. All contraction rules are case-sensitive except the "'s" rule,
  which ignores case and only fires after a fixed list of pronouns and
  question words (so possessives such as "John's" are left untouched).

  Args:
    text: Input string.

  Returns:
    The rewritten string, with every URL-like token replaced by "[URL]",
    runs of whitespace collapsed to a single space, and leading/trailing
    whitespace removed.
  """
  # (pattern, replacement, regex flags), applied top to bottom.
  rewrites = (
      (r"can't\b", "cannot", 0),
      (r"won't\b", "will not", 0),
      (r"n't\b", " not", 0),
      (r"'re\b", " are", 0),
      (r"'m\b", " am", 0),
      (r"'ve\b", " have", 0),
      (r"'ll\b", " will", 0),
      (r"'d\b", " would", 0),
      (r"\b(he|she|it|that|what|who|there|where|why|when)'s\b", r"\1 is", re.IGNORECASE),
      (r"http\S+|www\S+|https\S+", "[URL]", 0),
  )
  for pattern, replacement, flags in rewrites:
    text = re.sub(pattern, replacement, text, flags=flags)

  # Collapse whitespace last so gaps left by the rewrites are tidied up too.
  collapsed = re.sub(r"\s+", " ", text)
  return collapsed.strip()

# Custom Dataset class
class EDDataset(Dataset):
  """Dataset that tokenizes one text per item, with an optional label.

  Args:
    texts: Indexable collection of input strings.
    labels: Optional indexable collection of integer labels aligned with
      `texts`. When omitted, items carry no 'labels' entry.
    tokenizer: Callable tokenizer. If a falsy value is given, the
      'bert-base-uncased' BertTokenizer is loaded instead.
  """

  def __init__(self, texts, labels=None, tokenizer=None):
    self.texts = texts
    self.labels = labels
    # Fall back to the stock uncased BERT tokenizer when none is supplied.
    self.tokenizer = tokenizer or BertTokenizer.from_pretrained('bert-base-uncased')

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.texts)

  def __getitem__(self, index):
    """Tokenize the text at `index` and return its tensors as a dict.

    The result always has 'input_ids' and 'attention_mask'; 'labels' (a
    torch.long tensor) is added only when labels were provided.
    """
    encoded = self.tokenizer(
        self.texts[index],
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # squeeze(0) drops the leading dimension of each returned tensor
    # (presumably the size-1 batch axis for a single text).
    sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
    if self.labels is None:
      return sample
    sample['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
    return sample


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prediction Function

In [None]:
def predict(model, dataloader, device):
  """Run the model over every batch and return the stacked logits.

  Args:
    model: Callable model that accepts `input_ids` and `attention_mask`
      keyword arguments and returns an object with a `.logits` attribute.
    dataloader: Iterable of batches; each batch is a mapping with
      'input_ids' and 'attention_mask' tensors.
    device: Device the batch tensors are moved to before the forward pass.

  Returns:
    One tensor holding the logits of all batches concatenated along
    dimension 0, in iteration order.
  """
  model.eval()
  # No gradients are needed for inference; disabling them saves time and memory.
  with torch.no_grad():
    per_batch_logits = [
        model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        for batch in dataloader
    ]
  return torch.cat(per_batch_logits, dim=0)

# Run Prediction

In [None]:
if __name__ == '__main__':
  # End-to-end inference: load the fine-tuned checkpoint, encode the test CSV,
  # predict a class per row and write the predictions next to the model.
  model_dir = '/content/drive/My Drive/nlu-lab/bert_model'
  model_path = os.path.join(model_dir, 'best_model_bert.pt')
  best_param_path = os.path.join(model_dir, 'best_params_bert.json')

  # Load best hyperparameters (only 'batch_size' is used at inference time)
  with open(best_param_path, 'r', encoding='utf-8') as f:
    params = json.load(f)

  # Load the fine-tuned model.
  # from_pretrained() builds the base architecture with a fresh classification
  # head (hence the "newly initialized" warning); load_state_dict() then
  # overwrites all weights with the fine-tuned checkpoint.
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
  # NOTE(review): torch.load unpickles the file, so only load checkpoints from a
  # trusted source (consider weights_only=True where the torch version supports it).
  model.load_state_dict(torch.load(model_path, map_location=device))
  model.to(device)
  model.eval()

  # Load test data (e.g., dev-nonlabel.csv)
  test_file_path = '/content/drive/My Drive/nlu-lab/test.csv'
  test_df = pd.read_csv(test_file_path)

  # Clean and tokenize input.
  # Empty CSV cells are read as NaN; concatenating them would yield NaN and make
  # clean_text fail inside re.sub, so treat missing cells as empty strings.
  claim_text = test_df['Claim'].fillna('').astype(str)
  evidence_text = test_df['Evidence'].fillna('').astype(str)
  test_df['combined_text'] = (claim_text + " " + evidence_text).apply(clean_text)
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  test_dataset = EDDataset(test_df['combined_text'].tolist(), tokenizer=tokenizer)
  # shuffle=False keeps predictions in the same row order as the input CSV
  test_dataloader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

  # Perform prediction: the predicted class is the index of the largest logit
  test_logits = predict(model, test_dataloader, device)
  test_pred = torch.argmax(test_logits, dim=1)

  # Save predictions to CSV
  output_path = os.path.join(model_dir, 'Group_21_C.csv')
  results = pd.DataFrame({'prediction': test_pred.cpu().numpy()})
  results.to_csv(output_path, index=False)
  print(f"Results saved to: {output_path}")
  print(results.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results saved to: /content/drive/My Drive/nlu-lab/bert_model/Group_21_C.csv
   prediction
0           1
1           0
2           0
3           0
4           0
