In [2]:
import os
import ast
import numpy as np
import pandas as pd
import torch

from transformers import BertTokenizer, BertForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw test split; the relative path is resolved against the kernel's working directory.
test_df = pd.read_csv("../data/raw/test.csv")
# Preview the first rows to confirm the expected columns are present.
test_df.head()


Unnamed: 0,id,prompt,response_a,response_b
0,136060,"[""I have three oranges today, I ate an orange ...","[""You have two oranges today.""]","[""You still have three oranges. Eating an oran..."
1,211333,"[""You are a mediator in a heated political deb...","[""Thank you for sharing the details of the sit...","[""Mr Reddy and Ms Blue both have valid points ..."
2,1233961,"[""How to initialize the classification head wh...","[""When you want to initialize the classificati...","[""To initialize the classification head when p..."


unwrap and unicode

In [4]:
def unwrap(x):
    """Flatten a stringified list of strings into one space-joined string.

    Parameters
    ----------
    x : Any
        A cell value. Strings are parsed with ``ast.literal_eval``; any other
        type is passed through untouched.

    Returns
    -------
    Any
        * ``" ".join(items)`` when ``x`` is a string that parses to a list
          whose items are all strings (an empty list gives ``""``).
        * ``x`` itself, unchanged, in every other case. In particular a string
          input always yields a string output, so the result can be safely
          concatenated with other text.
    """
    if not isinstance(x, str):
        return x

    try:
        # Parse into a separate name so the original text is never lost.
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (ordinary text, JSON-only tokens, ...): keep as is.
        return x

    # Only a list made purely of strings can be joined; anything else
    # (a number, dict, tuple, or a list holding non-strings) keeps the raw text
    # instead of leaking the parsed object to the caller.
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return " ".join(parsed)
    return x


In [5]:
# Flatten the stringified prompt list into plain text via unwrap().
test_df["prompt_text"] = test_df["prompt"].apply(unwrap)

# Build the single text sequence that is later handed to the tokenizer.
# NOTE(review): only the prompt is unwrapped; response_a / response_b are
# concatenated in their raw stringified-list form (brackets and quotes included).
# Presumably this mirrors the training-time preprocessing -- confirm before changing.
test_df["input_text"] = (
    "Prompt: " + test_df["prompt_text"] +
    " Response A: " + test_df["response_a"] +
    " Response B: " + test_df["response_b"]
)


In [6]:
def clean_unicode(text):
    """Strip characters that cannot be encoded as UTF-8 from a string.

    Non-string values are returned untouched. For strings, the text is
    round-tripped through UTF-8 with ``errors="ignore"``, which silently drops
    anything unencodable (for example lone surrogate code points).
    """
    if not isinstance(text, str):
        return text

    utf8_bytes = text.encode("utf-8", errors="ignore")
    return utf8_bytes.decode("utf-8")

# Run the cleanup over every assembled input string, overwriting the column.
test_df["input_text"] = test_df["input_text"].apply(clean_unicode)


In [7]:
# Sanity check: show the first 300 characters of the input text for the row labelled 0.
print(test_df["input_text"][0][:300])


Prompt: I have three oranges today, I ate an orange yesterday. How many oranges do I have? Response A: ["You have two oranges today."] Response B: ["You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today."]


In [8]:
# Load the stock bert-base-uncased tokenizer.
# NOTE(review): assumes the fine-tuned checkpoint loaded below was trained with
# this same tokenizer -- confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize every input string in a single call.
# truncation=True with max_length=512 caps each sequence at 512 tokens;
# padding=True pads the sequences to a common length.
test_encodings = tokenizer(
    test_df["input_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)


In [9]:
class TestDataset(torch.utils.data.Dataset):
    """Label-free dataset that serves tokenizer encodings as tensors.

    ``encodings`` is a mapping from field name to a per-sample sequence; it
    must contain an ``"input_ids"`` entry, which defines the dataset length.
    """

    def __init__(self, encodings):
        # Keep a reference only; tensors are built lazily per item.
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the ``input_ids`` field."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of field name -> ``torch.Tensor``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample


In [10]:
# Wrap the tokenizer output so a DataLoader can batch it.
test_dataset = TestDataset(test_encodings)


In [11]:
# Load the fine-tuned classifier from the local checkpoint directory.
model = BertForSequenceClassification.from_pretrained(
    "../notebooks/results/best_model"
)
# Put the model in evaluation mode for inference; being the cell's last
# expression, this also displays the module tree.
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
# Inference is pinned to the CPU; inputs and model are moved to the same device.
device = torch.device("cpu")
model.to(device)

# Collects one logits tensor per batch, in dataset order.
all_logits = []

# Gradients are not needed for prediction, so disable autograd tracking.
with torch.no_grad():
    for batch in torch.utils.data.DataLoader(
        test_dataset, batch_size=4
    ):
        # Move every tensor of the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Keep the raw logits on the CPU so they can be concatenated afterwards.
        all_logits.append(outputs.logits.cpu())


In [13]:
# Stack the per-batch logits along the sample axis and convert to a NumPy array.
logits = torch.cat(all_logits, dim=0).numpy()


In [14]:
# Turn the logits into per-class probabilities (softmax over the last axis).
probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()


In [15]:
# Each row of `probs` must correspond to the test_df row at the same position.
# Fail loudly if the counts ever diverge instead of writing a corrupt file.
assert len(probs) == len(test_df), (
    f"got {len(probs)} prediction rows for {len(test_df)} test rows"
)

# NOTE(review): assumes the model's output classes are ordered
# (response A wins, response B wins, tie) -- confirm against the training labels.
submission_df = pd.DataFrame(
    probs,
    columns=[
        "prob_response_a_wins",
        "prob_response_b_wins",
        "prob_tie",
    ],
)

# Attach ids by position, not by index label: assigning the Series directly
# would align on the index and silently produce NaN / misplaced ids if test_df
# no longer has its default 0..n-1 index (e.g. after filtering or sorting).
submission_df["id"] = test_df["id"].to_numpy()

# Put the id column first.
submission_df = submission_df[
    ["id", "prob_response_a_wins", "prob_response_b_wins", "prob_tie"]
]


In [16]:
# Where the predictions are written (relative to the kernel's working directory).
output_dir = "../outputs/predictions"
submission_path = os.path.join(output_dir, "submission.csv")

# Create the folder if needed, then write the CSV without the index column.
os.makedirs(output_dir, exist_ok=True)
submission_df.to_csv(submission_path, index=False)

# Display the final path as the cell output.
submission_path


'../outputs/predictions\\submission.csv'