In [1]:
from datasets import load_dataset
import pandas as pd

# Load the QNLI task of the GLUE benchmark (train / validation / test splits).
datasets = load_dataset('glue', 'qnli')

# One DataFrame per split, each without the 'idx' column.
train_data, validation_data, test_data = (
    pd.DataFrame(datasets[split_name]).drop(columns=['idx'])
    for split_name in ('train', 'validation', 'test')
)

print(datasets)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})


In [2]:
# Convert the DataFrame columns to plain Python lists (the form used when tokenizing).
# Only the first 25000 training rows and the first 3000 validation rows are kept.
train_questions, train_sentences, train_labels = (
    train_data[column].iloc[:25000].tolist()
    for column in ('question', 'sentence', 'label')
)
eval_question, eval_sentences, eval_labels = (
    validation_data[column].iloc[:3000].tolist()
    for column in ('question', 'sentence', 'label')
)

print(len(train_questions))
print(type(train_questions))

25000
<class 'list'>


In [17]:
from transformers import BertTokenizerFast
# Cased BERT tokenizer with its maximum sequence length set to 128.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', model_max_length=128)

# Encode every (question, sentence) pair with truncation and 'max_length' padding.
train_encodings = tokenizer(
    train_questions,
    train_sentences,
    truncation=True,
    padding='max_length',
)
eval_encodings = tokenizer(
    eval_question,
    eval_sentences,
    truncation=True,
    padding='max_length',
)

def add_targets(encodings, label):
  """Store `label` in `encodings` under the 'label' key (modifies `encodings` in place)."""
  encodings['label'] = label

# Attach the label lists to their matching encodings.
for encodings_and_labels in (
    (train_encodings, train_labels),
    (eval_encodings, eval_labels),
):
  add_targets(*encodings_and_labels)

print(train_encodings.keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'label'])


In [4]:
from torch.utils import data
import torch

class Dataset(torch.utils.data.Dataset):
  """Map-style dataset that serves encodings one example at a time.

  `encodings` is a mapping from field name to a per-example sequence. It must
  contain an 'input_ids' entry, whose length is used as the dataset length.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    """Return example `idx` as a dict holding one tensor per field."""
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    """Return the number of examples (length of the 'input_ids' field)."""
    # Key lookup rather than attribute access: __getitem__ only needs a
    # mapping, so a plain dict must work here too.
    return len(self.encodings['input_ids'])



In [14]:
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
import torch

# Base checkpoint name (only referenced by the disabled alternatives noted below)
# and the local weights file, located one directory above the working directory.
model_name = 'bert-base-cased'
model_path = './../bert-QNLI-pretrained.bin'

# Read the saved config with a two-label classification head, then build the
# model from the local weights file using that config.
config = BertConfig.from_pretrained("./../bert-QNLI-config.json", num_labels=2)
model = BertForSequenceClassification.from_pretrained(
    model_path,
    config=config,
)

# Disabled alternatives kept for reference:
#   tokenizer = BertTokenizer.from_pretrained(model_name)
#   model = BertForSequenceClassification.from_pretrained(
#       model_name,
#       state_dict=torch.load(model_path, map_location=torch.device('cpu')),
#       num_labels=2)


In [None]:

# Read the saved weights onto the CPU, then copy them into the model.
cpu_state_dict = torch.load(model_path, map_location=torch.device('cpu'))
model.load_state_dict(cpu_state_dict)

In [18]:
import logging
import datasets
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
import math

import transformers
from accelerate import Accelerator
from transformers import (
    AdamW,
    AutoConfig,
    default_data_collator,
    get_scheduler
)

# Number of examples per batch for each loader.
train_batch_size = 10
eval_batch_size = 10

# Batches are assembled with the Transformers default collator.
data_collator = default_data_collator

train_dataset = Dataset(train_encodings)
eval_dataset = Dataset(eval_encodings)

# Only the training loader shuffles its examples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=eval_batch_size,
    collate_fn=data_collator,
)

In [19]:
sentence1 = "This is the first sentence."
sentence2 = "This is the second sentence."
# Encoded single sentence pair; only the disabled sanity check below consumes it.
inputs = tokenizer(sentence1, sentence2, return_tensors='pt', padding=True, truncation=True)


# Disabled single-example sanity check:
# model.eval()
# with torch.no_grad():
#     outputs = model(**inputs)
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     print(f"Predicted label: {predictions.item()}")

print("***** Running eval *****")
model.eval()

# Gold labels and predicted class indices, accumulated over all eval batches.
labels = []
predictions = []

# Pure inference: disable gradient tracking so no autograd graph is built for
# each forward pass (saves time and memory; the logits are unchanged).
with torch.no_grad():
  for step, batch in enumerate(tqdm(eval_dataloader, desc="Eval Iteration")):
    outputs = model(**batch)
    # Predicted class = index of the largest logit along the last dimension.
    predicted = outputs.logits.argmax(dim=-1)

    labels.extend(batch["labels"].tolist())
    predictions.extend(predicted.tolist())


***** Running eval *****


Eval Iteration:   3%|▎         | 8/300 [00:24<15:07,  3.11s/it]


KeyboardInterrupt: 