In [57]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoModel, BertTokenizer
from transformers import Trainer, TrainingArguments, HfArgumentParser
from datasets import Dataset, IterableDataset
from torch.utils.data import DataLoader

In [58]:
RANDOM_SEED = 142
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
PRE_TRAINED_MODEL_NAME = 'deepset/gbert-base'
model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    use_fast=True
    )
BATCH_SIZE = 8
MAX_LEN = 512


Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
class QnADataset(Dataset):
    def __init__(self, question, answer, targets, tokenizer, max_length):
        self.question = question
        self.answer = answer
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = int(max_length)
        self.doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
        #self.q_len = int(q_len)
        #self.a_len = int(a_len)
    
    def __len__(self):
        return len(self.answer)

    def __getitem__(self, item):
        question = str(self.question[item])
        context = str(self.answer[item])
        target = self.targets[item]
        
        encoding = tokenizer(
            question,
            context,
            max_length = self.max_length,
            add_special_tokens = True,
            padding='max_length',
            truncation='only_second',
            return_attention_mask = True,
            return_token_type_ids = False,
            return_tensors = 'pt',
            #return_overflowing_tokens=True,
            #return_offsets_mapping=True,
            #stride=self.doc_stride
        )
        return {
            #'question_text': question,
            #'answer_text': answer,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [63]:
data = pd.read_csv('../data/faq_info_labels.csv')
ds = QnADataset(data['question'], data['answer'], data['matching'], tokenizer, MAX_LEN)
dataloader = DataLoader(ds, batch_size=32, num_workers=4, shuffle=True)

TypeError: __init__() missing 2 required positional arguments: 'info' and 'split'

In [61]:
for batch in dataloader:
    print(batch) 

BrokenPipeError: [Errno 32] Broken pipe