In [109]:
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast
import pandas as pd
import random
from torch.utils.data import DataLoader
import torch

In [113]:
class nluDataset(Dataset):
    """Joint intent-classification / slot-filling dataset read from a TSV file.

    The file is read with ``pandas.read_csv(..., sep='\\t')`` and must provide
    the columns accessed below: ``TEXT``, ``INTENT``, ``INTENT_ID``, ``SLOTS``
    and ``SLOTS_ID``. ``SLOTS_ID`` is assumed to hold one integer label per
    whitespace-separated word of ``TEXT``, joined by spaces -- TODO confirm
    against the data files.

    Args:
        file_dir: Path of the tab-separated data file.
        tokenizer: Name or path handed to
            ``DistilBertTokenizerFast.from_pretrained``.
        max_len: Every example is truncated / padded to this many tokens.
        device: Accepted for interface compatibility; not used by the dataset.
    """

    # Label written to positions that carry no slot label (special tokens,
    # padding, non-initial sub-word pieces). -100 is the default
    # ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(self, file_dir, tokenizer, max_len, device):
        self.data = pd.read_csv(file_dir, sep='\t')
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer)
        self.max_len = max_len

    def processSlotLabel(self, word_ids, slot_ids):
        """Expand word-level slot labels to token-level labels.

        Args:
            word_ids: One entry per token as returned by the tokenizer's
                ``word_ids()``: the index of the word the token belongs to,
                or ``None`` for special / padding tokens.
            slot_ids: Space-separated integer slot labels, one per word
                (a single bare integer is accepted as well).

        Returns:
            A list with ``len(word_ids)`` integers. The first token of every
            word receives that word's slot label; every other position
            receives ``IGNORE_INDEX``.

        Raises:
            ValueError: If a token refers to a word for which ``slot_ids``
                contains no label.
        """
        # str() guards against a single-label cell that was parsed as an int;
        # split() without an argument tolerates repeated whitespace.
        word_labels = [int(label) for label in str(slot_ids).split()]

        new_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Special or padding token.
                new_labels.append(self.IGNORE_INDEX)
                continue

            if word_idx == previous_word_idx:
                # Continuation piece of a word that is already labelled.
                new_labels.append(self.IGNORE_INDEX)
            else:
                if word_idx >= len(word_labels):
                    raise ValueError(
                        "token refers to word index %d but only %d slot labels "
                        "were provided" % (word_idx, len(word_labels))
                    )
                new_labels.append(word_labels[word_idx])

            previous_word_idx = word_idx

        return new_labels

    def __getitem__(self, index):
        """Return the encoded example at ``index`` as a dict.

        Keys: ``token_ids`` and ``mask`` (long tensors of length ``max_len``),
        ``intent_id`` (long scalar tensor), ``slots_id`` (long tensor of
        length ``max_len``), plus the raw ``intent_label`` and ``slots_label``
        values taken from the data file.
        """
        text = str(self.data.TEXT[index])
        # Tokenize the whitespace-separated words explicitly so that
        # word_ids() indexes the same units the slot labels are defined on
        # (otherwise the tokenizer's own pre-tokenization, which also splits
        # on punctuation, decides what a "word" is).
        words = text.split()

        inputs = self.tokenizer.encode_plus(
            words,
            None,
            add_special_tokens=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            is_split_into_words=True,
        )

        # text encoding
        token_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)
        word_ids = inputs.word_ids()

        # intent
        intent_id = torch.tensor(self.data.INTENT_ID[index], dtype=torch.long)
        intent_label = self.data.INTENT[index]

        # slot labels, aligned to the sub-word tokens
        slot_label = self.data.SLOTS[index]
        slot_id = self.processSlotLabel(word_ids, self.data.SLOTS_ID[index])
        slot_id = torch.tensor(slot_id, dtype=torch.long)

        return {
            'token_ids': token_ids,
            'mask': mask,
            'intent_id': intent_id,
            'slots_id' : slot_id,
            'intent_label': intent_label,
            'slots_label' : slot_label
        }

    def __len__(self):
        """Number of rows in the underlying data file."""
        return len(self.data)

In [114]:
# Build the dataset from the English training split, padding/truncating
# every example to 56 tokens.
ds = nluDataset(
    file_dir='../data/multiATIS/splits/train_EN.tsv',
    tokenizer='distilbert-base-multilingual-cased',
    max_len=56,
    device=1,
)

In [115]:
# Wrap the dataset in a loader yielding batches of 32 examples
# (no shuffle argument is passed, so the loader's default order is used).
dl = DataLoader(dataset=ds, batch_size=32)

In [116]:
# Smoke test: pull every batch once so that any example that fails to
# encode or collate surfaces here. The loop body is only a placeholder.
for batch in dl:
    a=1

In [33]:
# Fetch the first encoded example for inspection in the cells below.
# NOTE(review): the token / word-id dump recorded under this cell looks like
# the result of a debug print inside __getitem__ that the current class
# definition does not execute -- re-run to refresh the output.
sample = ds[0]

['[CLS]', 'how', 'much', 'does', 'the', 'american', 'airline', '##s', 'flight', '71', 'from', 'dalla', '##s', 'to', 'san', 'franc', '##is', '##co', 'cost', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'] [None, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10, 11, 12, 12, 12, 13, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [34]:
# Token ids of the padded sequence and its length (max_len was set to 56).
sample['token_ids'],len(sample['token_ids'])

(tensor([  101, 14796, 13172, 15107, 10105, 50513, 67878, 10107, 23578, 12513,
         10188, 11353, 10107, 10114, 14608, 63184, 10291, 10812, 18849,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 56)

In [35]:
# Attention mask of the same example and its length.
sample['mask'],len(sample['mask'])

(tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 56)

In [36]:
# Token-level slot labels of the same example and their count.
# NOTE(review): the recorded output is a plain list that also labels
# non-initial sub-word pieces, whereas __getitem__ returns a tensor; the
# output appears to predate the current class definition -- re-run to refresh.
sample['slots_id'],len(sample['slots_id'])

([-100,
  24,
  24,
  24,
  24,
  65,
  66,
  66,
  24,
  125,
  24,
  79,
  79,
  24,
  73,
  74,
  74,
  74,
  24,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 56)

In [52]:
# Scratch check: iterating over a[1:] visits every element except the first.
# The loop variable is given a real name because binding `_` in a notebook
# shadows IPython's last-output variable; the unused enumerate index is dropped.
a = [1, 2, 3, 4, 5, 6]
for value in a[1:]:
    print(value)


2
3
4
5
6
