In [40]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import PreTrainedTokenizerFast
from torch.utils.data import Dataset

from tqdm import tqdm

In [41]:
class TextDataset(Dataset):
    """In-memory dataset holding one text example per line of a file.

    Every line of the UTF-8 file at ``file_path`` is stripped of leading and
    trailing whitespace and stored in ``self.examples``. Blank lines are kept
    as empty strings, so the dataset length equals the number of lines read.
    """

    def __init__(self, file_path):
        """Read ``file_path`` line by line, showing a tqdm progress bar.

        Args:
            file_path: Path of the UTF-8 text file to load.

        Raises:
            OSError: If the file cannot be opened.
        """
        with open(file_path, encoding='utf-8') as f:
            # A single pass over the file; no separate line counter is needed
            # because tqdm already reports the number of lines processed.
            self.examples = [line.strip() for line in tqdm(f, desc="Loading Dataset")]

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example(s) at ``idx``.

        ``idx`` is forwarded to the underlying list, so an integer yields one
        string and a slice yields a list of strings.
        """
        return self.examples[idx]

In [42]:
def get_training_corpus(corpus=None, batch_size=1000):
    """Yield consecutive batches of training examples.

    Args:
        corpus: Any object supporting ``len()`` and slicing (for example a
            list of strings). When omitted, the notebook-level ``dataset``
            variable is used, which keeps existing no-argument calls working.
        batch_size: Maximum number of examples per yielded batch.

    Yields:
        ``corpus[start:start + batch_size]`` for each successive ``start``;
        the final batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
        NameError: If ``corpus`` is omitted and no global ``dataset`` exists
            when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if corpus is None:
        # Backward-compatible fallback. This is a generator, so the global is
        # only looked up when the first batch is requested.
        corpus = dataset
    for start in range(0, len(corpus), batch_size):
        yield corpus[start:start + batch_size]

In [43]:
# Word-level tokenizer with "[UNK]" configured as the model's unknown token.
# In the visible cells it is only trained and then inspected via get_vocab().
predefined_tokenizer = Tokenizer(model=WordLevel(unk_token="[UNK]"))
# Use WhitespaceSplit as the pre-tokenizer; underscore-joined tokens such as
# 'Back_Home' appear intact in the learned vocabulary printed further down.
predefined_tokenizer.pre_tokenizer = WhitespaceSplit()

In [44]:
# Load the corpus; TextDataset stores one stripped line per example.
dataset = TextDataset('data/Tokyo2008PTChainSummary.txt')

# Train with a WordLevelTrainer created without any options. Batches come from
# get_training_corpus(), which is called without arguments and therefore reads
# the `dataset` global assigned above -- this cell must run before training.
# NOTE: the vocabulary printed below contains no "[UNK]" entry.
trainer = WordLevelTrainer()
predefined_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
# Last expression of the cell: display the learned token -> id mapping.
predefined_tokenizer.get_vocab()

Loading Dataset: 576806it [00:00, 762834.45it/s]


{'Delivering': 33,
 'Housewife': 18,
 'Back_Home': 3,
 'Private_Space': 5,
 'Traffic_Worker': 51,
 'Business_Place': 4,
 'Go_Eat': 15,
 '5': 41,
 'Go_Occupation': 38,
 '50': 35,
 'Shopping_Daily': 12,
 '30': 29,
 'Office': 1,
 'Pickup_Drop_Off': 31,
 '25': 37,
 'Socializing': 7,
 'Technical_Worker': 19,
 'Store_Daily': 8,
 'Go_Other_Business': 24,
 'Security_Worker': 55,
 'Student': 17,
 'Service_Worker': 28,
 '20': 42,
 '15': 43,
 'Attend_Meeting': 21,
 'nan': 54,
 '80': 48,
 'Unemployed': 16,
 'House': 0,
 'Go_Sightseeing': 34,
 'Tourist_Spot': 13,
 'Unclear': 53,
 'Go_School': 11,
 'Private_Movement': 14,
 'Labor_Worker': 47,
 'Office_Worker': 22,
 '65': 30,
 'Managerial_Worker': 39,
 'Female': 9,
 'Welcoming': 20,
 '10': 40,
 'Other_Occupation': 49,
 '75': 44,
 '60': 26,
 'Sales_Worker': 45,
 '35': 23,
 'Male': 10,
 '45': 32,
 'Agriculture_Worker': 52,
 'School': 2,
 'Go_Agricultural_Work': 56,
 'Commute': 6,
 '55': 27,
 'Natural_Area': 46,
 '85': 50,
 '70': 36,
 '40': 25}

In [45]:
# Show the learned vocabulary again, this time ordered by ascending token id.
vocab = predefined_tokenizer.get_vocab()
sorted_vocab = dict(sorted(vocab.items(), key=lambda entry: entry[1]))
sorted_vocab

{'House': 0,
 'Office': 1,
 'School': 2,
 'Back_Home': 3,
 'Business_Place': 4,
 'Private_Space': 5,
 'Commute': 6,
 'Socializing': 7,
 'Store_Daily': 8,
 'Female': 9,
 'Male': 10,
 'Go_School': 11,
 'Shopping_Daily': 12,
 'Tourist_Spot': 13,
 'Private_Movement': 14,
 'Go_Eat': 15,
 'Unemployed': 16,
 'Student': 17,
 'Housewife': 18,
 'Technical_Worker': 19,
 'Welcoming': 20,
 'Attend_Meeting': 21,
 'Office_Worker': 22,
 '35': 23,
 'Go_Other_Business': 24,
 '40': 25,
 '60': 26,
 '55': 27,
 'Service_Worker': 28,
 '30': 29,
 '65': 30,
 'Pickup_Drop_Off': 31,
 '45': 32,
 'Delivering': 33,
 'Go_Sightseeing': 34,
 '50': 35,
 '70': 36,
 '25': 37,
 'Go_Occupation': 38,
 'Managerial_Worker': 39,
 '10': 40,
 '5': 41,
 '20': 42,
 '15': 43,
 '75': 44,
 'Sales_Worker': 45,
 'Natural_Area': 46,
 'Labor_Worker': 47,
 '80': 48,
 'Other_Occupation': 49,
 '85': 50,
 'Traffic_Worker': 51,
 'Agriculture_Worker': 52,
 'Unclear': 53,
 'nan': 54,
 'Security_Worker': 55,
 'Go_Agricultural_Work': 56}

In [52]:
# Hand-ordered vocabulary: each token's id is its position in the list below,
# so related tokens (activities, occupations, gender, age) get adjacent ids.
# The special tokens [UNK], [EOS] and [PAD] are deliberately not listed here;
# they are registered when the fast tokenizer is built.
predefined_vocab = {
    token: token_id
    for token_id, token in enumerate([
        # id 0
        'nan',
        # Activities (ids 1-22)
        'House',
        'Commute',
        'Office',
        'Go_School',
        'School',
        'Back_Home',
        'Attend_Meeting',
        'Go_Occupation',
        'Delivering',
        'Go_Other_Business',
        'Business_Place',
        'Private_Movement',
        'Private_Space',
        'Go_Eat',
        'Socializing',
        'Shopping_Daily',
        'Store_Daily',
        'Welcoming',
        'Pickup_Drop_Off',
        'Go_Sightseeing',
        'Tourist_Spot',
        'Natural_Area',
        # Occupation (ids 23-37)
        'Office_Worker',
        'Technical_Worker',
        'Service_Worker',
        'Managerial_Worker',
        'Sales_Worker',
        'Security_Worker',
        # NOTE(review): 'Go_Agricultural_Work' looks like an activity token but
        # sits among the occupations; its id (29) is kept unchanged -- confirm.
        'Go_Agricultural_Work',
        'Agriculture_Worker',
        'Labor_Worker',
        'Traffic_Worker',
        'Other_Occupation',
        'Student',
        'Housewife',
        'Unemployed',
        'Unclear',
        # Gender (ids 38-39)
        'Male',
        'Female',
        # Age tokens '5', '10', ..., '85' (ids 40-56)
        *(str(age) for age in range(5, 90, 5)),
    ])
}

In [53]:
# Special tokens of the final tokenizer, keyed by the attribute names that
# PreTrainedTokenizerFast.add_special_tokens() expects.
SPECIAL_TOKENS = {'pad_token': '[PAD]', 'eos_token': '[EOS]', 'unk_token': '[UNK]'}

# Put the special tokens into the model vocabulary itself, with ids continuing
# after the regular tokens in the order listed above. The WordLevel model can
# only fall back to an unknown token that exists in its own vocabulary; without
# this, encoding any out-of-vocabulary word raises
# "Missing [UNK] token from the vocabulary".
model_vocab = dict(predefined_vocab)
for special_token in SPECIAL_TOKENS.values():
    if special_token not in model_vocab:
        model_vocab[special_token] = max(model_vocab.values(), default=-1) + 1

new_tokenizer = Tokenizer(
    WordLevel(vocab=model_vocab, unk_token=SPECIAL_TOKENS['unk_token'])
)
new_tokenizer.pre_tokenizer = WhitespaceSplit()

# Wrap for use with transformers and flag the tokens as special; they already
# have ids in the model vocabulary, so no new ids are allocated here.
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_tokenizer)
fast_tokenizer.add_special_tokens(SPECIAL_TOKENS)
# Last expression of the cell: write the tokenizer files and display their paths.
fast_tokenizer.save_pretrained("PTtokenizer/Tokyo")

('PTtokenizer/Tokyo/tokenizer_config.json',
 'PTtokenizer/Tokyo/special_tokens_map.json',
 'PTtokenizer/Tokyo/tokenizer.json')