## Google Colab仕様

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/
!tar -xvf /home/topics.tar.bz2

/content/drive/MyDrive
topics/
topics/経済/
topics/経済/2016-09-30/
topics/経済/2016-09-30/topic-0009/
topics/経済/2016-09-30/topic-0009/utterance.xml
topics/経済/2016-09-30/topic-0009/utterance_192KHz.wav
topics/経済/2016-09-30/topic-0009/utterance_48KHz.wav
topics/経済/2016-09-30/topic-0009/utterance_16KHz.wav
topics/経済/2016-09-15/
topics/経済/2016-09-15/topic-0036/
topics/経済/2016-09-15/topic-0036/utterance.xml
topics/経済/2016-09-15/topic-0036/utterance_192KHz.wav
topics/経済/2016-09-15/topic-0036/utterance_48KHz.wav
topics/経済/2016-09-15/topic-0036/utterance_16KHz.wav
topics/経済/2016-09-17/
topics/経済/2016-09-17/topic-0018/
topics/経済/2016-09-17/topic-0018/utterance.xml
topics/経済/2016-09-17/topic-0018/utterance_192KHz.wav
topics/経済/2016-09-17/topic-0018/utterance_48KHz.wav
topics/経済/2016-09-17/topic-0018/utterance_16KHz.wav
topics/経済/2016-09-29/
topics/経済/2016-09-29/topic-0014/
topics/経済/2016-09-29/topic-0014/utterance.xml
topics/経済/2016-09-29/topic-0014/utterance_192KHz.wav
topics/経済/2016-09-29/topic-001

In [None]:
!pip install transformers
!pip install evaluate
!pip install fugashi
!pip install ipadic
!pip install accelerate -U
!pip install wandb



In [3]:
%cd /content/drive/MyDrive/topics

/content/drive/MyDrive/topics


## Data preparing

### Load data from xml files

In [4]:
import xml.etree.ElementTree as ET
import pandas as pd
from os import walk
from os.path import join

In [5]:
def get_data_from_xml(id, xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    article = root[0]
    data = []
    for i, sent in enumerate(article):
        text = ""
        pause_list = []
        for j, phrase in enumerate(sent):
            # 文節と文末にポーズ符号"_"追加
            text += phrase.text + "_"
            # ポーズ計算
            if i == len(article) - 1 and j == len(sent) - 1:    # 最後の文の最後の文節
                pause = 0
            else:
                if j == len(sent) - 1:  # 各文の最後の文節
                    next_sent = article[i + 1]
                    next_phrase = next_sent[0]
                else:
                    next_phrase = sent[j + 1]
                pause = (int(next_phrase.attrib['start_time']) - int(phrase.attrib['end_time'])) * pow(10, -3)
            pause_list.append(pause)
        id += 1
        data += [{"id": id, "text": text, "pause_list": pause_list, "file_path": xml_file}]
    return id, data

def get_all_filepath(dir_path):
    file_path = []
    for root, _, files in walk(dir_path):
        for f in files:
            if ".xml" in f:
                file_path.append(join(root, f))
    return file_path

In [6]:
# dir_path = "/mnt/aoni04/hsieh/topics/"
dir_path = "/content/drive/MyDrive/topics/" # for google colab
xml_file_path = get_all_filepath(dir_path)

# get all data
id = 0
all_data = []
for xml_file in xml_file_path:
    id, data = get_data_from_xml(id, xml_file)
    all_data += data

# show all data
df = pd.DataFrame(all_data)
df.head()

KeyboardInterrupt: ignored

In [7]:
len(xml_file_path)

100

In [None]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, train_size=0.7)
eval_df, test_df = train_test_split(eval_df, train_size=0.5)
print('train size', train_df.shape)
print('eval size', eval_df.shape)
print('test size', test_df.shape)


train size (431, 4)
eval size (92, 4)
test size (93, 4)


### Tokenize text

In [None]:
from transformers import AutoTokenizer
# tokenize
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

all_data_pause_list = [data['pause_list'] for data in all_data]
all_data_text_tokenized = tokenizer([data['text'] for data in all_data], padding="longest")
print(tokenizer.decode(all_data_text_tokenized['input_ids'][0]))
print(all_data_text_tokenized['input_ids'][0])

[CLS] espot って _ いう _ スマートフォン とか _ pc の _ 充電 を _ 可能 に する _ 公衆 電源 サービス が _ ある ん だ けど 、 _ その _ サービス の _ 実証 実験 が _ 始まる そう だ よ 。 _ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[2, 3152, 8983, 1907, 6172, 1679, 625, 1679, 23509, 10294, 1679, 1547, 28658, 5, 1679, 16278, 11, 1679, 519, 7, 34, 1679, 12616, 7129, 1645, 14, 1679, 31, 1058, 75, 11218, 6, 1679, 59, 1679, 1645, 5, 1679, 10697, 2458, 14, 1679, 3652, 1778, 75, 54, 8, 1679, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Dataset

### Define Dataset

In [None]:
from torch.utils.data import Dataset
from tqdm import tqdm

class pauseDataset(Dataset):
    def __init__(self, df):
        # define attribute
        self.features = [
            {
                'text': row.text,
                'pause_list': row.pause_list,
            } for row in tqdm(df.itertuples(), total=df.shape[0])
        ]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

In [None]:
train_dataset = pauseDataset(train_df)
eval_dataset = pauseDataset(eval_df)
test_dataset = pauseDataset(test_df)
train_dataset[10]

100%|██████████| 431/431 [00:00<00:00, 166766.15it/s]
100%|██████████| 92/92 [00:00<00:00, 13780.79it/s]
100%|██████████| 93/93 [00:00<00:00, 307383.98it/s]


{'text': 'ｍａｖｉｃ\u3000ｐｒｏで_子供の_撮影を_行ったり、_家族と_共有できる_時間を_増やして、_生活を_変えるなど、_人々に_楽しんでもらうのが_ｄｊｉの_望みだって。_',
 'pause_list': [0.314,
  0.0,
  0.0,
  0.78,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.3,
  0.0,
  0.502,
  0.0,
  0]}

### Define DataCollator

In [None]:
import torch
from transformers import AutoTokenizer

class pauseCollator():
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        examples = {
            'text': list(map(lambda x: x['text'], examples)),
            'pause_list': list(map(lambda x: x['pause_list'], examples))
        }

        encodings = self.tokenizer(examples['text'],
                                   padding='longest',
                                   return_tensors='pt')
        encodings['labels'] = torch.tensor([
                                                self.align_labels_with_tokens(pause_list, input_ids)
                                                for pause_list, input_ids in zip(examples['pause_list'], encodings['input_ids'])
                                            ])
        return encodings

    def align_labels_with_tokens(self, labels, input_ids):
        new_labels = []
        label_idx = 0
        for input_id in input_ids:
            word = tokenizer.decode(input_id)
            if word == '_':
                if label_idx > len(labels) - 1:
                    print(tokenizer.decode(input_ids))
                    print(labels)
                new_labels.append(labels[label_idx])
                label_idx += 1
            else:
                new_labels.append(-100)
        return new_labels

tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
pause_collator = pauseCollator(tokenizer)


### Not use

In [None]:
from torch.utils.data import DataLoader
loader = DataLoader(train_dataset, collate_fn=pause_collator, batch_size=8, shuffle=True)
batch = next(iter(loader))
for k,v in batch.items():
    print(k, v.shape)
batch['labels'][0]

input_ids torch.Size([8, 56])
token_type_ids torch.Size([8, 56])
attention_mask torch.Size([8, 56])
labels torch.Size([8, 56])


tensor([-100.0000, -100.0000, -100.0000, -100.0000, -100.0000,    0.0000,
        -100.0000, -100.0000,    0.4800, -100.0000, -100.0000, -100.0000,
           0.0000, -100.0000, -100.0000, -100.0000, -100.0000,    0.0000,
        -100.0000, -100.0000, -100.0000, -100.0000,    1.4000, -100.0000,
        -100.0000, -100.0000, -100.0000,    0.5050, -100.0000, -100.0000,
        -100.0000, -100.0000,    0.0000, -100.0000, -100.0000, -100.0000,
        -100.0000, -100.0000, -100.0000, -100.0000, -100.0000, -100.0000,
        -100.0000, -100.0000, -100.0000, -100.0000, -100.0000, -100.0000,
        -100.0000, -100.0000, -100.0000, -100.0000, -100.0000, -100.0000,
        -100.0000, -100.0000])

### Not use

In [None]:
class pauseDataset(Dataset):
    def __init__(self, tokenized_texts, pause_lists):
        # define attribute
        self.max_text_len = 180
        self.tokenized_texts = tokenized_texts
        self.pause_lists = pause_lists

    def __len__(self):
        return len(self.tokenized_texts['input_ids'])

    def __getitem__(self, idx):
        input_ids = self.tokenized_texts['input_ids'][idx]
        token_type_ids = self.tokenized_texts['token_type_ids'][idx]
        attention_mask = self.tokenized_texts['attention_mask'][idx]
        pause_list = self.align_labels_with_tokens(self.pause_lists[idx], input_ids)

        item = {    'input_ids': torch.tensor(input_ids),
                    'token_type_ids': torch.tensor(token_type_ids),
                    'attention_mask': torch.tensor(attention_mask),
                    'pause_list': torch.tensor(pause_list)
                }
        return item

    def align_labels_with_tokens(self, labels, input_ids):
        new_labels = []
        label_idx = 0
        for input_id in input_ids:
            word = tokenizer.decode(input_id)
            if word == '/':
                new_labels.append(labels[label_idx])
                label_idx += 1
            else:
                new_labels.append(-100)
        return new_labels


In [None]:
# data set
dataset = pauseDataset(all_data_text_tokenized, all_data_pause_list)

# split dataset into train/valid/test set
train_set, val_set, test_set = torch.utils.data.random_split(dataset=dataset, lengths=[0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(42))
train_set[0]

{'input_ids': tensor([    2, 24407,   396,     5,   465,   426,   104,   558,     9,     6,
           465, 26068,    14,   465,   705,     5,   465,  2267,    12,    28,
           465,  9447,   104,   308,    10,   465,    45,    11,   465,  2214,
           465,  5343, 23603,   465,  3083,  7134,     8,   465,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Model

### Define Metric

In [None]:
import evaluate

# regression problem
metric = evaluate.load('mse')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = logits.squeeze()

    flatten = lambda x: [z for y in x for z in (flatten(y) if hasattr(y, '__iter__') and not isinstance(y, str) else (y,))]
    true_labels = flatten([[l for l in label if l != -100] for label in labels])
    true_predictions = flatten([
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ])

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    print(true_labels)
    print(true_predictions)
    return {"mse": all_metrics["mse"]}


Downloading builder script:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

### Define Model

In [None]:
from transformers import AutoModelForTokenClassification

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForTokenClassification.from_pretrained("cl-tohoku/bert-base-japanese", num_labels=1).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

### Training Args

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir='/content/drive/MyDrive/topics/output',
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    lr_scheduler_type='constant',
    learning_rate=2e-5,
    metric_for_best_model='mse',
    num_train_epochs=20,
    weight_decay=0.01,
    remove_unused_columns=False,
    report_to='wandb'
)

### Trainer

In [None]:
from torch.nn import MSELoss
from transformers import Trainer

class RegressionTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    labels = inputs.pop("labels")
    ignore_idxs = torch.tensor([[0 if l == -100 else 1 for l in label ]for label in labels]).to(device)

    activate_labels = labels * ignore_idxs

    outputs = model(**inputs)
    logits = outputs.get('logits')
    predictions = logits.squeeze()
    activate_predictions = predictions * ignore_idxs

    loss_fct = MSELoss()
    loss = loss_fct(activate_predictions, activate_labels)

    # print(loss)
    return (loss, outputs) if return_outputs else loss

In [None]:
trainer = RegressionTrainer(
  model=model,
  args=args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset,
  data_collator=pause_collator,
  compute_metrics=compute_metrics,
  tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Mse
1,0.0311,0.021785,0.157518
2,0.0207,0.026597,0.190622
3,0.0173,0.02026,0.147652
4,0.0136,0.024728,0.177357
5,0.0098,0.022207,0.161224
6,0.0091,0.023286,0.171413
7,0.0063,0.021695,0.157061
8,0.0045,0.02046,0.149307
9,0.0044,0.019384,0.140274
10,0.0031,0.019654,0.141882


[0.0, 0.0, 0.332, 0.024, 0.045, 0.0, 0.437, 0.0, 0.0, 0.299, 0.048, 0.0, 0.0, 2.26, 0.352, 0.0, 0.0, 0.0, 2.736, 0.0, 0.292, 1.104, 0.0, 0.0, 0.0, 0.48, 0.0, 0.0, 0.325, 0.0, 0.0, 0.0, 2.514, 0.269, 0.0, 0.732, 0.0, 0.0, 0.266, 0.0, 0.0, 0.0, 0.0, 0.002, 3.418, 0.669, 0.0, 0.0, 0.0, 0.173, 2.383, 0.568, 0.001, 0.438, 0.0, 0.0, 1.2, 0.0, 0.293, 0.0, 0.0, 0.0, 1.036, 0.0, 0.291, 0.0, 0.382, 0.0, 0.0, 0.0, 0.0, 0.0, 1.425, 0.0, 0.433, 0.0, 1.218, 0.0, 0.0, 0.0, 0.0, 3.128, 0.012, 0.044, 1.784, 0.0, 0.524, 0.0, 0.0, 0.473, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.705, 0.0, 0.0, 0.839, 0.0, 0.0, 0.328, 0.0, 2.482, 0.0, 0.0, 0.45, 0.0, 0.0, 0.0, 0.65, 0.252, 0.0, 2.073, 0.0, 0.0, 0.329, 0.365, 0.0, 0.0, 3.07, 0.267, 0.0, 0.0, 0.0, 0.0, 0.805, 0.0, 0.196, 0.776, 0.0, 0.0, 0.0, 0.452, 2.41, 0.533, 0.038, 0.548, 0.0, 0.215, 0.0, 0.413, 0.0, 0.0, 0.0, 0.0, 0.256, 0.0, 0.604, 0.0, 0.0, 0.0, 0.0, 1.634, 0.401, 0.46, 0.317, 0.0, 0.0, 0.0, 1.351, 0.0, 0.0, 0.857, 0.008, 0.028, 0.408, 0.0, 0.0, 0.427, 0.0, 0.

TrainOutput(global_step=1080, training_loss=0.0069765690024252295, metrics={'train_runtime': 395.166, 'train_samples_per_second': 21.814, 'train_steps_per_second': 2.733, 'total_flos': 308635631085096.0, 'train_loss': 0.0069765690024252295, 'epoch': 20.0})

In [None]:
import wandb
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▃▄█▃▆▄▅▄▃▂▂▂▃▂▁▂▂▁▁▁▁
eval/mse,▃▄█▃▆▄▆▄▃▂▂▂▃▂▁▂▂▁▁▁▁
eval/runtime,▃▃▄▂▃▃▁▁▁▃▃▂▁▃▃▂▃▃▃█▃
eval/samples_per_second,▄▅▃▆▅▅▇██▅▅▅█▅▄▅▄▅▅▁▅
eval/steps_per_second,▄▅▃▆▅▅▇██▅▅▅█▅▄▅▄▅▅▁▅
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,██▅▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/loss,0.01842
eval/mse,0.13282
eval/runtime,1.0364
eval/samples_per_second,88.768
eval/steps_per_second,11.578
train/epoch,20.0
train/global_step,1080.0
train/learning_rate,2e-05
train/loss,0.0014
train/total_flos,308635631085096.0


In [None]:
trainer.save_state()
trainer.save_model()

# Inference

In [None]:
from transformers import AutoModelForTokenClassification

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "/content/drive/MyDrive/topics/output"
pause_predictor_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=1).to(device)

In [None]:
for test in test_dataset:
  del test["pause_list"]

In [None]:
len(test_dataset)

93

In [None]:
input = pause_collator(test_dataset)

In [None]:
del input["labels"]

In [None]:
input = input.to(device)

In [None]:
input['input_ids'].size()

torch.Size([93, 109])

In [None]:
# input = pause_collator(test_dataset).to(device)
pt_outputs = pause_predictor_model(**input)

In [None]:
pt_outputs.logits.size()

torch.Size([93, 109, 1])

In [None]:
predict = pt_outputs.logits.squeeze()

torch.Size([93, 109])