In [1]:
# Colab only: mount Google Drive at /content/drive so that the project
# directory referenced by ROOT_PATH (defined in the next cell) is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Google Drive. The value ends with a path
# separator; later cells build data and model paths from it.
ROOT_PATH = "/content/drive/My Drive/codes/my_github/RelationExtractor/"

In [3]:
!pip install -qU torch==1.7.1 torchtext==0.8.0 torchvision==0.8.2
!pip install -q transformers==4.4.2 pytorch_lightning==1.2.1 sentencepiece

[K     |████████████████████████████████| 776.8 MB 15 kB/s 
[K     |████████████████████████████████| 6.9 MB 32.6 MB/s 
[K     |████████████████████████████████| 12.8 MB 30.0 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 0.12.1+cu113 requires torch==1.12.1, but you have torch 1.7.1 which is incompatible.[0m
[K     |████████████████████████████████| 2.0 MB 4.1 MB/s 
[K     |████████████████████████████████| 814 kB 50.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 55.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 56.5 MB/s 
[K     |████████████████████████████████| 880 kB 83.3 MB/s 
[K     |████████████████████████████████| 596 kB 77.4 MB/s 
[K     |████████████████████████████████| 829 kB 76.3 MB/s 
[K     |████████████████████████████████| 141 kB 84.4 MB/s 
[?25h  Building wheel for future (setup.py) ... 

In [5]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

# Random seed configuration
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator.

    When CUDA is available, the generators of all CUDA devices are seeded too.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [6]:
# Name of the pre-trained model to start from.
PRETRAINED_MODEL_NAME = "sonoisa/t5-base-japanese"

# Directory where the fine-tuned model is written / read.
# os.path.join is used so the result does not depend on whether ROOT_PATH ends
# with a separator; the trailing "" keeps the final path separator.
MODEL_DIR = os.path.join(ROOT_PATH, "model", "result_t5", "")

In [11]:
# Whether a CUDA device is available in this runtime.
USE_GPU = torch.cuda.is_available()

# Hyper-parameters shared by the dataset, the LightningModule and the Trainer.
args_dict = dict(
    # Dataset directory. Built with os.path.join so that no doubled separator
    # appears when ROOT_PATH already ends with "/".
    data_dir=os.path.join(ROOT_PATH, "mldata", "profgen"),
    model_name_or_path=PRETRAINED_MODEL_NAME,
    tokenizer_name_or_path=PRETRAINED_MODEL_NAME,

    # Optimizer / scheduler settings (read in T5FineTuner.configure_optimizers).
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    gradient_accumulation_steps=1,

    # Trainer settings (forwarded to pl.Trainer through `train_params`).
    n_gpu=1 if USE_GPU else 0,
    early_stop_callback=False,  # NOTE(review): not read by any visible cell
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0,
    seed=42,  # NOTE(review): not read by any visible cell; set_seed(42) is called separately
)

In [12]:
class ProfDataset(Dataset):
    """Parallel source/target text dataset, tokenized eagerly at construction.

    The files ``{data_dir}/{type_path}.src-tgt.src`` and
    ``{data_dir}/{type_path}.src-tgt.tgt`` are read in parallel, one example
    per line. Every kept pair is tokenized immediately and held in memory.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a T5Tokenizer).
        data_dir: Directory containing the ``.src`` / ``.tgt`` files.
        type_path: Split name used as the file stem (e.g. ``"train"``).
        input_min_len: Pairs whose target line has at most this many
            characters are skipped.
        input_max_len: Token length that source texts are padded/truncated to.
        target_max_len: Token length that target texts are padded/truncated to.
    """

    def __init__(self, tokenizer, data_dir, type_path, input_min_len=20,input_max_len=512, target_max_len=512):
        self.file_path = f"{data_dir}/{type_path}.src-tgt"

        self.input_min_len = input_min_len
        self.input_max_len = input_max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Return the number of examples kept after filtering."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the token ids and attention masks of one example.

        Each tensor is stored with a leading batch dimension of size 1;
        ``squeeze(0)`` removes only that dimension, so a sequence length of 1
        still yields a 1-D tensor.
        """
        source = self.inputs[index]
        target = self.targets[index]

        return {"source_ids": source["input_ids"].squeeze(0),
                "source_mask": source["attention_mask"].squeeze(0),
                "target_ids": target["input_ids"].squeeze(0),
                "target_mask": target["attention_mask"].squeeze(0)}

    def _make_record(self, title, body, genre_id):
        """Convert raw fields into an ``(input, target)`` text pair.

        ``body`` becomes the input text and ``title`` the target text;
        ``genre_id`` is accepted but unused. This helper is not called by
        ``_build``.
        """
        source_text = f"{body}"
        target_text = f"{title}"
        return source_text, target_text

    def _build(self):
        """Read the source/target files and tokenize every kept line pair."""
        source_path = self.file_path + ".src"
        target_path = self.file_path + ".tgt"
        print(source_path)
        print(target_path)

        # Context managers guarantee both files are closed, also on error.
        with open(source_path, encoding="utf-8") as source_file, \
                open(target_path, encoding="utf-8") as target_file:
            # zip stops at the shorter file; surplus lines are ignored.
            for source_line, target_line in zip(source_file, target_file):
                # NOTE(review): the filter is applied to the *target* line
                # (its length includes the trailing newline) although the
                # parameter is named input_min_len - confirm this is intended.
                if len(target_line) <= self.input_min_len:
                    continue

                tokenized_inputs = self.tokenizer.batch_encode_plus(
                    [source_line], max_length=self.input_max_len, truncation=True,
                    padding="max_length", return_tensors="pt"
                )

                tokenized_targets = self.tokenizer.batch_encode_plus(
                    [target_line], max_length=self.target_max_len, truncation=True,
                    padding="max_length", return_tensors="pt"
                )

                self.inputs.append(tokenized_inputs)
                self.targets.append(tokenized_targets)

In [13]:
# Load the tokenizer (SentencePiece) model of the pre-trained checkpoint.
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME, is_fast=True)

# Build the "train" split once up front; this prints the source/target file
# paths that are read.
train_dataset = ProfDataset(
    tokenizer,
    args_dict["data_dir"],
    "train",
    input_min_len=20,
    input_max_len=32,
    target_max_len=128,
)

/content/drive/My Drive/codes/my_github/RelationExtractor//mldata/profgen/train.src-tgt.src
/content/drive/My Drive/codes/my_github/RelationExtractor//mldata/profgen/train.src-tgt.tgt


In [15]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 conditional-generation model.

    ``hparams`` is a namespace-like object. Attributes read by this class:
    ``model_name_or_path``, ``tokenizer_name_or_path``, ``learning_rate``,
    ``weight_decay``, ``adam_epsilon``, ``warmup_steps``,
    ``gradient_accumulation_steps``, ``n_gpu``, ``data_dir``,
    ``max_input_length``, ``max_target_length``, ``train_batch_size``,
    ``eval_batch_size`` and ``num_train_epochs``.
    """

    def __init__(self, hparams):
        super().__init__()
        # Register the hyper-parameters through Lightning's public API instead
        # of assigning to ``self.hparams``; they stay reachable as
        # ``self.hparams.<name>``.
        self.save_hyperparameters(hparams)

        # Load the pre-trained model.
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)

        # Load the tokenizer.
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path, is_fast=True)

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None,
                decoder_attention_mask=None, labels=None):
        """Forward pass: delegate every argument to the wrapped T5 model."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )

    def _step(self, batch):
        """Compute the loss for one batch.

        ``batch`` is a dict with the keys ``source_ids``, ``source_mask``,
        ``target_ids`` and ``target_mask`` as produced by ``ProfDataset``.
        """
        # Work on a copy so the caller's batch tensor is not overwritten.
        labels = batch["target_ids"].clone()

        # All labels set to -100 are ignored (masked),
        # the loss is only computed for labels in [0, ..., config.vocab_size]
        labels[labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            labels=labels
        )

        # The first element of the model output is used as the loss.
        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        """Training step: log and return the batch loss."""
        loss = self._step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Validation step: log and return the batch loss."""
        loss = self._step(batch)
        self.log("val_loss", loss)
        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        """Test step: log and return the batch loss."""
        loss = self._step(batch)
        self.log("test_loss", loss)
        return {"test_loss": loss}

    def configure_optimizers(self):
        """Create the AdamW optimizer and the linear warmup/decay scheduler.

        Relies on ``self.t_total``, which ``setup`` computes for the fit stage.
        """
        model = self.model
        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters()
                            if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters()
                            if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.learning_rate,
                          eps=self.hparams.adam_epsilon)
        self.optimizer = optimizer

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.t_total
        )
        self.scheduler = scheduler

        # The scheduler is stepped after every optimizer step.
        return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]

    def get_dataset(self, tokenizer, type_path, args):
        """Build the ``ProfDataset`` for the split named ``type_path``."""
        return ProfDataset(
            tokenizer=tokenizer,
            data_dir=args.data_dir,
            type_path=type_path,
            input_max_len=args.max_input_length,
            target_max_len=args.max_target_length)

    def setup(self, stage=None):
        """Load the train/valid datasets and compute the total step count."""
        if stage == 'fit' or stage is None:
            train_dataset = self.get_dataset(tokenizer=self.tokenizer,
                                             type_path="train", args=self.hparams)
            self.train_dataset = train_dataset

            val_dataset = self.get_dataset(tokenizer=self.tokenizer,
                                           type_path="valid", args=self.hparams)
            self.val_dataset = val_dataset

            # Number of optimizer steps per epoch (incomplete batches are
            # dropped by the training loader), then over all epochs.
            effective_batch_size = self.hparams.train_batch_size * max(1, self.hparams.n_gpu)
            steps_per_epoch = (
                (len(train_dataset) // effective_batch_size)
                // self.hparams.gradient_accumulation_steps
            )
            # Stored as an int: it is a step count handed to the scheduler.
            self.t_total = int(steps_per_epoch * self.hparams.num_train_epochs)

    def train_dataloader(self):
        """Create the shuffled training data loader (incomplete last batch dropped)."""
        return DataLoader(self.train_dataset,
                          batch_size=self.hparams.train_batch_size,
                          drop_last=True, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """Create the validation data loader."""
        return DataLoader(self.val_dataset,
                          batch_size=self.hparams.eval_batch_size,
                          num_workers=4)

In [16]:
# Hyper-parameters specific to this training run.
args_dict.update(
    max_input_length=32,    # maximum number of tokens in an input text
    max_target_length=128,  # maximum number of tokens in an output text
    train_batch_size=8,     # batch size during training
    eval_batch_size=8,      # batch size during evaluation
    num_train_epochs=8,     # number of training epochs
)
args = argparse.Namespace(**args_dict)

# Keyword arguments handed to pl.Trainer.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
}

In [None]:
# Run fine-tuning (roughly 10 minutes per epoch when a GPU is used).
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params)
trainer.fit(model)

# Save the tokenizer and the model weights of the final epoch to MODEL_DIR.
for artifact in (model.tokenizer, model.model):
    artifact.save_pretrained(MODEL_DIR)


Downloading:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: None, using: 0 TPU cores
INFO:lightning:TPU available: None, using: 0 TPU cores


/content/drive/My Drive/codes/my_github/RelationExtractor//mldata/profgen/train.src-tgt.src
/content/drive/My Drive/codes/my_github/RelationExtractor//mldata/profgen/train.src-tgt.tgt
/content/drive/My Drive/codes/my_github/RelationExtractor//mldata/profgen/valid.src-tgt.src
/content/drive/My Drive/codes/my_github/RelationExtractor//mldata/profgen/valid.src-tgt.tgt



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
INFO:lightning:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:

# Directory of the fine-tuned model. Built with os.path.join so it is spelled
# the same way regardless of whether ROOT_PATH ends with a separator (no
# doubled "/"); the trailing "" keeps the final path separator.
MODEL_DIR = os.path.join(ROOT_PATH, "model", "result_t5", "")

# Load the tokenizer (SentencePiece) model.
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME, is_fast=True)

# Load the fine-tuned model.
trained_model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)

# Move the model to the GPU when one is available.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    trained_model.cuda()

In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

def get_output(args, args_dict, tokenizer, trained_model, file_type):
    """Generate text for every example of one data split.

    Args:
        args: Namespace providing ``max_input_length`` and
            ``max_target_length``; ``eval_batch_size`` is used as the batch
            size when present (8 otherwise).
        args_dict: Dict providing the dataset directory under ``"data_dir"``.
        tokenizer: Tokenizer used to encode the data and decode token ids.
        trained_model: Model exposing ``eval()``, ``parameters()`` and
            ``generate()``.
        file_type: Split name handed to ``ProfDataset`` (e.g. ``"test"``).

    Returns:
        Tuple ``(inputs, outputs, targets)`` of equally long lists holding the
        decoded source texts, generated texts and reference texts.
    """
    # Load the requested split.
    test_dataset = ProfDataset(tokenizer, args_dict["data_dir"], file_type,
                               input_max_len=args.max_input_length,
                               target_max_len=args.max_target_length)

    test_loader = DataLoader(test_dataset,
                             batch_size=getattr(args, "eval_batch_size", 8),
                             num_workers=4)

    trained_model.eval()

    # Feed inputs on the device the model actually lives on, instead of
    # relying on a global flag that may be out of sync with the model.
    device = next(trained_model.parameters()).device

    def _decode_all(id_rows):
        """Decode each row of token ids to text, dropping special tokens."""
        return [tokenizer.decode(ids, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for ids in id_rows]

    inputs = []
    outputs = []
    targets = []

    for batch in tqdm(test_loader):
        output = trained_model.generate(
            input_ids=batch['source_ids'].to(device),
            attention_mask=batch['source_mask'].to(device),
            max_length=args.max_target_length,
            repetition_penalty=10.0,   # penalty against repeating the same sentence (mode collapse)
        )

        inputs.extend(_decode_all(batch['source_ids']))
        outputs.extend(_decode_all(output))
        targets.extend(_decode_all(batch["target_ids"]))
    return inputs, outputs, targets

In [None]:
# Print source text, generated text and reference text for the "test" split.
# Loop variables are named so that the builtin `input` is not rebound in the
# notebook namespace.
for source_text, generated_text, reference_text in zip(
        *get_output(args, args_dict, tokenizer, trained_model, "test")):
    print("title:     " + source_text)
    print("generated: " + generated_text)
    print("actual:    " + reference_text)
    print()

In [None]:
# Print source text and generated text for the "eval" split; the reference
# text is not printed for this split.
# Loop variables are named so that the builtin `input` is not rebound in the
# notebook namespace.
for source_text, generated_text, _reference_text in zip(
        *get_output(args, args_dict, tokenizer, trained_model, "eval")):
    print("title:     " + source_text)
    print("generated: " + generated_text)
    print()

In [None]:
# Print source text and generated text for the "random" split; the reference
# text is not printed for this split.
# Loop variables are named so that the builtin `input` is not rebound in the
# notebook namespace.
for source_text, generated_text, _reference_text in zip(
        *get_output(args, args_dict, tokenizer, trained_model, "random")):
    print("title:     " + source_text)
    print("generated: " + generated_text)
    print()