In [1]:
!pip install transformers
!pip install evaluate
!pip install sentencepiece 
!pip install rouge_score
!pip install fugashi
!pip install ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 28.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [1]:
import pandas as pd
import numpy as np

from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer, AutoModel
from transformers.keras_callbacks import PushToHubCallback
from datasets import Dataset, DatasetDict
import evaluate

import tensorflow as tf

from huggingface_hub import notebook_login

In [2]:
notebook_login() # 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

Login successful
Your token has been saved to /root/.huggingface/token


In [4]:
!git config --global credential.helper store

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
base_path = '/content/drive/MyDrive/ml_theory_final_project/'
wiki_df = pd.read_pickle(base_path + 'wiki.pkl') 

In [7]:
wiki_df.head()

Unnamed: 0,article,views,text
0,メインページ,362562853,ようこそ\nウィキペディア - ウィキペディア日本語版 - 百科事典目次\n検索資料・ポータ...
1,星野源,10190763,星野 源（ほしの みなもと、1981年1月28日 - ）は、日本の音楽家、俳優、文筆家。埼玉...
2,真田信繁,9602104,真田 信繁（さなだ のぶしげ）は、安土桃山時代から江戸時代初期にかけての武将、大名。真田昌幸...
3,高橋一生,8571666,高橋 一生（たかはし いっせい、英字表記：Issey Takahashi、1980年12月9...
4,君の名は。,7788879,『君の名は。』（きみのなは、英: Your Name.）は、2016年に公開された新海誠監督...


In [6]:
def get_labels(df, split_on="\n\n\n=="):
    """extract summarization label from wikipedia text

    Args:
        df (pd.Series): data containing wiki text
        split_on (str, optional): delimiter to identify end of first paragraph aka the label. Defaults to "\n\n\n==".

    Returns:
        pd.Series: series containing the first paragraph aka the label
    """
    labels = df.str.split(split_on).str[0] # label is first paragraph

    return labels

# targets = get_labels(wiki_df_iqr)

In [7]:
wiki_df['targets'] = get_labels(wiki_df['text'])
df_targets = wiki_df['targets']
wiki_tar_len = df_targets.apply(len)
len_lower = np.percentile(wiki_tar_len, 25)
len_upper = np.percentile(wiki_tar_len, 75)
wiki_df_iqr = wiki_df[(wiki_tar_len >= len_lower) & (wiki_tar_len <= len_upper)].reset_index(drop=True)
# wiki_df_iqr = wiki_df[(wiki_tar_len <= len_upper)].reset_index(drop=True)
# wiki_df_iqr = wiki_df_iqr.sample(n=int(wiki_df_iqr.shape[0]/2))

In [10]:
wiki_df_iqr['text'].apply(len).describe()

count      4251.000000
mean      10379.687838
std       13898.572163
min         149.000000
25%        3103.000000
50%        6086.000000
75%       11897.500000
max      196567.000000
Name: text, dtype: float64

In [11]:
wiki_df_iqr['targets'].apply(len).describe()

count    4251.000000
mean      138.857210
std        36.840765
min        86.000000
25%       107.000000
50%       133.000000
75%       168.000000
max       216.000000
Name: targets, dtype: float64

In [10]:
# create hugging face train val test dataset dict
wiki_dataset = Dataset.from_pandas(wiki_df_iqr)
train_test_val = wiki_dataset.train_test_split(test_size=0.1) # train = 90% of data
test_val = train_test_val['test'].train_test_split(test_size=0.5) # test and val = 5% of data
wiki_datadict = DatasetDict({
    'train': train_test_val['train'],
    'test': test_val['test'],
    'val': test_val['train']
})

wiki_datadict

# TODO: if training doesn't work out well, try filtering dataset to texts and targets that are of a certain length before trainingb

DatasetDict({
    train: Dataset({
        features: ['article', 'views', 'text', 'targets'],
        num_rows: 3825
    })
    test: Dataset({
        features: ['article', 'views', 'text', 'targets'],
        num_rows: 213
    })
    val: Dataset({
        features: ['article', 'views', 'text', 'targets'],
        num_rows: 213
    })
})

In [13]:
model_checkpoint = "google/mt5-small"
# model_checkpoint = "cl-tohoku/bert-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [14]:
max_input_length = 512 # T5 max
max_target_length = 150 # T5 max

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["text"], max_length=max_input_length, truncation=True, padding='max_length'
    )
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["targets"], max_length=max_target_length, truncation=True, padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = wiki_datadict.map(preprocess_function, batched=True)
tokenized_datasets

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'views', 'text', 'targets', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3825
    })
    test: Dataset({
        features: ['article', 'views', 'text', 'targets', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 213
    })
    val: Dataset({
        features: ['article', 'views', 'text', 'targets', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 213
    })
})

In [15]:
# rouge score for evaluation metric
rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/5.86k [00:00<?, ?B/s]

In [16]:
# prepare dataset for tf model training
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# model = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", padding=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    wiki_datadict["train"].column_names
)

Downloading tf_model.h5:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


In [17]:
# convert huggingface tokenized data to tensorflow tensors for model training
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=2,
)
tf_eval_dataset = tokenized_datasets["val"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=2,
)

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 8
num_train_steps = len(tf_train_dataset) * num_train_epochs
model_name = model_checkpoint.split("/")[-1]

optimizer, schedule = create_optimizer(
    init_lr=5.6e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# save model periodically during training and store in huggingface model hub
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-wikipedia-summarization-jp-t5-limitations", tokenizer=tokenizer
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
Cloning https://huggingface.co/bearbearchu/mt5-small-finetuned-wikipedia-summarization-jp-t5-limitations into local empty directory.


In [18]:
model.fit(
    tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback], epochs=8
)

Epoch 1/8

Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file tf_model.h5:   0%|          | 3.33k/1.12G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/bearbearchu/mt5-small-finetuned-wikipedia-summarization-jp-t5-limitations
   96b5293..c3dadfb  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/bearbearchu/mt5-small-finetuned-wikipedia-summarization-jp-t5-limitations
   96b5293..c3dadfb  main -> main



<keras.callbacks.History at 0x7f719e917850>

In [19]:
#TODO: implement rouge scoring for evaluation and test datasets

In [8]:
hub_model_id = "bearbearchu/mt5-small-finetuned-wikipedia-summarization-jp-t5-limitations"
summarizer = pipeline("summarization", model=hub_model_id)

def print_summary(idx):
    text = wiki_datadict["test"][idx]["text"][:300] # google translate maxes at 3900 english characters
    first_par = wiki_datadict["test"][idx]["targets"]
    summary = summarizer(wiki_datadict["test"][idx]["text"])[0]["summary_text"]
    print(f"'>>> Wiki: {text}'")
    print(f"\n'>>> First Paragraph: {first_par}'")
    print(f"\n'>>> Summary: {summary}'")
    text=None
    first_par=None
    summary=None

# print_summary(0)

All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at bearbearchu/mt5-small-finetuned-wikipedia-summarization-jp-t5-limitations.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


In [11]:
print_summary(0)

'>>> Wiki: 佐野 勇斗（さの はやと、1998年3月23日 - ）は、日本の俳優、歌手。愛知県出身。スターダストプロモーション所属。同社の若手男性アーティスト集団EBiDANのEBiDAN39メンバーであり、4人組ボーカルダンスユニットM!LKのメンバーである。


== 略歴 ==
第25回ジュノン・スーパーボーイ・コンテストで、スターダストの関係者の目にとまりスカウトされる。。2014年にM!LKのメンバーとなり、2015年3月にシングル「コーヒーが飲めません」でCDデビュー。同年公開の映画『くちびるに歌を』で俳優デビューした。大学受験のために半年間M!LKの活動を休業し、2016年に大学に進学。愛知'

'>>> First Paragraph: 佐野 勇斗（さの はやと、1998年3月23日 - ）は、日本の俳優、歌手。愛知県出身。スターダストプロモーション所属。同社の若手男性アーティスト集団EBiDANのEBiDAN39メンバーであり、4人組ボーカルダンスユニットM!LKのメンバーである。'

'>>> Summary: 佐野 勇斗(さの はやと、1998年3月23日'
