In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install all needed libraries
!pip install transformers
!pip install datasets
!pip install pypinyin
!pip install pkuseg
!pip install evaluate
!pip install difflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 14.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 78.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 84.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 15.2 MB/s

In [3]:
# import libraries for transformers, segmentation, pinyin and etc...
from transformers import BertTokenizer, BartForConditionalGeneration, GPT2LMHeadModel, TextGenerationPipeline
import pkuseg
from pypinyin import Style, lazy_pinyin, pinyin
import string
import numpy as np
import torch
import math
import random
import evaluate 
import os

In [4]:
# connect to cuda if available
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# set the tokenizer and model using pretrained model from huggingface
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")
model = BartForConditionalGeneration.from_pretrained("fnlp/bart-base-chinese")
seg = pkuseg.pkuseg()


Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Downloading pytorch_model.bin:   0%|          | 0.00/570M [00:00<?, ?B/s]

In [5]:
# Get the alphabet of capital letters
alphabet = string.ascii_letters[26:]

# The punctuation list
PUNCS = set([",", ".", "?", "!", ":", "，", "。", "？", "！", "："])

# The yunmu dictionary for rhyming: all yunmus in the same number group rhyme.
yunmus= {
            "0":["a", "ia", "ua", "va", "üa"],
            "1":["e", "o", "uo", "ie", "ue", "üe", "ve"],
            "2":["u"],
            "3":["i", "ü", "v"],
            "4":["ai", "uai"],
            "5":["ao", "iao"],
            "6":["ou", "iu", "iou"],
            "7":["an", "ian", "uan", "üan", "van"],
            "8":["en", "in", "un", "ün", "vn"],
            "9":["ang", "iang", "uang"],
            "10":["eng", "ing", "ueng", "ong", "iong"],
            "11":["er"],
            "12":["ei", "ui", "uei", "vei"],
           }

yun2id = {}
for yid, yws in yunmus.items():
    for w in yws:
        yun2id[w] = yid


In [6]:
# adding special mask tokens for each rhyming group 
num_added_toks = tokenizer.add_tokens(['[RHYME]'])
model.resize_token_embeddings(len(tokenizer)) 

Embedding(21129, 768)

In [7]:
# Return the yunmu of the last word in the input text
def yunmu_name(text):
    w = text[-1]
    if w in PUNCS and len(text) > 1:
        w = text[-2]
    yunmu = lazy_pinyin(w, style=Style.FINALS)
    yunmu = yunmu[0]
    return yunmu

# Return the corresponding yunmu_id of the yunmu
def yunmu_id(yunmu):
    if yunmu in yun2id:
        id = yun2id[yunmu]
    else:
        id = '-1'
    return id

# Return the corresponding schema of the sentences
def sents_to_schema(sents):
    m = 0
    all_id = []
    schema = ''
    for sent in sents:
        y_id = int(yunmu_id(yunmu_name(sent)))
        if y_id in all_id:
            schema += schema[all_id.index(y_id)]
        else:
            schema += alphabet[m]
            m += 1
        all_id.append(y_id)
    return schema

# clean the inputs, get rid of the spaces and punctuations and put the text into sentences in a list
def clean_text(text):
    text = text.replace(" ", "")
    for punc in PUNCS:
        text = text.replace(punc, " ")
    sents = text.split()
    return sents

def rhyme_mapping(sents):
    rhys = []
    for sent in sents:
        yunmu = yunmu_name(sent)
        rhys.append(yunmu)
    assert len(rhys) == len(sents)
    rhy_map = {}
    for i, r in enumerate(rhys):
        rid = yunmu_id(r)
        if rid in rhy_map:
            rhy_map[rid] += [i]
        else:
            rhy_map[rid] = [i]
    rhy_list = []
    for rid in rhy_map:
        rhy_list.append(rhy_map[rid])
    return rhy_map, rhy_list

def encode_text(sents, rhy_list):
    new_sents = list(sents)
    for i in rhy_list:
        ori_text = sents[i]
        segmentation = seg.cut(ori_text)
        mask_word = segmentation[-1]
        new_sents[i] = ori_text[:len(ori_text)-len(mask_word)] + '[RHYME]'
    new_text = '，'.join(new_sents)
    return new_text


In [8]:
# Evaluation model for perplexity
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
eval_tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-lyric")
eval_model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-lyric")

def eval_ppl(text):
    encodings = eval_tokenizer(text, return_tensors="pt")
    max_length = eval_model.config.n_positions
    length = encodings.input_ids.size(1)

    trg_len = length
    input_ids = encodings.input_ids[:, length - max_length:]
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = eval_model(input_ids, labels=target_ids)
        neg_log_likelihood = outputs[0] * trg_len

    ppl = torch.exp(neg_log_likelihood / length)
    return ppl

Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/253 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/401M [00:00<?, ?B/s]

In [9]:
data_path = r'/content/drive/MyDrive/MSc_Project/dataset/train_eval_test'

train_text = []
train_label = []
with open(os.path.join(data_path, 'train_data.txt'), 'r') as td:
    for line in td:
        line = line.rstrip()
        sents = clean_text(line)
        _, rhy_list = rhyme_mapping(sents)
        for i in range(len(rhy_list)):
          train_label.append(line)
          train_text.append(encode_text(sents, rhy_list[i]))

eval_text = []
eval_label = []
with open(os.path.join(data_path, 'eval_data.txt'), 'r') as ed:
    for line in ed:
        line = line.rstrip()
        sents = clean_text(line)
        _, rhy_list = rhyme_mapping(sents)
        for i in range(len(rhy_list)):
          eval_label.append(line)
          eval_text.append(encode_text(sents, rhy_list[i]))

test_text = []
test_label = []
with open(os.path.join(data_path, 'test_data.txt'), 'r') as ttd:
    for line in ttd:
        line = line.rstrip()
        sents = clean_text(line)
        _, rhy_list = rhyme_mapping(sents)
        for i in range(len(rhy_list)):
          test_label.append(line)
          test_text.append(encode_text(sents, rhy_list[i]))


'''
with open('/content/drive/MyDrive/MSc_Project/dataset/lyric.txt', 'r') as fp:
    text = fp.read().split('\n')

for i in range(len(text)):
    text[i] = encode_text(text[i])
'''

"\nwith open('/content/drive/MyDrive/MSc_Project/dataset/lyric.txt', 'r') as fp:\n    text = fp.read().split('\n')\n\nfor i in range(len(text)):\n    text[i] = encode_text(text[i])\n"

In [10]:
inputs = tokenizer(train_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
eval_inputs = tokenizer(eval_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
test_inputs = tokenizer(test_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

labels = tokenizer(train_label, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
eval_labels = tokenizer(eval_label, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
test_labels = tokenizer(test_label, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

In [11]:
inputs['labels'] = labels.input_ids.detach().clone()
eval_inputs['labels'] = eval_labels.input_ids.detach().clone()
test_inputs['labels'] = test_labels.input_ids.detach().clone()

#inputs.keys()

In [12]:
class LyricsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [13]:
train_dataset = LyricsDataset(inputs)
eval_dataset = LyricsDataset(eval_inputs)
test_dataset = LyricsDataset(test_inputs)

In [14]:
# move our model to the selected device
model.to(device)
# activate training mode
model.train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(21129, 768)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(21129, 768)
      (embed_positions): BartLearnedPositionalEmbedding(514, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     

In [15]:
from transformers import AdamW
# initialize optimizer
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [16]:
from datetime import datetime

# datetime object containing current date and time
time_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

#args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", per_device_train_batch_size=4)

args = Seq2SeqTrainingArguments(
    output_dir = f"/content/drive/MyDrive/MSc_Project/args/Method_3/{time_stamp}",
    do_predict = True,
    #per_device_train_batch_size = 8,
    num_train_epochs = 5,
    evaluation_strategy ='steps',
    eval_steps = 2000,
    save_strategy ='steps',
    save_steps = 2000,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    load_best_model_at_end = True,
    predict_with_generate = True
)

In [18]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #compute_metrics=accuracy_metric,
)

In [19]:
train_result = trainer.train()
trainer.save_model()
metrics = train_result.metrics

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


***** Running training *****
  Num examples = 196082
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 122555
  """
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss
2000,0.1502,0.146535
4000,0.1384,0.140789
6000,0.1419,0.137112
8000,0.138,0.135239
10000,0.134,0.133233
12000,0.133,0.128825
14000,0.1222,0.127449
16000,0.1192,0.125967
18000,0.1185,0.125418
20000,0.1175,0.123092


***** Running Evaluation *****
  Num examples = 49023
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
Saving model checkpoint to /content/drive/MyDrive/MSc_Project/args/Method_3/2022-09-01 12:50:47/checkpoint-2000
Configuration saved in /content/drive/MyDrive/MSc_Project/args/Method_3/2022-09-01 12:50:47/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/MSc_Project/args/Method_3/2022-09-01 12:50:47/checkpoint-2000/pytorch_model.bin
  """
***** Running Evaluation *****
  Num examples = 49023
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditional

***** train metrics *****
  epoch                    =        5.0
  total_flos               = 69592115GF
  train_loss               =     0.0797
  train_runtime            = 6:45:46.05
  train_samples_per_second =      40.27
  train_steps_per_second   =      5.034


In [20]:
metrics = trainer.evaluate()
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 49023
  Batch size = 8
  """
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.


***** eval metrics *****
  epoch                   =        5.0
  eval_loss               =     0.1182
  eval_runtime            = 0:02:51.47
  eval_samples_per_second =    285.894
  eval_steps_per_second   =     35.738
  perplexity              =     1.1254
