In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install all needed libraries
!pip install transformers
!pip install datasets
!pip install pypinyin
!pip install pkuseg
!pip install evaluate
!pip install difflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 90.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.0 

In [3]:
# import libraries for transformers, segmentation, pinyin and etc...
from transformers import BertTokenizer, BartForConditionalGeneration, GPT2LMHeadModel, TextGenerationPipeline
import pkuseg
from pypinyin import Style, lazy_pinyin, pinyin
import string
import numpy as np
import torch
import math
import random
import evaluate 
import os

In [4]:
# connect to cuda if available
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# set the tokenizer and model using pretrained model from huggingface
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")
model = BartForConditionalGeneration.from_pretrained("fnlp/bart-base-chinese")
seg = pkuseg.pkuseg()


Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Downloading pytorch_model.bin:   0%|          | 0.00/570M [00:00<?, ?B/s]

In [5]:
# Get the alphabet of capital letters
alphabet = string.ascii_letters[26:]

# The punctuation list
PUNCS = set([",", ".", "?", "!", ":", "，", "。", "？", "！", "："])

# The yunmu dictionary for rhyming: all yunmus in the same number group rhyme.
yunmus= {
            "0":["a", "ia", "ua", "va", "üa"],
            "1":["e", "o", "uo", "ie", "ue", "üe", "ve"],
            "2":["u"],
            "3":["i", "ü", "v"],
            "4":["ai", "uai"],
            "5":["ao", "iao"],
            "6":["ou", "iu", "iou"],
            "7":["an", "ian", "uan", "üan", "van"],
            "8":["en", "in", "un", "ün", "vn"],
            "9":["ang", "iang", "uang"],
            "10":["eng", "ing", "ueng", "ong", "iong"],
            "11":["er"],
            "12":["ei", "ui", "uei", "vei"],
           }

yun2id = {}
for yid, yws in yunmus.items():
    for w in yws:
        yun2id[w] = yid


In [6]:
# adding special mask tokens for each rhyming group 
num_added_toks = tokenizer.add_tokens(['[A]'])
num_added_toks = tokenizer.add_tokens(['[B]'])
num_added_toks = tokenizer.add_tokens(['[C]'])
num_added_toks = tokenizer.add_tokens(['[D]'])
num_added_toks = tokenizer.add_tokens(['[E]'])
num_added_toks = tokenizer.add_tokens(['[F]'])
num_added_toks = tokenizer.add_tokens(['[G]'])
num_added_toks = tokenizer.add_tokens(['[H]'])
num_added_toks = tokenizer.add_tokens(['[I]'])
num_added_toks = tokenizer.add_tokens(['[J]'])
num_added_toks = tokenizer.add_tokens(['[K]'])
num_added_toks = tokenizer.add_tokens(['[L]'])
num_added_toks = tokenizer.add_tokens(['[M]'])
model.resize_token_embeddings(len(tokenizer)) 

Embedding(21141, 768)

In [7]:
# Return the yunmu of the last word in the input text
def yunmu_name(text):
    w = text[-1]
    if w in PUNCS and len(text) > 1:
        w = text[-2]
    yunmu = lazy_pinyin(w, style=Style.FINALS)
    yunmu = yunmu[0]
    return yunmu

# Return the corresponding yunmu_id of the yunmu
def yunmu_id(yunmu):
    if yunmu in yun2id:
        id = yun2id[yunmu]
    else:
        id = '-1'
    return id

# Return the corresponding schema of the sentences
def sents_to_schema(sents):
    m = 0
    all_id = []
    schema = ''
    for sent in sents:
        y_id = int(yunmu_id(yunmu_name(sent)))
        if y_id in all_id:
            schema += schema[all_id.index(y_id)]
        else:
            schema += alphabet[m]
            m += 1
        all_id.append(y_id)
    return schema

# clean the inputs, get rid of the spaces and punctuations and put the text into sentences in a list
def clean_text(text):
    text = text.replace(" ", "")
    for punc in PUNCS:
        text = text.replace(punc, " ")
    sents = text.split()
    return sents


def encode_text(text):
    sents = clean_text(text)
    schema = sents_to_schema(sents)
    new_sents = []
    for i in range(len(sents)):
        ori_text = sents[i]
        segmentation = seg.cut(ori_text)
        mask_word = segmentation[-1]
        new_sents.append(ori_text[:len(ori_text)-len(mask_word)] + '[' + schema[i] + ']')
    new_text = '，'.join(new_sents)
    return new_text


In [8]:
# Evaluation model for perplexity
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
eval_tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-lyric")
eval_model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-lyric")

def eval_ppl(text):
    encodings = eval_tokenizer(text, return_tensors="pt")
    max_length = eval_model.config.n_positions
    length = encodings.input_ids.size(1)

    trg_len = length
    input_ids = encodings.input_ids[:, length - max_length:]
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = eval_model(input_ids, labels=target_ids)
        neg_log_likelihood = outputs[0] * trg_len

    ppl = torch.exp(neg_log_likelihood / length)
    return ppl

Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/253 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/401M [00:00<?, ?B/s]

In [9]:
data_path = r'/content/drive/MyDrive/MSc_Project/dataset/train_eval_test'

train_text = []
train_label = []
with open(os.path.join(data_path, 'train_data.txt'), 'r') as td:
    for line in td:
        line = line.rstrip()
        train_label.append(line)
        train_text.append(encode_text(line))

eval_text = []
eval_label = []
with open(os.path.join(data_path, 'eval_data.txt'), 'r') as ed:
    for line in ed:
        line = line.rstrip()
        eval_label.append(line)
        eval_text.append(encode_text(line))

test_text = []
test_label = []
with open(os.path.join(data_path, 'test_data.txt'), 'r') as ttd:
    for line in ttd:
        line = line.rstrip()
        test_label.append(line)
        test_text.append(encode_text(line))


'''
with open('/content/drive/MyDrive/MSc_Project/dataset/lyric.txt', 'r') as fp:
    text = fp.read().split('\n')

for i in range(len(text)):
    text[i] = encode_text(text[i])
'''

"\nwith open('/content/drive/MyDrive/MSc_Project/dataset/lyric.txt', 'r') as fp:\n    text = fp.read().split('\n')\n\nfor i in range(len(text)):\n    text[i] = encode_text(text[i])\n"

In [10]:
inputs = tokenizer(train_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
eval_inputs = tokenizer(eval_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
test_inputs = tokenizer(test_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

labels = tokenizer(train_label, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
eval_labels = tokenizer(eval_label, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
test_labels = tokenizer(test_label, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

In [11]:
inputs['labels'] = labels.input_ids.detach().clone()
eval_inputs['labels'] = eval_labels.input_ids.detach().clone()
test_inputs['labels'] = test_labels.input_ids.detach().clone()

#inputs.keys()

In [12]:
class LyricsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [13]:
train_dataset = LyricsDataset(inputs)
eval_dataset = LyricsDataset(eval_inputs)
test_dataset = LyricsDataset(test_inputs)

In [14]:
# move our model to the selected device
model.to(device)
# activate training mode
model.train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(21141, 768)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(21141, 768)
      (embed_positions): BartLearnedPositionalEmbedding(514, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     

In [15]:
from transformers import AdamW
# initialize optimizer
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [16]:
from datetime import datetime

# datetime object containing current date and time
time_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

#args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", per_device_train_batch_size=4)

args = Seq2SeqTrainingArguments(
    output_dir = f"/content/drive/MyDrive/MSc_Project/args/Method_2/{time_stamp}",
    do_predict = True,
    #per_device_train_batch_size = 8,
    num_train_epochs = 5,
    evaluation_strategy ='steps',
    eval_steps = 300,
    save_strategy ='steps',
    save_steps = 300,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    load_best_model_at_end = True,
    predict_with_generate = True
)

In [18]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #compute_metrics=accuracy_metric,
)

In [None]:
train_result = trainer.train()
trainer.save_model()
metrics = train_result.metrics

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


***** Running training *****
  Num examples = 38102
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 23815
  """
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss
300,No log,0.644915
600,0.718800,0.626337
900,0.718800,0.62539
1200,0.649200,0.606662
1500,0.643100,0.600586
1800,0.643100,0.593948
2100,0.624100,0.589224
2400,0.624100,0.583807
2700,0.621200,0.578082
3000,0.608600,0.57646


***** Running Evaluation *****
  Num examples = 9525
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
Saving model checkpoint to /content/drive/MyDrive/MSc_Project/args/Method_2/2022-08-31 14:10:24/checkpoint-300
Configuration saved in /content/drive/MyDrive/MSc_Project/args/Method_2/2022-08-31 14:10:24/checkpoint-300/config.json
Model weights saved in /content/drive/MyDrive/MSc_Project/args/Method_2/2022-08-31 14:10:24/checkpoint-300/pytorch_model.bin
  """
***** Running Evaluation *****
  Num examples = 9525
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGener

In [None]:
metrics = trainer.evaluate()
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
import difflib
text = '冷咖啡离开了杯垫，我忍住的情绪在很后面，决定一个人走，我没有后路可退'
new_text = encode_text(text)


In [None]:
new_text

'冷咖啡离开了[MASK]，我忍住的情绪在很[MASK]，决定一个人[MASK]，我没有后路可[MASK][SEP]AABC'

In [None]:
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")
import os
PATH = r'/content/drive/MyDrive/MSc_Project/args/Method_1/2022-08-30 20:13:01/checkpoint-4500'
#tokenizer = BertTokenizer.from_pretrained(os.path.join(PATH, 'vocab.txt'), local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(os.path.join(PATH, 'pytorch_model.bin'),config=os.path.join(PATH, 'config.json'), local_files_only=True)

batch = tokenizer(new_text, return_tensors="pt")
generated_ids = model.generate(batch["input_ids"], num_return_sequences = 50, max_length = 150, do_sample = True, temperature = 1.0, output_scores = True, return_dict_in_generate=True)
result = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
result = [s.replace(" ", "") for s in result]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'BertTokenizer'.


OSError: ignored

In [None]:
list_of_rhyme = ['AAAAAAAAAA', 'AABBCCDDEE', 'ABABABABAB', 'ABCDEABCDE', 'ABCDEFGHIJ']

def stat_metric(rhyme_metric):
  avg = sum(rhyme_metric)/len(rhyme_metric)
  maximum = max(rhyme_metric)
  upper = sum(i >= 0.7 for i in rhyme_metric)
  middle = sum(i >= 0.5 for i in rhyme_metric)
  return [avg, maximum, upper, middle]
#round(avg,3)

def rhyme_stat(list_text, schema):
  rhyme_metric = []
  for i in range(len(result)):
    new_schema = sents_to_schema(clean_text(result[i]))
    new_schema = new_schema[:len(schema)]
    temp = difflib.SequenceMatcher(None,schema,new_schema).ratio()
    rhyme_metric.append(temp)
    ppl = eval_ppl(result[i])
  return stat_metric(rhyme_metric).append(ppl)

In [None]:
#EXAMPLE 1:
EX_1 = []
for schema in list_of_rhyme:
  new_text = new_text.replace(new_text[-10:],schema)
  batch = tokenizer(new_text, return_tensors="pt")
  generated_ids = model.generate(batch["input_ids"], num_return_sequences = 50, max_length = 200, do_sample = True, temperature = 1.0, output_scores = True, return_dict_in_generate=True)
  result = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
  result = [s.replace(" ", "") for s in result]
  EX_1.append(rhyme_stat(result, schema))

In [None]:
EX_1

[[0.16873684210526307, 0.5263157894736842, 0, 1],
 [0.5437894736842106, 0.7368421052631579, 1, 45],
 [0.5209473684210528, 0.631578947368421, 0, 36],
 [0.5502105263157895, 0.7, 6, 49],
 [0.42915789473684185, 0.6, 0, 11]]

In [None]:
#EXAMPLE 2:
EX_2 = []
for schema in list_of_rhyme:
  new_text = new_text.replace(new_text[-10:],schema)
  batch = tokenizer(new_text, return_tensors="pt")
  generated_ids = model.generate(batch["input_ids"], num_return_sequences = 50, max_length = 200, do_sample = True, temperature = 1.0, output_scores = True, return_dict_in_generate=True)
  result = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
  result = [s.replace(" ", "") for s in result]
  EX_2.append(rhyme_stat(result, schema))

In [None]:
EX_2

[[0.15210526315789463, 0.6, 0, 3],
 [0.5451578947368423, 0.7, 2, 46],
 [0.5357894736842108, 0.631578947368421, 0, 38],
 [0.5456842105263159, 0.7368421052631579, 4, 49],
 [0.4354736842105261, 0.7, 1, 16]]

In [None]:
#EXAMPLE 3:
EX_3 = []
for schema in list_of_rhyme:
  new_text = new_text.replace(new_text[-10:],schema)
  batch = tokenizer(new_text, return_tensors="pt")
  generated_ids = model.generate(batch["input_ids"], num_return_sequences = 50, max_length = 200, do_sample = True, temperature = 1.0, output_scores = True, return_dict_in_generate=True)
  result = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
  result = [s.replace(" ", "") for s in result]
  EX_3.append(rhyme_stat(result, schema))

In [None]:
EX_3

[[0.14599999999999994, 0.631578947368421, 0, 3],
 [0.5370526315789474, 0.7368421052631579, 4, 43],
 [0.5431578947368423, 0.7, 1, 40],
 [0.5506315789473684, 0.7, 7, 47],
 [0.4272631578947369, 0.6, 0, 11]]

In [None]:
import numpy as np
   
# data rows of csv file 
rows = [ ['Nikhil', 'COE', '2', '9.0'], 
         ['Sanchit', 'COE', '2', '9.1'], 
         ['Aditya', 'IT', '2', '9.3'], 
         ['Sagar', 'SE', '1', '9.5'], 
         ['Prateek', 'MCE', '3', '7.8'], 
         ['Sahil', 'EP', '2', '9.1']] 
  
# using the savetxt 
# from the numpy module
np.savetxt(os.path.join(PATH, 'simple_test_result.csv'), 
           rows,
           delimiter =", ", 
           fmt ='% s')

In [None]:
EXAMPLE 1: 0.580421052631579  0.7
EXAMPLE 4: 0.5808658008658013 0.6666666666666666
EXAMPLE 1gai: 0.502 0.7 'AABBCCDDEE'
EXAMPLE 1gai: 0.502 0.6 'ABABABABAB'
EXAMPLE 1gai: 0.412 0.7 'AAAAAAAAAA' 2/17
EXAMPLE 1gai: 0.488 0.7 'AABBCCDDEE' 3/32
EXAMPLE 1gai: 0.507 0.7 'AABBAABBAA' 2/36
EXAMPLE 1gai: 0.461 0.7 'ABCDEFGHIJ' 1/22

In [None]:
'AABBCCDDEE' 'ABABCDCDEE'

In [None]:
'''
TEXT 1: 
text = '最美的不是下雨天，是曾与你躲过雨的屋檐，泪流过的脸，是我记忆模糊一片，如果爱的路上没有了你，会不会有爱的晴天，如果没有你，我会在何处安身，哪怕一直走到了最后一个人，如果没有你'
list_of_rhyme = ['AAAAAAAAAA', 'AABBCCDDEE', 'ABABABABAB', 'ABCDEABCDE', 'ABCDEFGHIJ']

EX_1 = [[0.3747368421052632, 0.7, 1, 15],
 [0.5046315789473685, 0.7, 3, 36],
 [0.5144210526315791, 0.6, 0, 39],
 [0.5249473684210527, 0.7, 2, 42],
 [0.45799999999999985, 0.7, 1, 22]]

EX_2 = [[0.40988888888888886, 0.7, 1, 18],
 [0.4885263157894737, 0.7, 1, 37],
 [0.5180000000000001, 0.6, 0, 37],
 [0.522, 0.7, 1, 41],
 [0.4624999999999999, 0.7, 1, 25]]

 EX_3 = [[0.39, 0.7, 1, 14],
 [0.4764210526315791, 0.7, 1, 27],
 [0.5224210526315789, 0.6, 0, 39],
 [0.5199999999999999, 0.7, 4, 38],
 [0.45042105263157883, 0.7, 1, 21]]
'''

In [None]:
'''
TEXT 2:
text = '冷咖啡离开了杯垫，我忍住的情绪在很后面，决定一个人走，我没有后路可退，只能承受悲哀来陪，我的痛谁会在意，如果相遇是个意外，我愿意放手让你走，我不想让自己在沉醉，在悲伤中无法自拔'

EX_1 = [[0.16873684210526307, 0.5263157894736842, 0, 1],
 [0.5437894736842106, 0.7368421052631579, 1, 45],
 [0.5209473684210528, 0.631578947368421, 0, 36],
 [0.5502105263157895, 0.7, 6, 49],
 [0.42915789473684185, 0.6, 0, 11]]

EX_2 = [[0.15210526315789463, 0.6, 0, 3],
 [0.5451578947368423, 0.7, 2, 46],
 [0.5357894736842108, 0.631578947368421, 0, 38],
 [0.5456842105263159, 0.7368421052631579, 4, 49],
 [0.4354736842105261, 0.7, 1, 16]]

EX_3 = [[0.14599999999999994, 0.631578947368421, 0, 3],
 [0.5370526315789474, 0.7368421052631579, 4, 43],
 [0.5431578947368423, 0.7, 1, 40],
 [0.5506315789473684, 0.7, 7, 47],
 [0.4272631578947369, 0.6, 0, 11]]

'''