In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# install all needed libraries
!pip install transformers
!pip install datasets
!pip install pypinyin
!pip install pkuseg
!pip install evaluate
!pip install bert_score
!pip install statistics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 57.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 63.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.1 

In [None]:
# import libraries for transformers, segmentation, pinyin and etc...
from transformers import BertTokenizer, BartForConditionalGeneration, GPT2LMHeadModel, TextGenerationPipeline
import pkuseg
from pypinyin import Style, lazy_pinyin, pinyin
import string
import numpy as np
import torch
import math
import random
import evaluate 
import os

In [None]:
# connect to cuda if available
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# set the tokenizer and model using pretrained model from huggingface
# The checkpoint is loaded through BertTokenizer; the cell output shows a warning
# that the checkpoint declares a different tokenizer class ('BartTokenizer').
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")
model = BartForConditionalGeneration.from_pretrained("fnlp/bart-base-chinese")
# Word segmenter used by encode_text to pick the word that gets masked.
seg = pkuseg.pkuseg()


Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Downloading pytorch_model.bin:   0%|          | 0.00/570M [00:00<?, ?B/s]

In [None]:
# Capital letters A-Z, used as labels for rhyme groups in a rhyme schema.
alphabet = string.ascii_uppercase

# Punctuation marks (ASCII and full-width) treated as sentence delimiters.
PUNCS = {",", ".", "?", "!", ":", "，", "。", "？", "！", "："}

# The yunmu (pinyin final) dictionary for rhyming: all yunmus in the same
# numbered group are considered to rhyme with each other.
# Both the abbreviated spellings (iu / ui / un) and the full spellings
# (iou / uei / uen) are listed, because pypinyin's strict mode reports the
# full form for syllables that carry an initial (e.g. "lun" -> "uen").
yunmus = {
    "0": ["a", "ia", "ua", "va", "üa"],
    "1": ["e", "o", "uo", "ie", "ue", "üe", "ve"],
    "2": ["u"],
    "3": ["i", "ü", "v"],
    "4": ["ai", "uai"],
    "5": ["ao", "iao"],
    "6": ["ou", "iu", "iou"],
    "7": ["an", "ian", "uan", "üan", "van"],
    "8": ["en", "in", "un", "uen", "ün", "vn"],
    "9": ["ang", "iang", "uang"],
    "10": ["eng", "ing", "ueng", "ong", "iong"],
    "11": ["er"],
    "12": ["ei", "ui", "uei", "vei"],
}

# Reverse lookup: yunmu spelling -> id (as a string) of its rhyme group.
yun2id = {
    yunmu: group_id
    for group_id, group in yunmus.items()
    for yunmu in group
}


In [None]:
# Return the yunmu of the last word in the input text
def yunmu_name(text):
    """Return the yunmu (pinyin final) of the last character of ``text``.

    If the last character is one of ``PUNCS`` and the text has more than one
    character, the character before it is used instead.

    Args:
        text: A sentence as a string.

    Returns:
        The first entry of ``lazy_pinyin(char, style=Style.FINALS)`` for the
        selected character, or ``''`` when ``text`` is empty.
    """
    # Guard against empty input: indexing text[-1] would raise IndexError.
    if not text:
        return ''
    last_char = text[-1]
    if last_char in PUNCS and len(text) > 1:
        last_char = text[-2]
    return lazy_pinyin(last_char, style=Style.FINALS)[0]

# Return the corresponding yunmu_id of the yunmu
def yunmu_id(yunmu):
    """Look up the rhyme-group id of a yunmu.

    Args:
        yunmu: A pinyin final as a string.

    Returns:
        The group id string from ``yun2id``, or ``'-1'`` when the yunmu is
        not in the table.
    """
    return yun2id.get(yunmu, '-1')

# Return the corresponding schema of the sentences
def sents_to_schema(sents):
    """Build the rhyme schema string for a list of sentences.

    Each sentence is reduced to the rhyme-group id of its final character.
    The first time a group id appears it is assigned the next unused capital
    letter from ``alphabet``; later sentences with the same id reuse that
    letter.

    Args:
        sents: Iterable of sentence strings.

    Returns:
        A string with one capital letter per sentence, e.g. ``'AABB'``.
    """
    letter_for_group = {}
    letters = []
    for sent in sents:
        group = int(yunmu_id(yunmu_name(sent)))
        if group not in letter_for_group:
            # Number of groups seen so far == index of the next free letter.
            letter_for_group[group] = alphabet[len(letter_for_group)]
        letters.append(letter_for_group[group])
    return ''.join(letters)

# clean the inputs, get rid of the spaces and punctuations and put the text into sentences in a list
def clean_text(text):
    """Split raw text into sentences without spaces or punctuation.

    ASCII spaces are dropped, every character in ``PUNCS`` becomes a
    separator, and the text is then split on whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of non-empty sentence strings.
    """
    # One translation pass: punctuation -> separator, plain space -> removed.
    table = {ord(punc): " " for punc in PUNCS}
    table[ord(" ")] = None
    return text.translate(table).split()

def encode_text(text):
    """Mask one randomly chosen word in every sentence of ``text``.

    The text is split with ``clean_text``; each sentence is segmented with
    ``seg.cut`` and exactly one randomly selected segment is replaced by
    ``'[MASK]'``. The masked sentences are joined with a full-width comma.

    Args:
        text: The raw lyric text.

    Returns:
        The masked text as a single string.
    """
    masked_sents = []
    for sent in clean_text(text):
        words = seg.cut(sent)
        if not words:
            # Nothing to mask; keep the sentence instead of crashing in randint.
            masked_sents.append(sent)
            continue
        pick = random.randint(0, len(words) - 1)
        word = words[pick]
        # Character offset of the chosen segment, assuming the segments
        # concatenate back to the sentence.
        start = sum(len(w) for w in words[:pick])
        end = start + len(word)
        if sent[start:end] == word:
            # Mask only the chosen segment; a plain str.replace would mask
            # every occurrence of the same characters in the sentence.
            masked = sent[:start] + '[MASK]' + sent[end:]
        else:
            # Segmentation did not line up with the sentence; fall back to
            # masking the first occurrence only.
            masked = sent.replace(word, '[MASK]', 1)
        masked_sents.append(masked)
    return '，'.join(masked_sents)


In [None]:
data_path = r'/content/drive/MyDrive/MSc_Project/dataset/train_eval_test'


def _load_split(filename):
    """Read one dataset split and build (masked inputs, original labels).

    Every line of the file (trailing whitespace stripped) becomes a label,
    and its masked version from ``encode_text`` becomes the matching input.

    Args:
        filename: File name inside ``data_path``.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings.
    """
    texts, labels = [], []
    # Explicit encoding: the lyric files contain Chinese text, so do not
    # depend on the platform default.
    with open(os.path.join(data_path, filename), 'r', encoding='utf-8') as fh:
        for line in fh:
            line = line.rstrip()
            labels.append(line)
            texts.append(encode_text(line))
    return texts, labels


train_text, train_label = _load_split('train_data.txt')
eval_text, eval_label = _load_split('eval_data.txt')
test_text, test_label = _load_split('test_data.txt')


'''
with open('/content/drive/MyDrive/MSc_Project/dataset/lyric.txt', 'r') as fp:
    text = fp.read().split('\n')

for i in range(len(text)):
    text[i] = encode_text(text[i])
'''

"\nwith open('/content/drive/MyDrive/MSc_Project/dataset/lyric.txt', 'r') as fp:\n    text = fp.read().split('\n')\n\nfor i in range(len(text)):\n    text[i] = encode_text(text[i])\n"

In [None]:
# Tokenizer settings shared by every split: PyTorch tensors, truncated and
# padded to a fixed length of 128 tokens.
tokenize_kwargs = dict(return_tensors='pt', max_length=128, truncation=True, padding='max_length')

# Masked texts (model inputs) for the train / eval / test splits.
inputs, eval_inputs, test_inputs = (
    tokenizer(split, **tokenize_kwargs)
    for split in (train_text, eval_text, test_text)
)

# Original texts (targets) for the train / eval / test splits.
labels, eval_labels, test_labels = (
    tokenizer(split, **tokenize_kwargs)
    for split in (train_label, eval_label, test_label)
)

In [None]:
# Attach the target token ids to each split as `labels`.
# The targets were padded to a fixed length, so padded positions are set to
# -100: the model's loss ignores label positions with that value, which keeps
# padding from being counted in the training/eval loss (and in perplexity).
for _encoding, _target in (
    (inputs, labels),
    (eval_inputs, eval_labels),
    (test_inputs, test_labels),
):
    _label_ids = _target.input_ids.detach().clone()
    _label_ids[_label_ids == tokenizer.pad_token_id] = -100
    _encoding['labels'] = _label_ids

#inputs.keys()

In [None]:
class LyricsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of column name -> per-example values.

    ``encodings`` must provide ``.items()`` and an ``"input_ids"`` entry
    (for example a tokenizer output or a plain dict). Each column may be a
    tensor or a nested list.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of independent tensors."""
        item = {}
        for key, val in self.encodings.items():
            if isinstance(val, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct
                # warning; clone().detach() is the recommended copy.
                item[key] = val[idx].clone().detach()
            else:
                item[key] = torch.tensor(val[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` column."""
        # Key access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized train / eval / test encodings in LyricsDataset objects.
train_dataset, eval_dataset, test_dataset = (
    LyricsDataset(encoding) for encoding in (inputs, eval_inputs, test_inputs)
)

In [None]:
# Move the model to the selected device and switch it to training mode.
# Both calls return the module itself, so the cell still displays the model.
model.to(device).train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(21128, 768, padding_idx=0)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(21128, 768, padding_idx=0)
      (embed_positions): BartLearnedPositionalEmbedding(514, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, 

In [None]:
from transformers import AdamW
# initialize optimizer
# NOTE(review): `optim` is not passed to the Seq2SeqTrainer constructed later
# in this notebook, so it does not appear to take part in training — confirm
# whether it should be supplied to the trainer or removed.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [None]:
from datetime import datetime

# String with the current date and time ("YYYY-MM-DD HH:MM:SS"); it is used
# below as the name of this run's output directory.
# NOTE(review): the value contains a space and colons, which end up in the
# directory name.
time_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

#args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", per_device_train_batch_size=4)

# Training configuration: 5 epochs, with evaluation and checkpointing both
# running every 300 steps, written to a per-run directory named by time_stamp.
args = Seq2SeqTrainingArguments(
    output_dir = f"/content/drive/MyDrive/MSc_Project/args/Method_1/{time_stamp}",
    # NOTE(review): no trainer.predict() call is visible in this notebook —
    # confirm this flag is needed.
    do_predict = True,
    #per_device_train_batch_size = 8,
    num_train_epochs = 5,
    evaluation_strategy ='steps',
    eval_steps = 300,
    save_strategy ='steps',
    save_steps = 300,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    load_best_model_at_end = True,
    predict_with_generate = True
)

In [None]:
# Build the trainer from the model, the arguments above and the train/eval
# datasets. No compute_metrics function and no optimizer are passed in.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #compute_metrics=accuracy_metric,
)

In [None]:
# Run training, then save the model.
train_result = trainer.train()
trainer.save_model()
metrics = train_result.metrics

# Log the training metrics, write them to disk and save the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


***** Running training *****
  Num examples = 38102
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 23815
  """
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss
300,No log,0.533243
600,0.579900,0.510484
900,0.579900,0.505614
1200,0.528400,0.499768
1500,0.524900,0.492751
1800,0.524900,0.487675
2100,0.521200,0.485748
2400,0.521200,0.480519
2700,0.514300,0.478258
3000,0.507300,0.474105


***** Running Evaluation *****
  Num examples = 9525
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
Saving model checkpoint to /content/drive/MyDrive/MSc_Project/args/Method_1/2022-09-01 12:49:23/checkpoint-300
Configuration saved in /content/drive/MyDrive/MSc_Project/args/Method_1/2022-09-01 12:49:23/checkpoint-300/config.json
Model weights saved in /content/drive/MyDrive/MSc_Project/args/Method_1/2022-09-01 12:49:23/checkpoint-300/pytorch_model.bin
  """
***** Running Evaluation *****
  Num examples = 9525
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGener

***** train metrics *****
  epoch                    =        5.0
  total_flos               = 13522907GF
  train_loss               =     0.3797
  train_runtime            = 2:31:08.43
  train_samples_per_second =     21.008
  train_steps_per_second   =      2.626


In [None]:
# Evaluate on the eval dataset and derive perplexity as exp(eval_loss).
metrics = trainer.evaluate()
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    # math.exp overflows for very large losses; report infinity instead.
    perplexity = float("inf")
metrics["perplexity"] = perplexity

# Log the evaluation metrics (including perplexity) and write them to disk.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 9525
  Batch size = 8
  """
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.


***** eval metrics *****
  epoch                   =        5.0
  eval_loss               =     0.4511
  eval_runtime            = 0:00:53.58
  eval_samples_per_second =    177.767
  eval_steps_per_second   =     22.228
  perplexity              =       1.57


In [None]:
# Sample lyric used as the generation prompt; encode_text masks one randomly
# chosen word per sentence, so new_text differs between runs.
text = '冷咖啡离开了杯垫，我忍住的情绪在很后面，决定一个人走，我没有后路可退，只能承受悲哀来陪，我的痛谁会在意，如果相遇是个意外，我愿意放手让你走，我不想让自己在沉醉，在悲伤中无法自拔'
new_text = encode_text(text)
import difflib

In [None]:
text

'冷咖啡离开了杯垫，我忍住的情绪在很后面，决定一个人走，我没有后路可退，只能承受悲哀来陪，我的痛谁会在意，如果相遇是个意外，我愿意放手让你走，我不想让自己在沉醉，在悲伤中无法自拔'

In [None]:
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")
import os
# Checkpoint directory of a fine-tuned run.
# NOTE(review): this path is hard-coded to a run from a different timestamp
# than the `time_stamp` used for training in this notebook — confirm it is
# the intended checkpoint.
PATH = r'/content/drive/MyDrive/MSc_Project/args/Method_1/2022-08-30 21:17:21/checkpoint-8500'
#tokenizer = BertTokenizer.from_pretrained(os.path.join(PATH, 'vocab.txt'), local_files_only=True)
# Load from the checkpoint directory (which holds config.json and the weights)
# rather than pointing at the weight file and config file separately.
model = BartForConditionalGeneration.from_pretrained(PATH, local_files_only=True)

# Sample 50 candidate completions of the masked prompt.
batch = tokenizer(new_text, return_tensors="pt")
generated_ids = model.generate(
    batch["input_ids"],
    num_return_sequences=50,
    max_length=200,
    do_sample=True,
    temperature=1.0,
    output_scores=True,
    return_dict_in_generate=True,
)
# Decode the sequences and strip the spaces from the decoded strings.
result = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
result = [s.replace(" ", "") for s in result]

loading file https://huggingface.co/fnlp/bart-base-chinese/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/feb7fcba07a5cd52dab8daea7c7654f9f450cf4e2586eb946df713da5b44d5e4.accd894ff58c6ff7bd4f3072890776c14f4ea34fcc08e79cd88c2d157756dceb
loading file https://huggingface.co/fnlp/bart-base-chinese/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/fnlp/bart-base-chinese/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/c7a2ad3ce29650bde9ea8929d9d4414f1472f2eaee89e1700413a60725333838.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/fnlp/bart-base-chinese/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/e8916bb2271881244e34cad9e88d11ef38394196b1d328d76773fde6934c0ef9.4930bdcbc6f75dead7cdeadc249fdb55dcb3cd75bdcee68ee5fcd8aeb6e6e359
loading configuration file https://huggingface.co/fnlp/bart-base-chinese/

In [None]:
list_of_rhyme = ['AAAAAAAAAA', 'AABBCCDDEE', 'ABABABABAB', 'ABCDEABCDE', 'ABCDEFGHIJ']

def stat_metric(rhyme_metric):
    """Summarise a list of rhyme-similarity scores.

    Args:
        rhyme_metric: Non-empty list of numeric scores.

    Returns:
        ``[average, maximum, n_upper, n_middle]`` where ``n_upper`` counts
        scores >= 0.7 and ``n_middle`` counts scores >= 0.5 (so it includes
        the ones counted by ``n_upper``).
    """
    total = sum(rhyme_metric)
    count = len(rhyme_metric)
    n_upper = 0
    n_middle = 0
    for score in rhyme_metric:
        if score >= 0.5:
            n_middle += 1
            if score >= 0.7:
                n_upper += 1
    return [total / count, max(rhyme_metric), n_upper, n_middle]
#round(avg,3)

def rhyme_stat(list_text, schema):
    """Score how closely each text in ``list_text`` follows a rhyme schema.

    For every text the rhyme schema is derived with ``sents_to_schema``,
    truncated to the length of ``schema`` and compared to it with
    ``difflib.SequenceMatcher(...).ratio()``.

    Args:
        list_text: List of generated texts to evaluate.
        schema: Target rhyme schema, e.g. ``'AABBCCDDEE'``.

    Returns:
        The ``stat_metric`` summary of the per-text similarity ratios.
    """
    rhyme_metric = []
    # Iterate over the argument itself, not a module-level variable.
    for candidate in list_text:
        new_schema = sents_to_schema(clean_text(candidate))[:len(schema)]
        ratio = difflib.SequenceMatcher(None, schema, new_schema).ratio()
        rhyme_metric.append(ratio)
    return stat_metric(rhyme_metric)

In [None]:
#EXAMPLE 1:
# Rhyme statistics of the current `result` batch, one row per target schema.
EX_1 = [rhyme_stat(result, schema) for schema in list_of_rhyme]

In [None]:
EX_1

[[0.09999999999999996, 0.1, 0, 0],
 [0.7700000000000002, 0.8, 50, 50],
 [0.39999999999999986, 0.4, 0, 0],
 [0.6999999999999998, 0.7, 50, 50],
 [0.5300000000000001, 0.6, 0, 50]]

In [None]:
#EXAMPLE 1:
# Sample 50 candidates from the masked prompt `iteration` times, keep the
# per-run rhyme statistics in ALL_EX_1 and their average over runs in EX_1.
iteration = 5
EX_1 = [[0, 0, 0, 0] for _ in range(5)]  # running totals, one row per schema
ALL_EX_1 = []
for _ in range(iteration):
    batch = tokenizer(new_text, return_tensors="pt")
    generated_ids = model.generate(
        batch["input_ids"],
        num_return_sequences=50,
        max_length=200,
        do_sample=True,
        temperature=1.0,
        output_scores=True,
        return_dict_in_generate=True,
    )
    decoded = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
    result = [s.replace(" ", "") for s in decoded]
    run_stats = []
    for j, schema in enumerate(list_of_rhyme):
        stats = rhyme_stat(result, schema)
        run_stats.append(stats)
        # Element-wise accumulation of this run's statistics.
        EX_1[j] = [total + value for total, value in zip(EX_1[j], stats)]
    ALL_EX_1.append(run_stats)
EX_1 = (np.array(EX_1) / iteration).tolist()

In [None]:
EX_1

[[0.09999999999999996, 0.1, 0.0, 0.0],
 [0.7735999999999998, 0.8, 50.0, 50.0],
 [0.39999999999999986, 0.4, 0.0, 0.0],
 [0.6999999999999998, 0.7, 50.0, 50.0],
 [0.5264000000000001, 0.6, 0.0, 50.0]]

In [None]:
ALL_EX_1

[[[0.09999999999999996, 0.1, 0, 0],
  [0.7759999999999999, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.524, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.77, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5300000000000001, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7839999999999998, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.516, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7739999999999996, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5260000000000001, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7639999999999999, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5360000000000001, 0.6, 0, 50]]]

In [None]:
#EXAMPLE 2:
# Sample 50 candidates from the masked prompt `iteration` times, keep the
# per-run rhyme statistics in ALL_EX_2 and their average over runs in EX_2.
iteration = 5
EX_2 = [[0, 0, 0, 0] for _ in range(5)]  # running totals, one row per schema
ALL_EX_2 = []
for _ in range(iteration):
    batch = tokenizer(new_text, return_tensors="pt")
    generated_ids = model.generate(
        batch["input_ids"],
        num_return_sequences=50,
        max_length=200,
        do_sample=True,
        temperature=1.0,
        output_scores=True,
        return_dict_in_generate=True,
    )
    decoded = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
    result = [s.replace(" ", "") for s in decoded]
    run_stats = []
    for j, schema in enumerate(list_of_rhyme):
        stats = rhyme_stat(result, schema)
        run_stats.append(stats)
        # Element-wise accumulation of this run's statistics.
        EX_2[j] = [total + value for total, value in zip(EX_2[j], stats)]
    ALL_EX_2.append(run_stats)
EX_2 = (np.array(EX_2) / iteration).tolist()

In [None]:
EX_2

[[0.09999999999999996, 0.1, 0.0, 0.0],
 [0.7767999999999999, 0.8, 50.0, 50.0],
 [0.39999999999999986, 0.4, 0.0, 0.0],
 [0.6999999999999998, 0.7, 50.0, 50.0],
 [0.5232000000000001, 0.6, 0.0, 50.0]]

In [None]:
ALL_EX_2

[[[0.09999999999999996, 0.1, 0, 0],
  [0.7759999999999997, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5240000000000002, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7819999999999997, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.518, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7719999999999999, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5280000000000001, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7739999999999999, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5260000000000001, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7800000000000001, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.5200000000000001, 0.6, 0, 50]]]

In [None]:
#EXAMPLE 3:
# Sample 50 candidates from the masked prompt `iteration` times, keep the
# per-run rhyme statistics in ALL_EX_3 and their average over runs in EX_3.
iteration = 5
EX_3 = [[0, 0, 0, 0] for _ in range(5)]  # running totals, one row per schema
ALL_EX_3 = []
for _ in range(iteration):
    batch = tokenizer(new_text, return_tensors="pt")
    generated_ids = model.generate(
        batch["input_ids"],
        num_return_sequences=50,
        max_length=200,
        do_sample=True,
        temperature=1.0,
        output_scores=True,
        return_dict_in_generate=True,
    )
    decoded = tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)
    result = [s.replace(" ", "") for s in decoded]
    run_stats = []
    for j, schema in enumerate(list_of_rhyme):
        stats = rhyme_stat(result, schema)
        run_stats.append(stats)
        # Element-wise accumulation of this run's statistics.
        EX_3[j] = [total + value for total, value in zip(EX_3[j], stats)]
    ALL_EX_3.append(run_stats)
EX_3 = (np.array(EX_3) / iteration).tolist()

In [None]:
EX_3

[[0.10079999999999996, 0.13999999999999999, 0.0, 0.0],
 [0.7775999999999998, 0.8, 50.0, 50.0],
 [0.3991999999999999, 0.4, 0.0, 0.0],
 [0.6995999999999998, 0.7, 49.8, 50.0],
 [0.5224, 0.6, 0.0, 50.0]]

In [None]:
ALL_EX_3

[[[0.10199999999999995, 0.2, 0, 0],
  [0.77, 0.8, 50, 50],
  [0.3979999999999999, 0.4, 0, 0],
  [0.6979999999999998, 0.7, 49, 50],
  [0.5300000000000001, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7799999999999998, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.52, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7759999999999999, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.524, 0.6, 0, 50]],
 [[0.09999999999999996, 0.1, 0, 0],
  [0.7799999999999998, 0.8, 50, 50],
  [0.39999999999999986, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.52, 0.6, 0, 50]],
 [[0.10199999999999995, 0.2, 0, 0],
  [0.7819999999999999, 0.8, 50, 50],
  [0.3979999999999999, 0.4, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.518, 0.6, 0, 50]]]

In [None]:
'''
#TEXT 1:
text = '最美的不是下雨天，是曾与你躲过雨的屋檐，泪流过的脸，是我记忆模糊一片，如果爱的路上没有了你，会不会有爱的晴天，如果没有你，我会在何处安身，哪怕一直走到了最后一个人，如果没有你'

EX_1 = [[0.39679999999999993, 0.45999999999999996, 0.0, 1.0],
 [0.7179999999999999, 0.8, 49.2, 50.0],
 [0.5204, 0.62, 0.2, 30.2],
 [0.5176000000000002, 0.6, 0.0, 49.2],
 [0.418, 0.5, 0.0, 9.8]]

EX_2 = [[0.39679999999999993, 0.5, 0.0, 1.8],
 [0.7184000000000001, 0.8, 49.0, 50.0],
 [0.5112, 0.62, 0.4, 27.8],
 [0.5184000000000001, 0.6, 0.0, 49.0],
 [0.4183999999999998, 0.5, 0.0, 10.2]]

EX_3 = [[0.3971999999999999, 0.45999999999999996, 0.0, 0.6],
 [0.7168, 0.8, 49.2, 50.0],
 [0.5176000000000001, 0.62, 0.6, 29.2],
 [0.5168000000000001, 0.6, 0.0, 49.2],
 [0.41679999999999995, 0.5, 0.0, 9.2]]

ALL_EX_1 = [[[0.3979999999999999, 0.4, 0, 0],
  [0.716, 0.8, 50, 50],
  [0.5280000000000001, 0.6, 0, 32],
  [0.5160000000000001, 0.6, 0, 50],
  [0.41599999999999987, 0.5, 0, 8]],
 [[0.39199999999999996, 0.4, 0, 0],
  [0.7199999999999999, 0.8, 49, 50],
  [0.518, 0.7, 1, 29],
  [0.5200000000000001, 0.6, 0, 49],
  [0.41999999999999993, 0.5, 0, 11]],
 [[0.39999999999999986, 0.5, 0, 3],
  [0.72, 0.8, 49, 50],
  [0.508, 0.6, 0, 27],
  [0.5200000000000001, 0.6, 0, 49],
  [0.4199999999999999, 0.5, 0, 11]],
 [[0.39599999999999996, 0.5, 0, 1],
  [0.7099999999999997, 0.8, 48, 50],
  [0.5299999999999998, 0.6, 0, 33],
  [0.5100000000000001, 0.6, 0, 48],
  [0.41, 0.5, 0, 7]],
 [[0.39799999999999996, 0.5, 0, 1],
  [0.7239999999999998, 0.8, 50, 50],
  [0.5179999999999999, 0.6, 0, 30],
  [0.5220000000000001, 0.6, 0, 50],
  [0.42399999999999993, 0.5, 0, 12]]]

ALL_EX_2 = [[[0.38599999999999995, 0.5, 0, 1],
  [0.7140000000000001, 0.8, 50, 50],
  [0.508, 0.6, 0, 27],
  [0.514, 0.6, 0, 50],
  [0.4139999999999999, 0.5, 0, 7]],
 [[0.3979999999999999, 0.5, 0, 3],
  [0.7179999999999999, 0.8, 49, 50],
  [0.502, 0.6, 0, 26],
  [0.5160000000000001, 0.6, 0, 49],
  [0.418, 0.5, 0, 10]],
 [[0.39999999999999986, 0.5, 0, 1],
  [0.7320000000000003, 0.8, 50, 50],
  [0.49200000000000005, 0.6, 0, 23],
  [0.534, 0.6, 0, 50],
  [0.4319999999999997, 0.5, 0, 16]],
 [[0.3979999999999999, 0.5, 0, 2],
  [0.7040000000000001, 0.8, 47, 50],
  [0.5299999999999998, 0.7, 2, 32],
  [0.504, 0.6, 0, 47],
  [0.40399999999999997, 0.5, 0, 5]],
 [[0.4019999999999999, 0.5, 0, 2],
  [0.7240000000000002, 0.8, 49, 50],
  [0.5240000000000002, 0.6, 0, 31],
  [0.524, 0.6, 0, 49],
  [0.4239999999999998, 0.5, 0, 13]]]

ALL_EX_3 = [[[0.3999999999999999, 0.5, 0, 1],
  [0.7140000000000002, 0.8, 46, 50],
  [0.5160000000000001, 0.7, 3, 28],
  [0.5139999999999999, 0.6, 0, 46],
  [0.4139999999999999, 0.5, 0, 11]],
 [[0.39599999999999996, 0.4, 0, 0],
  [0.718, 0.8, 50, 50],
  [0.5240000000000004, 0.6, 0, 31],
  [0.5180000000000001, 0.6, 0, 50],
  [0.418, 0.5, 0, 9]],
 [[0.39399999999999996, 0.4, 0, 0],
  [0.7219999999999999, 0.8, 50, 50],
  [0.4959999999999998, 0.6, 0, 24],
  [0.5220000000000001, 0.6, 0, 50],
  [0.4219999999999999, 0.5, 0, 11]],
 [[0.39799999999999996, 0.5, 0, 1],
  [0.7120000000000001, 0.8, 50, 50],
  [0.536, 0.6, 0, 34],
  [0.512, 0.6, 0, 50],
  [0.41199999999999987, 0.5, 0, 6]],
 [[0.39799999999999996, 0.5, 0, 1],
  [0.7179999999999999, 0.8, 50, 50],
  [0.516, 0.6, 0, 29],
  [0.5180000000000001, 0.6, 0, 50],
  [0.4179999999999999, 0.5, 0, 9]]]
'''

"\n#TEXT 1:\ntext = '最美的不是下雨天，是曾与你躲过雨的屋檐，泪流过的脸，是我记忆模糊一片，如果爱的路上没有了你，会不会有爱的晴天，如果没有你，我会在何处安身，哪怕一直走到了最后一个人，如果没有你'\n\nEX_1 = [[0.39679999999999993, 0.45999999999999996, 0.0, 1.0],\n [0.7179999999999999, 0.8, 49.2, 50.0],\n [0.5204, 0.62, 0.2, 30.2],\n [0.5176000000000002, 0.6, 0.0, 49.2],\n [0.418, 0.5, 0.0, 9.8]]\n\nEX_2 = [[0.39679999999999993, 0.5, 0.0, 1.8],\n [0.7184000000000001, 0.8, 49.0, 50.0],\n [0.5112, 0.62, 0.4, 27.8],\n [0.5184000000000001, 0.6, 0.0, 49.0],\n [0.4183999999999998, 0.5, 0.0, 10.2]]\n\nEX_3 = [[0.3971999999999999, 0.45999999999999996, 0.0, 0.6],\n [0.7168, 0.8, 49.2, 50.0],\n [0.5176000000000001, 0.62, 0.6, 29.2],\n [0.5168000000000001, 0.6, 0.0, 49.2],\n [0.41679999999999995, 0.5, 0.0, 9.2]]\n\nALL_EX_1 = [[[0.3979999999999999, 0.4, 0, 0],\n  [0.716, 0.8, 50, 50],\n  [0.5280000000000001, 0.6, 0, 32],\n  [0.5160000000000001, 0.6, 0, 50],\n  [0.41599999999999987, 0.5, 0, 8]],\n [[0.39199999999999996, 0.4, 0, 0],\n  [0.7199999999999999, 0.8, 49, 50],\n  [0.518, 0.7,

In [None]:
'''
TEXT 2:
text = '冷咖啡离开了杯垫，我忍住的情绪在很后面，决定一个人走，我没有后路可退，只能承受悲哀来陪，我的痛谁会在意，如果相遇是个意外，我愿意放手让你走，我不想让自己在沉醉，在悲伤中无法自拔'

EX_1 = [[0.2003999999999999, 0.21999999999999997, 0.0, 0.0],
 [0.6999999999999998, 0.7, 50.0, 50.0],
 [0.3004000000000003, 0.32, 0.0, 0.0],
 [0.6000000000000005, 0.6, 0.0, 50.0],
 [0.6164000000000003, 0.7, 8.2, 50.0]]

EX_2 = [[0.19999999999999993, 0.2, 0.0, 0.0],
 [0.7003999999999998, 0.7200000000000001, 50.0, 50.0],
 [0.30000000000000027, 0.3, 0.0, 0.0],
 [0.6000000000000005, 0.6, 0.0, 50.0],
 [0.6120000000000003, 0.7, 6.0, 50.0]]

EX_3 = [[0.19999999999999993, 0.2, 0.0, 0.0],
 [0.6999999999999998, 0.7, 50.0, 50.0],
 [0.30000000000000027, 0.3, 0.0, 0.0],
 [0.6000000000000005, 0.6, 0.0, 50.0],
 [0.6140000000000004, 0.7, 7.0, 50.0]]

ALL_EX_1 = [[[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6180000000000004, 0.7, 9, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6140000000000003, 0.7, 7, 50]],
 [[0.20199999999999993, 0.3, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30200000000000027, 0.4, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6200000000000003, 0.7, 10, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6100000000000004, 0.7, 5, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6200000000000003, 0.7, 10, 50]]]

ALL_EX_2 = [[[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6120000000000004, 0.7, 6, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.7019999999999997, 0.8, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6120000000000004, 0.7, 6, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6100000000000003, 0.7, 5, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6140000000000003, 0.7, 7, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6120000000000004, 0.7, 6, 50]]]

ALL_EX_3 = [[[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6180000000000004, 0.7, 9, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6160000000000003, 0.7, 8, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6160000000000003, 0.7, 8, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6100000000000005, 0.7, 5, 50]],
 [[0.19999999999999993, 0.2, 0, 0],
  [0.6999999999999998, 0.7, 50, 50],
  [0.30000000000000027, 0.3, 0, 0],
  [0.6000000000000005, 0.6, 0, 50],
  [0.6100000000000004, 0.7, 5, 50]]]

'''

"\nTEXT 2:\ntext = '冷咖啡离开了杯垫，我忍住的情绪在很后面，决定一个人走，我没有后路可退，只能承受悲哀来陪，我的痛谁会在意，如果相遇是个意外，我愿意放手让你走，我不想让自己在沉醉，在悲伤中无法自拔'\n\nEX_1 = [[0.2003999999999999, 0.21999999999999997, 0.0, 0.0],\n [0.6999999999999998, 0.7, 50.0, 50.0],\n [0.3004000000000003, 0.32, 0.0, 0.0],\n [0.6000000000000005, 0.6, 0.0, 50.0],\n [0.6164000000000003, 0.7, 8.2, 50.0]]\n\nEX_2 = [[0.19999999999999993, 0.2, 0.0, 0.0],\n [0.7003999999999998, 0.7200000000000001, 50.0, 50.0],\n [0.30000000000000027, 0.3, 0.0, 0.0],\n [0.6000000000000005, 0.6, 0.0, 50.0],\n [0.6120000000000003, 0.7, 6.0, 50.0]]\n\nEX_3 = [[0.19999999999999993, 0.2, 0.0, 0.0],\n [0.6999999999999998, 0.7, 50.0, 50.0],\n [0.30000000000000027, 0.3, 0.0, 0.0],\n [0.6000000000000005, 0.6, 0.0, 50.0],\n [0.6140000000000004, 0.7, 7.0, 50.0]]\n\nALL_EX_1 = [[[0.19999999999999993, 0.2, 0, 0],\n  [0.6999999999999998, 0.7, 50, 50],\n  [0.30000000000000027, 0.3, 0, 0],\n  [0.6000000000000005, 0.6, 0, 50],\n  [0.6180000000000004, 0.7, 9, 50]],\n [[0.19999999999999993, 0.2, 