In [1]:
%load_ext autoreload
%autoreload 2
import math
import os
import random
import re
import sys

import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter
from tensorflow.keras.utils import multi_gpu_model

from tf2gpt.model import GPT
from utils.progress_bar import ProgressBar
from utils.story_util import Story, Stories

In [2]:
# Display every physical device TensorFlow has detected on this machine.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
import tensorflow as tf
# Report how many GPUs TensorFlow detected. Uses the stable
# tf.config.list_physical_devices API rather than the `experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
# Make float16 the Keras default float type. The model cell below asserts that
# every model weight is float16 when this default is active.
tf.keras.backend.set_floatx('float16')

In [5]:
#mirrored_strategy = tf.distribute.MirroredStrategy()

In [6]:
#with mirrored_strategy.scope():
# Build the GPT network and restore the pretrained fp16 checkpoint.
gpt = GPT(
    vocab_size=30_000, layer_size=32, block_size=1024,
    embedding_dropout=0.0, embedding_size=2560,
    num_attention_heads=32, attention_dropout=0.0,
    residual_dropout=0.0, train_size=499,
)
gpt.load_weights('./gpt_weight_pretrain/weight_fp16')

# Disabled experiment: wrap the network in a functional model for multi-GPU use.
#input_x = tf.keras.layers.Input((499,), dtype=tf.int32)
#outputs = gpt_origin(input_x)
#gpt = tf.keras.Model(inputs=input_x, outputs=outputs)
#gpt = multi_gpu_model(gpt, gpus=8)

# Sanity check: when the Keras default float type is float16, every weight of
# the loaded model must be float16 as well.
fp16_is_default = tf.keras.backend.floatx() == tf.float16
print(tf.keras.backend.floatx(), tf.float16, fp16_is_default)
if fp16_is_default:
    assert all(weight.dtype == tf.float16 for weight in gpt.weights)

# Attach the optimizer and loss used by the fine-tuning loop.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
gpt.compile(optimizer=optimizer, loss=loss_fn)

float16 <dtype: 'float16'> True


In [7]:
from utils.gpt2_tokenizer import GPT2Tokenizer
# Tokenizer shared by the rest of the notebook, built from the vocab, merges
# and model files under CPM-Generate/bpe_3w_new.
cbpe = GPT2Tokenizer(
    'CPM-Generate/bpe_3w_new/vocab.json',
    'CPM-Generate/bpe_3w_new/merges.txt',
    model_file='CPM-Generate/bpe_3w_new/chinese_vocab.model')

In [8]:
# Tokenizer smoke test: encode a short Chinese sentence and display its token ids.
ids = cbpe.encode('今天天气还行')
ids

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.564 seconds.
Prefix dict has been built successfully.


[837, 259, 497, 57, 8, 237]

In [9]:
def test_basic_logic():
    """Greedy-decode ten tokens after a fixed prompt, printing diagnostics.

    Relies on the notebook-level ``cbpe`` tokenizer and ``gpt`` model. Every
    step prints the shape of the model output for the current sequence, the
    decoded running text after appending the argmax of the last position, and
    the per-position argmax ids together with their decoded form.
    """
    token_ids = cbpe.encode('今天天气还行')
    print(token_ids)
    print("+" * 20)
    for step in range(10):
        # Model output for the single sequence in the batch.
        scores = gpt(tf.constant([token_ids]))[0]
        print(scores.shape)
        # Greedy choice: most likely id at the final position.
        token_ids.append(np.argmax(scores[-1]))
        print(step, cbpe.decode(token_ids))
        best_per_position = np.argmax(scores, axis=-1)
        print(best_per_position)
        print(cbpe.decode(best_per_position))
        print('-' * 30)
test_basic_logic()

[837, 259, 497, 57, 8, 237]
++++++++++++++++++++
(6, 30000)
0 今天天气 还 行 
[  8 497 788 788 237   8]
气 不错 不错行 
------------------------------
(7, 30000)
1 今天天气 还 行 ,
[  8 497 788 788 237   8   9]
气 不错 不错行 ,
------------------------------
(8, 30000)
2 今天天气 还 行 , 我
[  8 497 788 788 237   8   9  16]
气 不错 不错行 , 我
------------------------------
(9, 30000)
3 今天天气 还 行 , 我 就
[  8 497 788 788 237   8   9  16  29]
气 不错 不错行 , 我 就
------------------------------
(10, 30000)
4 今天天气 还 行 , 我 就 想
[  8 497 788 788 237   8   9  16  29  84]
气 不错 不错行 , 我 就 想
------------------------------
(11, 30000)
5 今天天气 还 行 , 我 就 想着
[  8 497 788 788 237   8   9  16  29  84 197]
气 不错 不错行 , 我 就 想着
------------------------------
(12, 30000)
6 今天天气 还 行 , 我 就 想着 去
[  8 497 788 788 237   8   9  16  29  84 197  91]
气 不错 不错行 , 我 就 想着 去
------------------------------
(13, 30000)
7 今天天气 还 行 , 我 就 想着 去 看看
[  8 497 788 788 237   8   9  16  29  84 197  91 881]
气 不错 不错行 , 我 就 想着 去 看看
------------------------------
(14, 30000)
8 今天天气 还 

In [10]:
def get_learning_rate(learning_rate=6e-4,
                      warmup_steps=20_0000,
                      decay_steps=200_0000,
                      alpha=0.0):
    """Build a linear-warmup followed by cosine-decay learning-rate schedule.

    Args:
        learning_rate: Peak rate, reached at the end of warmup.
        warmup_steps: Number of steps over which the rate grows linearly from 0.
        decay_steps: Step at which the cosine decay finishes.
        alpha: Currently unused; kept so existing callers keep working.

    Returns:
        A function ``decayed_learning_rate(step=1)`` returning the rate for
        ``step``. During and after decay the multiplier never drops below 0.1.
    """
    decay_span = decay_steps - warmup_steps

    def decayed_learning_rate(step=1):
        # The `warmup_steps > 0` guard avoids dividing by zero when no warmup
        # is requested.
        if warmup_steps > 0 and step <= warmup_steps:
            mult = step / float(warmup_steps)
        else:
            if decay_span > 0:
                progress = (step - warmup_steps) / decay_span
                # Clamp so the cosine cannot swing back up once `step` has
                # passed `decay_steps` (cos is periodic).
                progress = min(1.0, max(0.0, progress))
            else:
                # Degenerate schedule with no decay window: treat as finished.
                progress = 1.0
            mult = 0.5 * (1 + math.cos(math.pi * progress))
            mult = max(0.1, mult)
        return learning_rate * mult
    return decayed_learning_rate

In [11]:
# Load the base story set from the processed translated-story file, then add
# one story per .txt file found directly under ./labeled_data/.
stories = Stories("./labeled_data/advanture_translated/processed_translated_story.txt").stories
#stories = stories[:50]
data_folder = "./labeled_data/"
# Only regular files whose name ends in ".txt" are loaded (a bare substring
# test would also match directories or names like "txt_notes.json"); sorted so
# the load order does not depend on the filesystem.
txt_files = sorted(
    os.path.join(data_folder, name)
    for name in os.listdir(data_folder)
    if name.endswith(".txt") and os.path.isfile(os.path.join(data_folder, name))
)
#stories = stories[:10]
# The value returned by Story.from_file is what gets appended to `stories`.
stories += [Story("","").from_file(path) for path in txt_files]

In [12]:
# Separate story set; the fine-tuning loop draws its validation batches from it.
valid_stories = Stories("./labeled_data/advanture_translated/processed_translated_story_valid.txt").stories

In [13]:
# Number of training stories and validation stories.
len(stories),len(valid_stories)

(308, 35)

In [14]:
import copy
def data_generator(stories, batch_size=4,sample_len=200,inf=False):
    """Yield fixed-shape batches of token ids cut from ``stories``.

    On every pass a shuffled copy of ``stories`` is walked. Each story is
    rendered with ``to_dungeon_format()``, the ``<start>``/``<end>`` markers
    and all spaces are removed, the text is encoded with the notebook-level
    ``cbpe`` tokenizer, and the ids are sliced into consecutive windows of
    ``sample_len``. A short final window is right-padded with id 0.

    Args:
        stories: Sequence of objects exposing ``to_dungeon_format()``.
        batch_size: Number of windows per yielded batch.
        sample_len: Number of token ids per window.
        inf: When true, start a new shuffled pass after each one finishes.

    Yields:
        ``np.ndarray`` holding ``batch_size`` rows of ``sample_len`` ids.

    Windows left over at the end of a pass that do not fill a whole batch are
    discarded. If a full pass produces no batch at all the generator stops,
    even with ``inf=True``; otherwise it would spin forever without yielding.
    """
    while True:
        batch_data = []
        produced_batch = False
        tmp_stories = copy.copy(stories)
        random.shuffle(tmp_stories)
        for one_story in tmp_stories:
            story_content = (
                one_story.to_dungeon_format()
                .replace("<start>\n", "")
                .replace("\n<end>", "")
                .replace(" ", "")
            )
            ids = cbpe.encode(story_content)
            for start in range(0, len(ids), sample_len):
                sample = ids[start:start + sample_len]
                # Right-pad the last, shorter window with id 0.
                sample = sample + [0] * (sample_len - len(sample))
                batch_data.append(sample)
                if len(batch_data) >= batch_size:
                    yield np.asarray(batch_data)
                    produced_batch = True
                    batch_data = []
        if not inf or not produced_batch:
            break

In [15]:
# Endless (inf=True) stream of validation batches; the training loop pulls one
# batch from it whenever it evaluates.
valid_gen = data_generator(valid_stories,sample_len=500,batch_size=2,inf=True)

In [16]:
# Pull one validation batch (this consumes it) and show its shape.
next(valid_gen).shape

(2, 500)

In [None]:
# Fine-tuning loop: 200 passes over the training stories. Train loss is logged
# every step and validation loss every 10th step.
writer = SummaryWriter('log/finetune')
n_iter = 0
train_loss = 100
val_loss = 100
try:
    for epoch in range(200):
        print(f"Epoch {epoch + 1}")
        pb = ProgressBar(10000)
        pb.startjob()
        for x in data_generator(stories,sample_len=500,batch_size=2):
        #for x in data_generator_content(texts[508:],sample_len=400,batch_size=2):
            n_iter += 1
            # Next-token setup: inputs drop the last id, targets drop the first.
            ret = gpt.train_step(
                (tf.constant(x[:,:-1]), tf.constant(x[:,1:]))
            )
            train_loss = ret["loss"].numpy()
            writer.add_scalar("train/loss", train_loss, n_iter)
            if n_iter % 10 == 0:
                valid_x = next(valid_gen)
                # NOTE(review): stock Keras models expose `test_step`, not
                # `eval_step`; presumably the project GPT class defines it. Confirm.
                ret = gpt.eval_step(
                    (tf.constant(valid_x[:,:-1]), tf.constant(valid_x[:,1:]))
                )
                val_loss = ret["loss"].numpy()
                writer.add_scalar("test/loss", val_loss, n_iter)
            pb.info = f"tl: {train_loss} vl: {val_loss}"
            pb.complete(1)
        print()
finally:
    # Flush and close the event file even when the run is interrupted, so the
    # scalars logged so far are not lost.
    writer.close()

Epoch 1
tl: 2.87890625 vl: 2.939453125 1.48 % [>--------------------------------------------------] 148/10000 	 used:167s eta:11164 ss

In [None]:
# Show the first story in the text form that data_generator tokenizes.
print(stories[0].to_dungeon_format())

# 使用

In [None]:
# Greedy generation demo: extend the prompt by 200 tokens, always taking the
# highest-scoring id at the last position, then print the decoded result.
q = f'''你是一个公司老总，你事业正在上升期，你娶了一个美丽的妻子
> 你走进你的家门
'''
ids = cbpe.encode(q)
for i in range(200):
    # Model output for the single sequence in the batch.
    scores = gpt(tf.constant([ids]))[0]
    ids.append(np.argmax(scores[-1]))
    print(i)

print(i,cbpe.decode(ids))

In [None]:
# Save the current weights under a new prefix, distinct from the pretrained
# checkpoint path loaded earlier in the notebook.
gpt.save_weights('./gpt_weight_pretrain/weight_fp16_200epoch_stories')