In [3]:
%load_ext autoreload
%autoreload 2
import math
import os
import random
import re
import sys

import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter
from tensorflow.keras.utils import multi_gpu_model

from tf2gpt.model import GPT
from utils.progress_bar import ProgressBar
from utils.story_util import Story,Stories

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Show every physical device TensorFlow can see (the cell output lists them by type).
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
import tensorflow as tf
# Count the visible GPUs with the stable API (the same call the device-listing
# cell uses) rather than the older `tf.config.experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
# Make float16 the Keras default float type. The model cell below asserts that
# every model weight is float16 and loads a checkpoint named "weight_fp16".
tf.keras.backend.set_floatx('float16')

In [7]:
#mirrored_strategy = tf.distribute.MirroredStrategy()

In [8]:
#with mirrored_strategy.scope():
# Build the GPT model and restore the half-precision pretrained weights.
# NOTE(review): the hyper-parameters presumably have to match the checkpoint
# loaded below -- confirm against the pretraining configuration.
gpt = GPT(
    vocab_size=30_000,
    layer_size=32,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=2560,
    num_attention_heads=32,
    attention_dropout=0.0,
    residual_dropout=0.0,
    train_size=499  # appears to match the 499-token inputs the training cells feed (sample_len=500, shifted by one) -- TODO confirm
)
gpt.load_weights('./gpt_weight_pretrain/weight_fp16')

# Disabled multi-GPU experiment (wraps the model in a functional Model and
# replicates it across 8 GPUs); kept for reference only.
#input_x = tf.keras.layers.Input((499,), dtype=tf.int32)
#outputs = gpt_origin(input_x)

#gpt = tf.keras.Model(inputs=input_x, outputs=outputs)
#gpt = multi_gpu_model(gpt, gpus=8)

# Sanity check: when Keras runs in float16, every model weight must be float16
# too. The print shows the configured floatx and the result of the comparison.
print(tf.keras.backend.floatx(), tf.float16, tf.keras.backend.floatx() == tf.float16)
if tf.keras.backend.floatx() == tf.float16:
    for x in gpt.weights:
        assert x.dtype == tf.float16


# Compile for fine-tuning with a small, constant learning rate.
gpt.compile(
    # NOTE(review): Adam's default epsilon (1e-7) is close to the float16
    # underflow limit -- verify that training is numerically stable.
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),  # Optimizer
    # Loss function to minimize.
    # NOTE(review): from_logits is left at its default (False), which assumes the
    # model outputs probabilities rather than raw logits -- TODO confirm.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    # No metrics are passed to compile().
)

float16 <dtype: 'float16'> True


In [9]:
# Build the tokenizer from the vocab, merges and model files found under
# CPM-Generate/bpe_3w_new. `cbpe` is used by later cells for encode()/decode().
from gpt2_tokenizer import GPT2Tokenizer
cbpe = GPT2Tokenizer(
    'CPM-Generate/bpe_3w_new/vocab.json',
    'CPM-Generate/bpe_3w_new/merges.txt',
    model_file='CPM-Generate/bpe_3w_new/chinese_vocab.model')

In [10]:
# Quick tokenizer check: encode a short sentence and display the resulting token ids.
ids = cbpe.encode('今天天气还行')
ids

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.571 seconds.
Prefix dict has been built successfully.


[837, 259, 497, 57, 8, 237]

In [11]:
def test_basic_logic(prompt='今天天气还行', steps=10):
    """Greedy-decode a few tokens from `prompt` and print the intermediate state.

    Used as a quick smoke test of the model: at every step it prints the shape
    of the model output for the single sequence, the decoded text so far, the
    arg-max token id at every position and the decoding of those ids.

    Args:
        prompt: Text that seeds the generation.
        steps: Number of tokens to generate greedily.

    Relies on the notebook-level `cbpe` tokenizer and `gpt` model.
    """
    ids = cbpe.encode(prompt)
    print(ids)
    print("+" * 20)
    for i in range(steps):
        output = gpt(tf.constant([ids]))
        print(output[0].shape)
        # Arg-max over the vocabulary axis for every position; the last entry is
        # the greedy prediction for the next token.
        greedy_ids = np.argmax(output[0], axis=-1)
        # Keep `ids` a list of plain Python ints so the tensor built from it has
        # the same dtype on every step (no numpy scalars mixed in).
        ids.append(int(greedy_ids[-1]))
        print(i, cbpe.decode(ids))
        print(greedy_ids)
        print(cbpe.decode(greedy_ids))
        print('-' * 30)
test_basic_logic()

[837, 259, 497, 57, 8, 237]
++++++++++++++++++++
(6, 30000)
0 今天天气 还 行 
[  8 497 788 788 237   8]
气 不错 不错行 
------------------------------
(7, 30000)
1 今天天气 还 行 ,
[  8 497 788 788 237   8   9]
气 不错 不错行 ,
------------------------------
(8, 30000)
2 今天天气 还 行 , 我
[  8 497 788 788 237   8   9  16]
气 不错 不错行 , 我
------------------------------
(9, 30000)
3 今天天气 还 行 , 我 就
[  8 497 788 788 237   8   9  16  29]
气 不错 不错行 , 我 就
------------------------------
(10, 30000)
4 今天天气 还 行 , 我 就 想
[  8 497 788 788 237   8   9  16  29  84]
气 不错 不错行 , 我 就 想
------------------------------
(11, 30000)
5 今天天气 还 行 , 我 就 想着
[  8 497 788 788 237   8   9  16  29  84 197]
气 不错 不错行 , 我 就 想着
------------------------------
(12, 30000)
6 今天天气 还 行 , 我 就 想着 去
[  8 497 788 788 237   8   9  16  29  84 197  91]
气 不错 不错行 , 我 就 想着 去
------------------------------
(13, 30000)
7 今天天气 还 行 , 我 就 想着 去 看看
[  8 497 788 788 237   8   9  16  29  84 197  91 881]
气 不错 不错行 , 我 就 想着 去 看看
------------------------------
(14, 30000)
8 今天天气 还 

In [12]:
def get_learning_rate(learning_rate=6e-4,
                      warmup_steps=200_000,
                      decay_steps=2_000_000,
                      alpha=0.0):
    """Build a linear-warmup / cosine-decay learning-rate schedule.

    Args:
        learning_rate: Peak learning rate, reached at the end of warm-up.
        warmup_steps: Number of steps over which the rate grows linearly from 0
            to `learning_rate`.
        decay_steps: Step at which the cosine decay ends. From there on the
            multiplier stays at its floor.
        alpha: Accepted for interface compatibility; not used by the schedule.

    Returns:
        A function `decayed_learning_rate(step=1)` returning the learning rate
        for `step`. After warm-up the multiplier follows half a cosine period
        and is never allowed to drop below 0.1.
    """
    min_mult = 0.1  # floor of the decay multiplier
    decay_span = decay_steps - warmup_steps

    def decayed_learning_rate(step=1):
        # `warmup_steps > 0` avoids a division by zero when warm-up is disabled.
        if warmup_steps > 0 and step <= warmup_steps:
            mult = step / float(warmup_steps)
        else:
            if decay_span > 0:
                # Clamp to 1.0: the cosine is periodic, so without the clamp the
                # rate would climb back up once `step` exceeds `decay_steps`.
                progress = min(1.0, (step - warmup_steps) / decay_span)
            else:
                # Degenerate configuration (no decay window): treat as fully decayed.
                progress = 1.0
            mult = 0.5 * (1 + math.cos(math.pi * progress))
            mult = max(min_mult, mult)
        return learning_rate * mult
    return decayed_learning_rate

In [13]:
# Load the translated adventure stories. `.stories` is presumably a list: later
# cells extend it with `+=` and call len() on it.
stories = Stories("./labeled_data/advanture_translated/processed_translated_story.txt").stories
# Uncomment to work on a small subset while debugging.
#stories = stories[:50]

In [14]:
data_folder = "./labeled_data/"
# Pick up only files whose name ends in ".txt" (a substring test would also match
# e.g. "txt_backup" or "notes.txt.bak"), sorted so the load order is deterministic.
txt_files = sorted(
    os.path.join(data_folder, name)
    for name in os.listdir(data_folder)
    if name.endswith(".txt")
)
#stories = stories[:10]
# NOTE: this extends `stories` in place, so re-running the cell without
# re-running the loading cell above appends the same files again.
stories += [Story("","").from_file(path) for path in txt_files]

In [15]:
# Number of stories collected by the two loading cells above.
len(stories)

308

In [16]:
import copy
def data_generator(stories, batch_size=4,sample_len=200, pad_id=0, drop_last=True):
    """Yield fixed-size batches of token ids cut from shuffled stories.

    Each story is rendered with `to_normal_format()`, stripped of its
    "<start>"/"<end>" markers and of spaces, tokenized with the notebook-level
    `cbpe` tokenizer and cut into consecutive chunks of `sample_len` ids. The
    last chunk of a story is right-padded with `pad_id`. Chunks from different
    stories may share a batch.

    Args:
        stories: Sequence of story objects exposing `to_normal_format()`. The
            sequence itself is not modified (a shallow copy is shuffled).
        batch_size: Number of chunks per yielded batch.
        sample_len: Number of token ids per chunk.
        pad_id: Token id used to pad the final chunk of each story.
            NOTE(review): padded positions are ordinary tokens as far as the
            caller is concerned -- confirm the loss should see them.
        drop_last: When True (default), a trailing batch with fewer than
            `batch_size` chunks is discarded so every batch has the same shape.

    Yields:
        np.ndarray of shape (batch_size, sample_len); the final batch may have
        fewer rows when `drop_last` is False.

    Raises:
        ValueError: if `batch_size` or `sample_len` is not positive (raised on
            first iteration, since this is a generator).
    """
    if batch_size <= 0 or sample_len <= 0:
        # A non-positive sample_len would otherwise never consume any ids.
        raise ValueError("batch_size and sample_len must be positive")

    batch_data = []
    tmp_stories = copy.copy(stories)
    random.shuffle(tmp_stories)
    pb = ProgressBar(len(tmp_stories))
    pb.startjob()
    for one_story in tmp_stories:
        story_content = one_story.to_normal_format()
        for marker in ("<start>\n", "\n<end>", " "):
            story_content = story_content.replace(marker, "")
        ids = cbpe.encode(story_content)
        for start in range(0, len(ids), sample_len):
            sample = list(ids[start:start + sample_len])
            # Right-pad the last (short) chunk of the story to full length.
            sample += [pad_id] * (sample_len - len(sample))
            batch_data.append(sample)
            if len(batch_data) >= batch_size:
                yield np.asarray(batch_data)
                batch_data = []
        pb.complete(1)
    if batch_data and not drop_last:
        yield np.asarray(batch_data)

In [17]:
# Smoke-test the generator: print the shape of the first batches and stop once
# more than 20 have been seen.
i = 0
for i, x in enumerate(data_generator(stories, sample_len=400), start=1):
    print(x.shape)
    if i > 20:
        break

(4, 400)
 0.32 % [>--------------------------------------------------] 1/308 	 used:0s eta:4 s(4, 400)
 0.65 % [>--------------------------------------------------] 2/308 	 used:0s eta:3 s(4, 400)
(4, 400)
 1.30 % [>--------------------------------------------------] 4/308 	 used:0s eta:3 s(4, 400)
 1.62 % [>--------------------------------------------------] 5/308 	 used:0s eta:2 s(4, 400)
 1.95 % [>--------------------------------------------------] 6/308 	 used:0s eta:2 s(4, 400)
 2.27 % [=>-------------------------------------------------] 7/308 	 used:0s eta:2 s(4, 400)
(4, 400)
(4, 400)
 2.60 % [=>-------------------------------------------------] 8/308 	 used:0s eta:3 s(4, 400)
 2.92 % [=>-------------------------------------------------] 9/308 	 used:0s eta:3 s(4, 400)
(4, 400)
(4, 400)
 3.57 % [=>-------------------------------------------------] 11/308 	 used:0s eta:3 s(4, 400)
(4, 400)
(4, 400)
 3.90 % [=>-------------------------------------------------] 12/308 	 used:0s et

In [None]:
# Fine-tune for 20 epochs, logging the loss of every step to TensorBoard.
writer = SummaryWriter('log/finetune')
n_iter = 0  # global step counter across epochs (also used by the next training cell)
try:
    for epoch in range(20):
        print(f"Epoch {epoch + 1}")
        # `step` counts batches within the epoch. It replaces a variable named
        # `iter`, which shadowed the builtin for the rest of the session.
        for step, x in enumerate(data_generator(stories, sample_len=500, batch_size=2)):
            n_iter += 1
            # Next-token objective: inputs are tokens [0, n-1), targets are tokens [1, n).
            # The result is used as a dict whose "loss" entry has `.numpy()`.
            ret = gpt.train_step(
                (
                    tf.constant(x[:, :-1]),
                    tf.constant(x[:, 1:]),
                )
            )
            writer.add_scalar("train/loss", ret["loss"].numpy(), n_iter)
            if step % 100 == 0:
                # Periodic qualitative check of what the model generates.
                test_basic_logic()
                print("ret", ret)
        print()
finally:
    # Push buffered events to disk even if the loop is interrupted. The writer is
    # left open because the following cell keeps logging to it.
    writer.flush()

In [19]:
# Continue fine-tuning for up to 200 more epochs. Reuses `writer` and `n_iter`
# from the previous training cell, so the logged loss curve carries on.
try:
    for epoch in range(200):
        print(f"Epoch {epoch + 1}")
        # `step` counts batches within the epoch. It replaces a variable named
        # `iter`, which shadowed the builtin for the rest of the session.
        for step, x in enumerate(data_generator(stories, sample_len=500, batch_size=2)):
            n_iter += 1
            # Next-token objective: inputs are tokens [0, n-1), targets are tokens [1, n).
            ret = gpt.train_step(
                (
                    tf.constant(x[:, :-1]),
                    tf.constant(x[:, 1:]),
                )
            )
            writer.add_scalar("train/loss", ret["loss"].numpy(), n_iter)
            if step % 100 == 0:
                # Periodic qualitative check of what the model generates.
                test_basic_logic()
                print("ret", ret)
        print()
finally:
    # This loop is normally stopped by hand (KeyboardInterrupt); flush so the
    # events logged so far are not lost.
    writer.flush()



KeyboardInterrupt: 

# 使用

In [32]:
# Greedy continuation of a story prompt with the fine-tuned model.
# NOTE(review): pure arg-max decoding tends to loop on the same phrase, as the
# saved output of this cell shows; consider sampling if variety matters.
q = '''填好资料后，戴辛夷开始办理操作业务。    你朝门外看了看，那个骗子正在营业厅外等待着。    他不敢进来，因为银行里有监控'''
ids = cbpe.encode(q)
num_new_tokens = 80
for i in range(num_new_tokens):
    output = gpt(tf.constant([ids]))
    # Arg-max at the last position is the next token. int() keeps `ids` a list of
    # plain Python ints so the input tensor has the same dtype on every step.
    nid = int(np.argmax(output[0, -1]))
    ids.append(nid)
    # Single-line progress indicator instead of one output line per token.
    print(f"\r{i + 1}/{num_new_tokens}", end="", flush=True)
print()

print(i,cbpe.decode(ids))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
79 填好 资料 后 , 戴 辛夷 开始 办理 操作 业务 。         你 对 她 说 “ 要 不要 到 我家 , 我们 探讨 一下 生理 问题 ? ” “ 好 , 好 , 我会 考虑 的 。 ” “ 好 , 我会 考虑 的 。 ” “ 好 , 我会 考虑 的 。 ” “ 好 , 我会 考虑 的 。 ” “ 好 , 我会 考虑 的 。 ” “ 好 , 我会 考虑 的 。 
