In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import tensorflow as tf
import glob
from model_gpt import Gpt
from data_helpers import *

In [4]:
tf.test.is_gpu_available()

True

In [5]:
mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
BATCH_SIZE_PER_REPLICA = 8
global_batch_size = (BATCH_SIZE_PER_REPLICA *
                     mirrored_strategy.num_replicas_in_sync)

In [6]:
global_batch_size

16

In [7]:
with mirrored_strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
    optimizer = tf.keras.optimizers.SGD()

In [8]:
dataset = tf.data.Dataset.from_tensors(([1., 2., 3.,], [1., 0.])).repeat(1000).batch(
    global_batch_size)
dist_dataset = mirrored_strategy.experimental_distribute_dataset(dataset)

In [11]:
#@tf.function
def train_step(dist_inputs):
    def step_fn(inputs):
        features, labels = inputs

        with tf.GradientTape() as tape:
            logits = model(features)
            #print(logits)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=labels)
            loss = tf.reduce_sum(cross_entropy) * (1.0 / global_batch_size)

        print(cross_entropy)
        print("\n")
        print(loss)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(list(zip(grads, model.trainable_variables)))
        return cross_entropy

    per_example_losses = mirrored_strategy.experimental_run_v2(
            step_fn, args=(dist_inputs,))
    mean_loss = mirrored_strategy.reduce(
            tf.compat.v2.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
    return mean_loss

In [12]:
with mirrored_strategy.scope():
    for inputs in dist_dataset:
        train_step(inputs)

tf.Tensor(
[0.05954077 0.05954077 0.05954077 0.05954077 0.05954077 0.05954077
 0.05954077 0.05954077], shape=(8,), dtype=float32)


tf.Tensor(0.029770385, shape=(), dtype=float32)
tf.Tensor(
[0.05954077 0.05954077 0.05954077 0.05954077 0.05954077 0.05954077
 0.05954077 0.05954077], shape=(8,), dtype=float32)


tf.Tensor(0.029770385, shape=(), dtype=float32)


ValueError: A non-DistributedValues value 8 cannot be reduced with the given reduce op ReduceOp.SUM.

In [5]:
#Load DataSet
tf_records = "/data/tf_transformer_jd_data/*.tfrecord"
tf_records = glob.glob(tf_records)

In [6]:
dataset = tf_batch_iterator(tf_records, batch_size=8, static_batch=True)

In [7]:
#import numpy as np
#dataset = np.random.randint(500, size=(1000, 200))


In [8]:
model = Gpt(8, 768, 8, 3072, 512, 50000,
                 optimizer="adam", learning_rate=0.001)

model.creat_optimizer()
model.create_checkpoint_manager("../log")
model.create_summary_writer("../log")

Initializing model from scratch.........


(<tensorflow.python.ops.summary_ops_v2.ResourceSummaryWriter at 0x7f9768119128>,
 <tensorflow.python.ops.summary_ops_v2.ResourceSummaryWriter at 0x7f96f805b2b0>)

In [None]:
model.fit(dataset)

1566397335.863161
1.2475545406341553
Step 0 Train_Loss 6.7555 Train_Accuracy 0.0000
0.07060694694519043
0.6756556034088135
0.0006010532379150391
0.5170323848724365
0.000591278076171875
0.39098453521728516
0.0006031990051269531
0.4092681407928467
0.0005300045013427734
0.5804312229156494
0.0007128715515136719
0.5376935005187988
0.0006616115570068359
0.5838406085968018
0.00067901611328125
0.5222241878509521
0.00025582313537597656
0.5532712936401367
0.00027680397033691406
0.5262517929077148
Step 10 Train_Loss 2.9714 Train_Accuracy 0.0742
0.0897216796875
0.532895565032959
0.0007557868957519531
0.4413025379180908
0.0006079673767089844
0.3407862186431885
0.00025844573974609375
0.6690058708190918
0.0005450248718261719
0.5342147350311279
0.0005323886871337891
0.4357268810272217
0.00054931640625
0.6493914127349854
0.0005395412445068359
0.5041649341583252
0.0005397796630859375
0.5119156837463379
0.0008025169372558594
0.4587998390197754
Step 20 Train_Loss 3.4899 Train_Accuracy 0.0493
0.07921195030

In [None]:
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

In [None]:
labels = [[2, 2, 3, 0, 0, 0]]

weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
outputs = [[2, 4, 3, 0, 0, 0]]
padded_labels = tf.cast(labels, tf.int32)

acc = tf.cast(tf.equal(outputs, padded_labels), tf.float32)

nonpad = tf.math.count_nonzero(weights, dtype=tf.dtypes.float32,)

In [None]:
nonpad

In [None]:
weights

In [None]:
acc

In [None]:
m = tf.keras.metrics.Mean()
m.update_state(acc, weights)

In [None]:
acc*weights

In [None]:
accuracy = tf.reduce_sum(tf.cast(acc*weights, tf.float32))/nonpad

In [None]:
accuracy

In [None]:
train_accuracy(labels, [[[1, 4, 6], [1, 5, 6], [1, 0, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0]]])

In [None]:
tf.argmax([[[1, 4, 2], [1, 5, 2], [1, 0, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0]]], axis=2)

In [None]:
def shape_list(x):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

def create_look_ahead_mask(nd, ns):
    mask = 1-tf.linalg.band_part(tf.ones((nd, ns)), -1, 0)
    return mask  # (seq_len, seq_len)

def attention_mask(nd, ns, *, dtype):
    """1's in the lower triangle, counting from the lower right corner.
    Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
    """
    i = tf.range(nd)[:,None]
    j = tf.range(ns)
    m = i >= j - ns + nd
    return tf.cast(m, dtype)

def mask_attn_weights(w):
    # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
    _, _, nd, ns = shape_list(w)
    b = attention_mask(nd, ns, dtype=w.dtype)
    b = tf.reshape(b, [1, 1, nd, ns])
    w = w*b - tf.cast(1e10, w.dtype)*(1-b)
    return w

In [None]:
x = tf.compat.v1.get_variable('x', [1, 2, 4, 4])

In [None]:
mask=mask_attn_weights(x)

In [None]:
mask.shape

In [None]:
import numpy as np

np.array(mask[0][1])

In [None]:
m = attention_mask(4, 4, dtype=tf.float32)

In [None]:
m

In [None]:
create_look_ahead_mask(4, 4)

In [None]:
b = tf.reshape(m, [1, 1, 4, 4])

In [None]:
b.shape

In [None]:
m.shape

In [None]:
x*b

In [None]:
x*b - tf.cast(1e10, x.dtype)*(1-b)

In [None]:
tf.cast(1e10, x.dtype)*(1-b)

In [None]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [None]:
x = tf.compat.v1.get_variable('x', [2, 4])

In [None]:
x = tf.concat([x, tf.zeros([2, 1])], axis=1)

In [None]:
x

In [None]:
lk = create_look_ahead_mask(5, 5)
m = create_padding_mask(x)

In [None]:
lk

In [None]:
m

In [None]:
tf.maximum(m, lk)

In [None]:
x

In [None]:
tf.split(x, 2, axis=0)

In [None]:
def expand_tile(value, size):
    """Add a new axis of given size."""
    value = tf.convert_to_tensor(value, name='value')
    ndims = value.shape.ndims
    return tf.tile(tf.expand_dims(value, axis=0), [size] + [1]*ndims)

def positions_for(tokens, past_length):
    batch_size = tf.shape(tokens)[0]
    nsteps = tf.shape(tokens)[1]
    return expand_tile(past_length + tf.range(nsteps), batch_size)

In [None]:
x = tf.compat.v1.get_variable('x', [1, 5])

In [None]:
positions_for(x, 0)

In [None]:
tf.cast(tf.not_equal([[1, 1, 0], [1, 0, 0]], 0), tf.int32)

In [None]:
batch_size = 1
batch_seq = 1
start = 2

In [None]:
tf.reshape(tf.tile(tf.range(start, batch_seq + start), [batch_size]),
                                       [batch_size, batch_seq])