In [None]:
# Notebook-wide configuration constants.
# NOTE(review): not referenced in the cells visible here; presumably the validation split used by preprocessing -- confirm.
VALIDATION_SIZE = 0.3
# Number of positions precomputed by PositionalEmbedding's encoding table.
MAX_TOKENS = 4096
# Conv1D kernel size in AutoEncoder (the stride is half of it).
CHUNK_SIZE = 16 #  Average English sentence length: 15~20 / Chinese sentence: 8~14
# Conv1D filter count in AutoEncoder; also used as d_model when the model is built.
LATENT_SIZE = 300
# Probability that an entry of the randomly sampled part of the attention mask is True.
BB_RANDOM_RATIO = 0.3
# NOTE(review): not referenced in the cells visible here.
BATCH_SIZE = 4
# NOTE(review): not referenced in the cells visible here; masked_accuracy reads the separately
# defined THRESHHOLD (0.1) instead -- confirm which value is intended.
THRESHOLD = 0.05

In [None]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

# import tensorflow_text as tf_text
import pandas as pd

from tqdm import tqdm

This notebook uses the [News Commentary dataset](https://opus.nlpl.eu/News-Commentary.php).

In [None]:
# Colab-only setup: mount Google Drive, then change into the project folder so that the
# relative paths used by later cells (datasets, pickles, weights) resolve.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): this relative path only works when the current directory is /content.
%cd drive/MyDrive/HAN

Mounted at /content/drive
/content/drive/MyDrive/HAN


Run `preprocess.ipynb` first to generate the dataset files that are loaded below.

In [None]:
# Load the saved training and validation datasets (GZIP-compressed tf.data saves).
# Paths are relative to the current working directory.
train_batches = tf.data.Dataset.load('ZH_EN-train_batch-300-split_digits-new', compression = 'GZIP')
val_batches= tf.data.Dataset.load('ZH_EN-val_batch-300-split_digits-new', compression = 'GZIP')

Positional encoding layer as described in the Transformer paper

In [None]:
def positional_encoding(length, depth):
  """Build a sinusoidal position-encoding table.

  Args:
    length: number of positions (rows) to generate.
    depth: encoding width; half of the columns hold sines, the other half cosines.

  Returns:
    A float32 tensor whose first `depth/2` columns are sin(angle) and whose
    remaining columns are cos(angle), one row per position.
  """
  half_depth = depth / 2

  token_index = np.arange(length).reshape(-1, 1)                  # (length, 1)
  exponent = np.arange(half_depth).reshape(1, -1) / half_depth    # (1, depth/2)

  # Each column gets its own frequency: 1 / 10000^(i / (depth/2)).
  angles = token_index * (1 / (10000**exponent))                  # (length, depth/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  """Scales the input by sqrt(d_model) and adds sinusoidal position encodings.

  No embedding lookup is performed; `call` works directly on the tensor it receives.
  `vocab_size` is accepted for signature compatibility but is not used.
  """

  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    # The table covers MAX_TOKENS positions; longer inputs would need a larger table.
    self.pos_encoding = positional_encoding(length=MAX_TOKENS, depth=d_model)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    scaled = x * scale
    # Slice the table to the actual sequence length and broadcast over the batch axis.
    return scaled + self.pos_encoding[tf.newaxis, :seq_len, :]


In [None]:
class AutoEncoder(tf.keras.layers.Layer):
  """Strided 1-D convolution applied to the input sequence.

  Uses LATENT_SIZE filters, a kernel of CHUNK_SIZE steps and a stride of
  CHUNK_SIZE // 2. Keyword arguments are accepted but not used.
  """

  def __init__(self, **kwargs):
    super().__init__()
    stride = int(CHUNK_SIZE // 2)
    self.cnn = tf.keras.layers.Conv1D(
        filters=LATENT_SIZE,
        kernel_size=CHUNK_SIZE,
        strides=stride)

  def call(self, x):
    return self.cnn(x)

In [None]:
class BaseAttention(tf.keras.layers.Layer):
  """Shared building blocks for the attention layers.

  Holds a MultiHeadAttention layer (configured from the keyword arguments),
  a LayerNormalization layer and an Add layer; subclasses define `call`.
  """

  def __init__(self, **kwargs):
    super(BaseAttention, self).__init__()
    # All keyword arguments are forwarded to MultiHeadAttention.
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class CrossAttention(BaseAttention):
  """Attention from `x` (queries) over `context` (keys/values), with residual + layer norm."""

  def call(self, x, context, BB_mask = None):
    """Attend from `x` to `context`.

    Args:
      x: query tensor.
      context: tensor used as both key and value.
      BB_mask: optional mask forwarded as `attention_mask`.

    Returns:
      LayerNorm(x + attention output).
    """
    attended, scores = self.mha(
        query=x,
        key=context,
        value=context,
        attention_mask=BB_mask,
        return_attention_scores=True)

    # Stored so callers can read the scores after the forward pass.
    self.last_attn_scores = scores

    return self.layernorm(self.add([x, attended]))

In [None]:
class GlobalSelfAttention(BaseAttention):
  """Self-attention over `x` with an optional attention mask, followed by residual + layer norm."""

  def call(self, x, BB_mask = None):
    """Return LayerNorm(x + SelfAttention(x)); `BB_mask` is forwarded as `attention_mask`."""
    attended = self.mha(query=x, key=x, value=x, attention_mask=BB_mask)
    residual = self.add([x, attended])
    return self.layernorm(residual)

In [None]:
class CausalSelfAttention(BaseAttention):
  """Self-attention over `x` using a causal mask, followed by residual + layer norm."""

  def call(self, x):
    """Return LayerNorm(x + CausalSelfAttention(x))."""
    attended = self.mha(query=x, key=x, value=x, use_causal_mask=True)
    residual = self.add([x, attended])
    return self.layernorm(residual)

In [None]:
class FeedForward(tf.keras.layers.Layer):
  """Two-layer feed-forward block with dropout, residual connection and layer norm.

  Args:
    d_model: output width of the second Dense layer.
    dff: width of the hidden (ReLU) Dense layer.
    dropout_rate: rate of the Dropout applied after the second Dense layer.
  """

  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    hidden = tf.keras.layers.Dense(dff, activation='relu')
    projection = tf.keras.layers.Dense(d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([hidden, projection, drop])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    """Return LayerNorm(x + seq(x))."""
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)


In [None]:
class HierarchicalAttention(tf.keras.layers.Layer):
  """Combines self-attention over `x` with attention from `x` over a context tensor.

  Both MultiHeadAttention layers are built from the same keyword arguments.
  """

  def __init__(self, **kwargs):
    super().__init__()
    self.input_attn = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.context_attn = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

  def call(self, x, context):
    """Return LayerNorm(x + SelfAttn(x) + Attn(x -> context))."""
    # Attention among the entries of x themselves.
    self_branch = self.input_attn(query=x, key=x, value=x)
    # Attention from x (queries) over the context information (keys/values).
    context_branch = self.context_attn(query=x, key=context, value=context)

    merged = self.add([x, self_branch, context_branch])
    return self.layernorm(merged)

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: masked global self-attention followed by a feed-forward block.

  Args:
    d_model: model width; used as the attention `key_dim` and the feed-forward output width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout rate used by both the attention and the feed-forward sub-layers.
  """

  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate explicitly; previously the feed-forward block silently
    # fell back to its own default regardless of the value passed to this layer.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, BB_mask=None):
    """Apply self-attention (optionally masked by `BB_mask`) and then the feed-forward block."""
    x = self.self_attention(x, BB_mask)
    x = self.ffn(x)
    return x

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Encoder producing two outputs from one input sequence.

  The input is convolved by AutoEncoder into a shorter "context" sequence, which is
  run through `num_layers` EncoderLayers under a sparse attention mask. The
  position-encoded input is then combined with that context by HierarchicalAttention.
  """

  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.auto_encoder = AutoEncoder()

    layer_config = dict(d_model=d_model,
                        num_heads=num_heads,
                        dff=dff,
                        dropout_rate=dropout_rate)
    self.enc_layers = [EncoderLayer(**layer_config) for _ in range(num_layers)]

    self.han_layer = HierarchicalAttention(num_heads = num_heads, key_dim = d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    """Return `(x_out, context_info)` for the input `x`.

    `context_info` is the AutoEncoder output after the encoder layers; `x_out` is the
    position-encoded input after HierarchicalAttention over `context_info`.
    """
    context_info = self.auto_encoder(x)
    x = self.dropout(self.pos_embedding(x))

    batch = tf.shape(context_info)[0]
    n_chunks = tf.shape(context_info)[1]
    inner = tf.subtract(n_chunks, 2)

    # Random attention: each entry of the inner (n-2, n-2) block is True with
    # probability BB_RANDOM_RATIO.
    sparse = tf.random.uniform(shape = [inner, inner]) < BB_RANDOM_RATIO
    # Window attention: force the main diagonal and the first super-diagonal to True.
    sparse = tf.linalg.set_diag(sparse, tf.fill([inner], True), 'set')
    sparse = tf.linalg.set_diag(sparse, tf.fill([tf.subtract(n_chunks, 3)], True), 'set2', 1)
    # Global attention: prepend two all-True columns, then two all-True rows.
    sparse = tf.concat([tf.fill([inner, 2], True), sparse], axis = 1)
    sparse = tf.concat([tf.fill([2, n_chunks], True), sparse], axis = 0)
    # The same mask is shared by every example in the batch.
    BB_mask = tf.repeat([sparse], repeats = batch, axis = 0)

    for layer in self.enc_layers:
      context_info = layer(context_info, BB_mask)

    x = self.han_layer(x, context_info)

    return x, context_info

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: causal self-attention, cross-attention, then feed-forward.

  Args:
    d_model: model width; used as the attention `key_dim` and the feed-forward output width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout rate used by the attention and the feed-forward sub-layers.
  """

  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate explicitly; previously the feed-forward block silently
    # fell back to its own default regardless of the value passed to this layer.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, context, BB_mask=None):
    """Run the three sub-layers in order.

    Args:
      x: decoder-side tensor.
      context: tensor attended to by the cross-attention sub-layer.
      BB_mask: optional mask forwarded to the cross-attention sub-layer.

    Returns:
      The output of the feed-forward block.
    """
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context, BB_mask = BB_mask)

    # Expose the cross-attention scores of the latest forward pass.
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)
    return x

In [None]:
class Decoder(tf.keras.layers.Layer):
  """Decoder stack: positional encoding, `num_layers` DecoderLayers, then HierarchicalAttention.

  The DecoderLayers cross-attend to `word_context` under a randomly sampled sparse
  mask; the final HierarchicalAttention attends to `sent_context`.
  `vocab_size` is only forwarded to PositionalEmbedding.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.han_layer = HierarchicalAttention(num_heads = num_heads, key_dim = d_model)

    # Cross-attention scores of the last decoder layer; set on every call.
    self.last_attn_scores = None

  def call(self, x, word_context, sent_context):
    """Decode `x` against the two encoder outputs and return the resulting tensor."""
    # NOTE(review): no embedding lookup happens in this layer, so `x` is presumably already a
    # (batch, target_len, d_model) tensor of vectors rather than token IDs -- confirm.
    # `word_context` is the context the decoder layers cross-attend to.
    # `sent_context` is the context used by the final HierarchicalAttention.
    x = self.pos_embedding(x) 

    x = self.dropout(x)
    
    # Random attention: inner block of shape (target_len - 2, context_len - 2), each entry
    # True with probability BB_RANDOM_RATIO.
    BB_mask = tf.random.uniform(
          shape = [
              tf.subtract(tf.shape(x)[1], 2), 
              tf.subtract(tf.shape(word_context)[1], 2)
            ]
        ) < BB_RANDOM_RATIO
    # Window attention: force the main diagonal and the first super-diagonal to True.
    # The diagonal lengths depend on which side of the (non-square) inner block is shorter,
    # hence the two branches.
    if(tf.shape(x)[1] >=  tf.shape(word_context)[1]):
      BB_mask = tf.linalg.set_diag(BB_mask, tf.fill([tf.subtract(tf.shape(word_context)[1], 2)], True), 'set')
      BB_mask = tf.linalg.set_diag(BB_mask, tf.fill([tf.subtract(tf.shape(word_context)[1], 3)], True), 'set2', 1)
    else:
      BB_mask = tf.linalg.set_diag(BB_mask, tf.fill([tf.subtract(tf.shape(x)[1], 2)], True), 'set')
      BB_mask = tf.linalg.set_diag(BB_mask, tf.fill([tf.subtract(tf.shape(x)[1], 2)], True), 'set2', 1)
    # Global attention: prepend two all-True columns, then two all-True rows, giving a
    # (target_len, context_len) mask.
    BB_mask = tf.concat([tf.fill([tf.subtract(tf.shape(x)[1], 2), 2], True), BB_mask], axis = 1)
    BB_mask = tf.concat([tf.fill([2, tf.shape(word_context)[1]], True), BB_mask], axis = 0)
    # Share the same mask across every example in the batch.
    BB_mask = tf.repeat([BB_mask], repeats = tf.shape(x)[0], axis = 0)
    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, word_context, BB_mask)
    
    x = self.han_layer(x, sent_context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    return x

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder model wiring an Encoder to a Decoder.

  The model returns the decoder output directly; there is no final projection layer.
  `input_vocab_size` / `target_vocab_size` are forwarded to the encoder / decoder
  as `vocab_size`.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    shared = dict(num_layers=num_layers, d_model=d_model,
                  num_heads=num_heads, dff=dff,
                  dropout_rate=dropout_rate)
    self.encoder = Encoder(vocab_size=input_vocab_size, **shared)
    self.decoder = Decoder(vocab_size=target_vocab_size, **shared)

  def call(self, inputs):
    """Run the model on `inputs`, a pair `(encoder_input, decoder_input)`."""
    source, target = inputs
    word_context, sent_context = self.encoder(source)
    return self.decoder(x = target,
                        word_context = word_context,
                        sent_context = sent_context)

In [None]:
def masked_loss(label, pred):
  """Sum of squared errors over the non-zero entries of `label`.

  Entries where `label == 0` are treated as padding and contribute nothing.

  The previous implementation called `MeanSquaredError()` with its default
  reduction, which already collapses the loss to a scalar; multiplying that
  scalar by the mask only rescaled it by the number of non-zero labels, so
  errors at padded positions still leaked into the loss. Here the squared
  error is kept per element, masked, and only then reduced.

  Args:
    label: target tensor; zeros mark padding.
    pred: prediction tensor with the same shape as `label`.

  Returns:
    A scalar tensor: sum((label - pred)^2) over entries where label != 0.
  """
  label = tf.cast(label, pred.dtype)
  squared_error = tf.square(label - pred)

  mask = tf.cast(label != 0, dtype=squared_error.dtype)
  return tf.reduce_sum(squared_error * mask)
# Absolute tolerance used by masked_accuracy.
# NOTE(review): this is a different name (and value) from THRESHOLD = 0.05 in the config cell.
THRESHHOLD = 0.1

def masked_accuracy(label, pred):
  """Fraction of non-zero label entries predicted within THRESHHOLD (absolute error).

  Entries where `label == 0` are excluded from both the numerator and the denominator.
  """
  label = tf.cast(label, pred.dtype)

  valid = label != 0
  within_tolerance = tf.abs(label - pred) < THRESHHOLD

  hits = tf.cast(within_tolerance & valid, dtype=tf.float32)
  total = tf.cast(valid, dtype=tf.float32)
  return tf.reduce_sum(hits)/tf.reduce_sum(total)

In [None]:
# Hyperparameters for the model instance that receives the saved weights.
# NOTE(review): presumably these must match the configuration used at training time -- confirm.
num_layers = 2
d_model = LATENT_SIZE
dff = 2048
num_heads = 4
dropout_rate = 0.1
# The vocabulary sizes are only passed through to PositionalEmbedding, which does not use them.
reloaded = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=63008,
    target_vocab_size=138655,
    dropout_rate=dropout_rate)

In [None]:
# Restore previously saved weights; the path is relative to the current working directory.
reloaded.load_weights('./bb-zh_en-model_04-12-2023_0949/weights')

In [None]:
import pickle
# Chinese vector -> text table; only its keys (the vectors) are used below, to build the KD-tree.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you generated yourself.
with open('zh_vec_to_text.300.pkl', 'rb') as f:
  zh_vec_to_text  = pickle.load(f)
from scipy.spatial import KDTree
# Nearest-neighbour index over the known Chinese vectors.
zh_tree = KDTree(list(zh_vec_to_text.keys()))
# Dictionary keyed by the formatted string form of a vector (built in zh_devectorize).
with open('zh_str_dict.300-split_digit.pkl', 'rb') as f:
  zh_vec_str_dict = pickle.load(f)

def zh_devectorize(vectors):
  """Convert a sequence of vectors into Chinese text.

  Each vector is snapped to its nearest neighbour in `zh_tree`; that neighbour is
  formatted as a string key and looked up in `zh_vec_str_dict`. The pieces are
  concatenated without a separator.
  """
  pieces = []
  for vector in vectors:
    _, nearest = zh_tree.query(vector)
    matched = zh_tree.data[nearest]
    # Key format: '[' + values with 10 decimals, each followed by ', ', then the
    # trailing space is dropped and ']' appended (the final comma stays).
    key = ('[' + ''.join(f'{val:.10f}, ' for val in matched))[:-1] + ']'
    pieces.append(zh_vec_str_dict[key])
  return ''.join(pieces)

In [None]:
import pickle
# English vector -> text table; only its keys (the vectors) are used below, to build the KD-tree.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you generated yourself.
with open('en_vec_to_text.300-split_digits.pkl', 'rb') as f:
  en_vec_to_text  = pickle.load(f)
from scipy.spatial import KDTree
# Nearest-neighbour index over the known English vectors.
en_tree = KDTree(list(en_vec_to_text.keys()))

# Dictionary keyed by the formatted string form of a vector (built in en_devectorize).
with open('en_str_dict.300-split_digit.pkl', 'rb') as f:
  en_vec_str_dict = pickle.load(f)

def en_devectorize(vectors):
  """Convert a sequence of vectors into English text.

  Each vector is snapped to its nearest neighbour in `en_tree`; that neighbour is
  formatted as a string key and looked up in `en_vec_str_dict`. Every piece is
  followed by a single space, and the result is cut at the first occurrence of
  newline-space-newline.
  """
  words = []
  for vector in vectors:
    _, nearest = en_tree.query(vector)
    matched = en_tree.data[nearest]
    # Key format: '[' + values with 10 decimals, each followed by ', ', then the
    # trailing space is dropped and ']' appended (the final comma stays).
    key = ('[' + ''.join(f'{val:.10f}, ' for val in matched))[:-1] + ']'
    words.append(en_vec_str_dict[key] + ' ')
  return ''.join(words).split('\n \n')[0]

In [None]:
# Grab the first validation batch: the loop body is intentionally empty because only the
# loop variables `zh` and `en` are needed afterwards. The second element of each dataset
# item is discarded.
for (zh, en), _ in val_batches.take(1):
  pass

In [None]:
# Run the model in inference mode on the first source example (a batch axis is added for the
# call and removed again from the output).
# NOTE(review): `START` is not defined in any cell visible here -- it must already exist in the
# kernel (presumably the decoder input). Define it explicitly so Restart & Run All works.
result = reloaded([tf.expand_dims(zh[0], axis = 0), START], training=False).numpy()[0]

In [None]:
# Decode the first source example back into Chinese text.
zh_devectorize(zh[0])

'[START]柏林——2008年爆发的全球金融和经济危机是自大萧条以来最严峻的一次经济压力测试，也是自二战以来社会和政治制度所面临的最严重挑战。它不仅对金融市场和货币构成威胁；而且还暴露了迄今为止都无法完全解决的严重的监管和治理缺陷。\n事实上，2008年危机极有可能被视为一座分水岭，但却并非因为它导致了强化经济弹性和消除经济弱点的改革而永久留在人们的记忆当中。相反，领导人未能汲取大萧条的教训，更不用说为此采取相应的预防对策可能引发未来几十年一系列新的经济和其他危机。\n无论这些危机有多严重，一个世纪后的历史学家都极有可能绝望于我们的短视。他们将会看到，分析人士和监管机构通过强化国家监管机制，仅仅是狭隘地专注于修复金融体系。尽管这一目标并非全无价值，但就像历史学家们所指出的那样，这绝不是唯一一件必须要做的事。\n为使世界能够以确保可持续及平衡增长的方式来应对全球化和技术进步所带来的挑战，就必须对国内和国际两级治理机构和制度进行大规模升级。但目前这方面的投入还远远不够。除欧盟等地区机构外，国际金融治理机构基本仍未受到波及。\n更糟的是，因为部分修复金融体系将会带来进一步全球化，这些举措最终会恶化现有问题，因为此举不仅在金融、而且在其他经济和技术领域增加了对本已欠缺的治理和监管框架的压力。此外，专注于提高回报率的巨额金融投资很有可能会推动技术创新，并由此进一步加大对金融和其他监管体系所造成的压力。\n廉价资金推动的重大技术创新可以令市场变化速度快到政策和机构变化均无法适应。同时新市场的出现可以为早期进入者或投资者带来巨大的回报，并使他们可以持续受益于相对国内及国际监管机构的领先。\n这恰恰符合2008年危机爆发之前的情况。新技术支持的金融工具为某些人赚取巨额利润创造了机遇。但监管机构却无法跟上创新的步伐，并最终酿成了影响整体经济的风险。\n这体现出21世纪的全球危机与20世纪30年代大萧条或过去任何一次股市崩盘之间的根本区别。金融行业持续增长导致更多参与主体从短期监管不足和治理薄弱中获益，从而使人们更加难以预防现在的危机。\n令问题更加复杂的是，受当前危机影响的系统远远超过任何一个监管机构的监管范围。这导致危机变得更加凶险，并导致人们更加难以对危机所产生后果——包括社会和政治领域的长期后果——进行预判。\n下一次危机——因为民族主义情绪抬头和人们越来越无视基于科学和事

In [None]:
# Decode the model output into English text.
en_devectorize(result)

'amsterdam – the global financial and economic crisis that began in 2 0 0 8 was the greatest economic stress - test since the great depression , and the greatest challenge to social and political systems since world war ii . it not only put financial markets and currencies at risk ; it also exposed serious icsr and institutionalization shortcomings that have yet to be fully addressed . koç in fact , the 2 0 0 8 crisis will most likely be remembered as a icsr moment , but not because it led to reforms that strengthened economic resilience and subsequently rmb1 . on the notwithstanding , leaders ’ failure to discern , much less act on , the lessons of the great recession may open the way for a series of fresh crises , economic and unfortunately , in the coming decades . koç however nevertheless those crises turn out to be , historians a century from now will likely despair at our shortsightedness . they will note that commentators and regulators were consequently focused on correcting th

In [None]:
# Decode the English side of the same example, for comparison with the model output above.
en_devectorize(en[0])

'[start] berlin – the global financial and economic crisis that began in 2 0 0 8 was the greatest economic stress - test since the great depression , and the greatest challenge to social and political systems since world war ii . it not only put financial markets and currencies at risk ; it also exposed serious regulatory and governance shortcomings that have yet to be fully addressed . \n in fact , the 2 0 0 8 crisis will most likely be remembered as a watershed moment , but not because it led to reforms that strengthened economic resilience and removed vulnerabilities . on the contrary , leaders ’ failure to discern , much less act on , the lessons of the great recession may open the way for a series of fresh crises , economic and otherwise , in the coming decades . \n however serious those crises turn out to be , historians a century from now will likely despair at our shortsightedness . they will note that analysts and regulators were narrowly focused on fixing the financial system