<a href="https://colab.research.google.com/github/alpacaYiChun/ML/blob/master/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import MultiHeadAttention, Dense, Embedding, Dropout, LayerNormalization, Input, GlobalAveragePooling1D, Bidirectional, LSTM, Lambda, Concatenate
from tensorflow.keras.models import Model
import pandas as pd
from keras.callbacks import LearningRateScheduler

In [None]:
# Hyperparameters shared by the cells below.
MAX_LEN = 200  # Every sequence is padded/truncated to this many tokens
NUM_HEAD = 4  # Number of attention heads used by transform()
WORD_EMBED_DIM = 32  # Width of the word embedding
POS_EMBED_DIM = 32  # Width of the position embedding (concatenated to the word embedding)
F1 = 64  # Units of the ReLU feed-forward Dense layer inside transform()
BLOCKS = 2  # Number of transform() blocks stacked by transform_train()
VOCAB_SIZE = 20000  # Only consider the top 20k words

In [None]:
# Load the IMDB reviews as lists of word ids, limited to VOCAB_SIZE words (num_words).
(x_train, y_train), (x_val, y_val) = tf.keras.datasets.imdb.load_data(num_words=VOCAB_SIZE)
print(x_train.shape, "Training sequences")
print(x_val.shape, "Validation sequences")
# NOTE(review): this prints five complete raw reviews; a shorter preview may be enough.
print(x_train[:5])
# Bring every review to exactly MAX_LEN ids, adding the zero padding at the END
# (padding='post'). Inputs built later for prediction should be padded the same way.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=MAX_LEN, padding='post')
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, maxlen=MAX_LEN, padding='post')

(25000,) Training sequences
(25000,) Validation sequences
[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 534

In [None]:
def get_dicts(word_index=None):
  """Build the token->id and id->token lookup tables for the IMDB data.

  Ids 0-3 are reserved for the special tokens, so every entry of the word
  index is shifted by 3 to avoid colliding with them.

  Args:
    word_index: optional mapping word -> rank. When omitted it is fetched with
      tf.keras.datasets.imdb.get_word_index().

  Returns:
    A tuple (wti, itw): wti maps token -> id and itw maps id -> token. The two
    tables are exact inverses of each other.
  """
  if word_index is None:
    word_index = tf.keras.datasets.imdb.get_word_index()

  # One list feeds both tables so the special tokens cannot drift apart
  # (id 1 used to be "<START>" in one table and "<CLS>" in the other).
  special_tokens = ["<PAD>", "<START>", "<UNK>", "<UNUSED>"]
  wti = {token: token_id for token_id, token in enumerate(special_tokens)}
  itw = {token_id: token for token_id, token in enumerate(special_tokens)}

  for word, rank in word_index.items():
    wti[word] = rank + 3
    itw[rank + 3] = word
  return wti, itw

# Module-level lookup tables (token -> id, id -> token) used by the helpers below.
wti, itw = get_dicts()

def vector_to_sentence(vector):
  """Decode a sequence of token ids into a space-separated string using itw."""
  tokens = []
  for token_id in vector:
    tokens.append(itw[token_id])
  return " ".join(tokens)

def sentence_to_vector(str, word_to_id=None, vocab_size=None):
  """Encode a whitespace-separated sentence as a numpy array of token ids.

  Words that are missing from the lookup table, and words whose id is not
  below the vocabulary size, are encoded as <UNK> so the result never raises
  KeyError and never indexes outside the embedding table.

  Args:
    str: the sentence to encode (split on whitespace).
    word_to_id: optional token -> id mapping; defaults to the module-level wti.
    vocab_size: optional exclusive upper bound for ids; defaults to VOCAB_SIZE.

  Returns:
    1-D integer numpy array with one id per word (empty for an empty sentence).
  """
  lookup = wti if word_to_id is None else word_to_id
  limit = VOCAB_SIZE if vocab_size is None else vocab_size
  unk_id = lookup.get("<UNK>", 2)

  ids = []
  for word in str.split():
    word_id = lookup.get(word, unk_id)
    ids.append(word_id if word_id < limit else unk_id)
  # Explicit integer dtype keeps an empty sentence from turning into a float array.
  return np.array(ids, dtype=int)

In [None]:
def lr_decay(epoch):
  """Return the learning rate for `epoch`: 1e-4, halved once per epoch."""
  initial_rate = 0.0001
  return initial_rate * 0.5 ** epoch

# Keras callback object that wraps the lr_decay schedule defined above.
scheduler = LearningRateScheduler(lr_decay)

def lstm(x):
  """Run `x` through a bidirectional LSTM built with 8 units and return its output."""
  return Bidirectional(LSTM(8))(x)

def transform(x):
  """Apply one transformer encoder block to `x` and return the result.

  The block is self-attention followed by a two-layer feed-forward network;
  each half is followed by dropout, a residual addition and layer
  normalization. The feed-forward output is projected back to x.shape[-1].
  """
  width = x.shape[-1]

  # Self-attention half: `x` serves as both query and value.
  attended = MultiHeadAttention(NUM_HEAD, width)(x, x)
  attended = Dropout(0.25)(attended)
  attention_out = LayerNormalization(epsilon=1e-6)(x + attended)

  # Feed-forward half: expand to F1 units, then project back to `width`.
  hidden = Dense(F1, activation='relu')(attention_out)
  projected = Dense(width)(hidden)
  projected = Dropout(0.25)(projected)
  return LayerNormalization(epsilon=1e-6)(attention_out + projected)

def embed(x, vocab, max_len):
  """Embed token ids and their positions, concatenated on the last axis.

  Args:
    x: Keras tensor of token ids; its second dimension must equal `max_len`
      (transform_train passes an Input of shape (MAX_LEN,)).
    vocab: number of rows of the word embedding table.
    max_len: number of positions, i.e. rows of the position embedding table.

  Returns:
    Keras tensor whose last axis holds the WORD_EMBED_DIM word features
    followed by the POS_EMBED_DIM position features.
  """
  word_embed = Embedding(vocab, WORD_EMBED_DIM)(x)

  # Derive the position ids from `x` so that they are symbolic. Calling the
  # Embedding on a plain tf.range constant runs it eagerly at build time, which
  # bakes the randomly initialised table into the model as a constant: it is
  # then neither trained nor listed among the model's weights.
  positions = tf.zeros_like(x, dtype=tf.int32) + tf.range(start=0, limit=max_len, delta=1)
  pos_embed = Embedding(max_len, POS_EMBED_DIM)(positions)

  return Concatenate(axis=-1)([word_embed, pos_embed])

def transform_train():
  """Build, compile and summarise the transformer-based 2-class classifier.

  Returns:
    The compiled Keras model (sparse categorical cross-entropy, Adam, accuracy).
  """
  token_ids = Input((MAX_LEN,))

  encoded = embed(token_ids, VOCAB_SIZE, MAX_LEN)
  for _ in range(BLOCKS):
    encoded = transform(encoded)

  # Average pooling -> dropout -> 2-way softmax, the stack tokens_summary builds.
  probabilities = tokens_summary(encoded)

  model = Model(token_ids, probabilities)
  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()

  return model

def tokens_summary(tokens):
  """Reduce per-token features to a 2-class softmax output.

  The features are average-pooled over the sequence axis, passed through
  dropout (rate 0.25) and fed to a Dense layer with 2 softmax units.
  """
  pooled = GlobalAveragePooling1D()(tokens)
  regularised = Dropout(0.25)(pooled)
  return Dense(2, activation='softmax')(regularised)

def lstm_train(max_len, vocab, word_embed_dim):
  """Build, compile and summarise a bidirectional-LSTM 2-class classifier.

  Args:
    max_len: length of the (padded) input sequences.
    vocab: number of rows of the word embedding table.
    word_embed_dim: embedding width, also used as the LSTM unit count.

  Returns:
    The compiled Keras model.
  """
  token_ids = Input((max_len,), name="input")
  embedded = Embedding(vocab, word_embed_dim, name="embed")(token_ids)
  encoded = Bidirectional(LSTM(word_embed_dim, dropout=0.25, recurrent_dropout=0.25), name="lstm")(embedded)
  output = Dense(2, activation='softmax', name="classify_output")(encoded)

  model = Model(token_ids, output)
  # compile() does not take callbacks (passing them raises a TypeError). To use
  # the learning-rate schedule, pass callbacks=[scheduler] to model.fit() instead.
  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()

  return model


# Build the classifier used by the following cells. Swap which of the two lines
# is commented out to build the LSTM variant instead of the transformer one.
#model = lstm_train(MAX_LEN, VOCAB_SIZE, WORD_EMBED_DIM)
model = transform_train()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 200, 32)      640000      ['input_1[0][0]']                
                                                                                                  
 tf.compat.v1.shape (TFOpLambda  (3,)                0           ['embedding[0][0]']              
 )                                                                                                
                                                                                                  
 tf.__operators__.getitem (Slic  ()                  0           ['tf.compat.v1.shape[0][0]'] 

In [None]:

# Train the classifier for 2 epochs in batches of 32, with (x_val, y_val) as validation data.
model.fit(x_train, y_train, epochs=2, batch_size=32, validation_data=(x_val, y_val))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f99caf23730>

In [None]:
# evaluate() already runs the model in inference mode (dropout disabled), so no
# training flag has to be toggled beforehand; Keras models have no `isTraining` switch.
model.evaluate(x_val, y_val)



[0.3386276960372925, 0.8663600087165833]

In [None]:
# Hand-written probe sentences. Only the one bound to `s` is scored; pick a
# different index to try another sentence.
sentence = "i really was driven to the brink of falling asleep by this book"
candidate_sentences = [
  "this book illustrates how we can make a cake but is unable to talk about how to keep it overall does not solve the problem what a garbage",
  "from the beginning to the end there is virtually no clue what it is going to talk about",
  "i have no idea who will waste time to watch such a movie with no story at all and the words are awfully made up",
  "if you continue to make trouble i will beat you",
  "you are such a horrible man that can make trouble everywhere",
  "one plus one is two",
]
s = candidate_sentences[-1]

v = sentence_to_vector(s)
v = np.expand_dims(v, 0)
# Pad at the end, exactly like the training data (padding='post'); with the
# default front padding the words land on positions the model never saw them at.
v = tf.keras.preprocessing.sequence.pad_sequences(v, maxlen=MAX_LEN, padding='post')
print(v)
result = model.predict(v)
print(result)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  31 935  31
    9 107]]
[[0.34911996 0.65088004]]


In [None]:
def generator(model, freeze_encoder=True):
  """Build a next-token prediction model on top of a trained classifier.

  The classifier is cut just before its classification head and reused as an
  encoder. One more transform() block, average pooling, dropout and a softmax
  over VOCAB_SIZE ids are stacked on top of it.

  Args:
    model: the trained classifier to reuse.
    freeze_encoder: when True (default) the reused encoder is not trained.
      The encoder shares its layers with `model`, so those layers are marked
      non-trainable there as well (this matters only if `model` is compiled
      again later).

  Returns:
    The compiled next-token Keras model.
  """
  token_ids = Input((MAX_LEN,))
  # layers[-4] is the last layer before the pooling / dropout / dense head.
  # NOTE(review): assumes `model` ends with that three-layer head, as the one
  # built by transform_train() does.
  encoder = Model(model.input, model.layers[-4].output)
  if freeze_encoder:
    # Freeze the whole encoder, and do it BEFORE compile(): layers[0] of the new
    # model is only the InputLayer (no weights), and trainable changes made
    # after compile() are not picked up until the model is compiled again.
    encoder.trainable = False
  features = encoder(token_ids)

  features = transform(features)
  features = GlobalAveragePooling1D()(features)
  features = Dropout(0.25)(features)

  next_output = Dense(VOCAB_SIZE, activation='softmax')(features)
  next_model = Model(token_ids, next_output)
  next_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

  # summary() prints by itself and returns None, so it is not wrapped in print().
  next_model.summary()

  return next_model

# Build the next-token model and print the trainable flag of each of its top-level layers.
next_model = generator(model)
for layer in next_model.layers:
  print(layer.trainable)

# Accumulators for the next-token training set; go() appends one example per call.
x_gen_train = []
y_gen_train = []
def go(wlist, context_len=11, max_len=None):
  """Append one (context, next token) training example built from `wlist`.

  The context is the first `context_len` ids of `wlist`, followed by zeros up
  to `max_len`. The target is the id that comes right after the context.

  Args:
    wlist: indexable sequence of token ids, longer than `context_len`.
    context_len: number of leading ids used as context (default 11).
    max_len: length of the stored context vector; defaults to MAX_LEN.

  Raises:
    IndexError: if `wlist` has no element at index `context_len`.
  """
  if max_len is None:
    max_len = MAX_LEN
  # Read the target first so a too-short `wlist` fails before anything is
  # appended and the two lists cannot get out of step.
  target = wlist[context_len]
  # np.zeros already supplies the trailing padding, so only the prefix is copied.
  context = np.zeros((max_len,))
  context[:context_len] = wlist[:context_len]
  x_gen_train.append(context)
  y_gen_train.append(target)

# Turn the first 8000 (already padded) training reviews into next-token examples.
for i in range(8000):
  go(x_train[i])

# Convert the accumulated lists to arrays; after this point the names no longer
# refer to lists, so go() cannot append to them until the cell is run again.
x_gen_train = np.array(x_gen_train)
y_gen_train = np.array(y_gen_train)
print(x_gen_train.shape)
print(y_gen_train.shape)

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 model_7 (Functional)           (None, 200, 64)      789888      ['input_5[0][0]']                
                                                                                                  
 multi_head_attention_5 (MultiH  (None, 200, 64)     66368       ['model_7[0][0]',                
 eadAttention)                                                    'model_7[0][0]']                
                                                                                                  
 dropout_11 (Dropout)           (None, 200, 64)      0           ['multi_head_attention_5[0]

In [None]:
# Train the next-token model on the (context, next id) examples for 15 epochs.
next_model.fit(x_gen_train, y_gen_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fddcd8bb1f0>

In [None]:
def start(str, max_tokens=20, verbose=True):
  """Extend a prompt by sampling ids from next_model until it is long enough.

  Args:
    str: whitespace-separated prompt; encoded with sentence_to_vector.
    max_tokens: sampling stops once the sequence holds this many ids (default 20).
    verbose: when True (default) the current id sequence is printed before
      every prediction.

  Returns:
    List of token ids: the encoded prompt followed by the sampled ids.
  """
  pwd = sentence_to_vector(str).tolist()

  while len(pwd) < max_tokens:
    v = np.expand_dims(pwd, 0)
    if verbose:
      print(v)
    # Pad at the end like the training examples built by go(), which keep the
    # context at the front of the vector and fill the rest with zeros.
    v = tf.keras.preprocessing.sequence.pad_sequences(v, maxlen=MAX_LEN, padding='post')
    next_distribute = next_model.predict(v)[0]
    # Sample from the predicted distribution instead of always taking the argmax.
    next_index = np.random.choice(len(next_distribute), p=next_distribute)
    pwd.append(int(next_index))

  return pwd

# Generate a continuation for the prompt and decode the resulting ids back to words via itw.
got = [itw[i] for i in start('this movie is full of garbage and does not make')]
print(got)


[[  14   20    9  368    7 1244    5  127   24   97]]
[[  14   20    9  368    7 1244    5  127   24   97   47]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259   140]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259   140    78]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259   140    78    15]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259   140    78    15    28]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259   140    78    15    28   823]]
[[   14    20     9   368     7  1244     5   127    24    97    47 16519
    259   140    78    15    28   823    68]]
['this', 'movie', 'is', 'full', 'of', 'garbage', 'and', 'does', 'not', 'make', 'has', 'coccio', '