In [None]:
# !git clone https://github.com/google-research/albert.git

In [None]:
# -*- coding: utf-8 -*-
# https://www.kaggle.com/igetii/bert-keras/notebook?select=train.csv

In [1]:
!nvidia-smi

Thu Jun 18 16:47:54 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 440.48.02    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2080    Off  | 00000000:65:00.0 Off |                  N/A |
| 28%   43C    P0    17W / 245W |      0MiB /  7982MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
import os
import sys
sys.path.append('albert')

import numpy as np
import pandas as pd
import datetime
import sys
import zipfile
import modeling
import optimization
import run_classifier
import tokenization

from tokenization import FullTokenizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.model_selection import train_test_split

import tensorflow_hub as hub
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

In [3]:
# Session whose GPU options have allow_growth enabled.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)

# TF-Hub handle of the ALBERT module and the sequence length used for tokenization.
# (Previously tried BERT handle: https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1)
max_seq_length = 128
albert_path = 'https://tfhub.dev/google/albert_base/1'

In [4]:
# Read the three CSV files, using each file's `id` column as the index.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('files/train.csv', 'files/valid.csv', 'files/test.csv')
)

In [5]:
# Show the first five rows of the training frame.
train_df.head(5)

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Sam has an everlast treat each nite before bed...,dogs
1,The product is as it says. I keep an eye on it...,dogs
2,My Kitty thinks these are treats! He loves the...,dogs
3,This is the third or fourth time that we've or...,dogs
4,Put this on both my dogs. And they are scratch...,dogs


In [6]:
# Fit a single encoder over the labels of both labelled CSVs.
label_encoder = LabelEncoder().fit(
    pd.concat([frame['label'] for frame in (train_df, val_df)])
)

In [7]:
# Pool the texts of the train and validation CSVs; they are re-split below.
# The test CSV is only ever used for prediction (X_pred).
X_train_val = pd.concat([train_df['text'], val_df['text']]).values
X_pred = test_df['text'].values

# The encoder was already fitted on exactly these labels when it was created,
# so only transform here instead of refitting it.
y_train_val = label_encoder.transform(pd.concat([train_df['label'], val_df['label']]))

# 80% train / 20% hold-out, stratified on the label.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=0, stratify=y_train_val
)

# Halve the hold-out into validation and test sets.
# NOTE(review): unlike the split above, this one is not stratified — confirm that is intended.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0
)


In [8]:
# Sizes of the train, validation, test and prediction sets, one per line.
print(*(len(part) for part in (y_train, y_val, y_test, X_pred)), sep='\n')

55528
6941
6941
17353


In [9]:
# Start from the raw training texts (a later cell repeats this whole preparation from X_train).
train_text = X_train

In [10]:
# Keep only the first max_seq_length whitespace-separated words of each text.
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]

In [11]:
# Convert the list to an object array and add a trailing axis, giving one text per row.
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

In [12]:
def _truncate_to_column(texts, max_words):
    """Clip each text to its first `max_words` whitespace-separated words.

    Args:
      texts: iterable of strings.
      max_words: maximum number of words kept per text.

    Returns:
      Object-dtype numpy array of shape (len(texts), 1), one clipped text per row.
    """
    clipped = [' '.join(t.split()[:max_words]) for t in texts]
    return np.array(clipped, dtype=object)[:, np.newaxis]


# The same preparation is applied to every split; labels are passed through unchanged.
train_text = _truncate_to_column(X_train, max_seq_length)
train_label = y_train

val_text = _truncate_to_column(X_val, max_seq_length)
val_label = y_val

test_text = _truncate_to_column(X_test, max_seq_length)
test_label = y_test

# The prediction set has no labels.
pred_text = _truncate_to_column(X_pred, max_seq_length)


In [13]:
import tensorflow as tf
import tensorflow_hub as hub
import os
import re
import numpy as np
from tqdm import tqdm_notebook
#from tensorflow.keras import backend as K
from keras import backend as K
from keras.layers import Layer

class AlbertLayer(Layer):

    '''Keras layer wrapping a TF-Hub ALBERT module.

    The layer takes three inputs, [input_ids, input_mask, segment_ids], and
    returns one of the following, selected by `output_representation`:

    pooled_output: the module's "pooled_output", shape [batch_size, 768].
    sequence_output: the module's per-token "sequence_output", shape [batch_size, max_length, 768].
    mean_pooling: mask-weighted mean of "sequence_output" over the token axis, shape [batch_size, 768].

    When `trainable` is True, only the last `n_fine_tune_layers` module
    variables (after dropping unused "/cls/" and, where applicable, "/pooler/"
    variables) are registered as trainable. Inspect `model.trainable_weights`
    after building the model to see which ones were selected.

    Args:
      n_fine_tune_layers: number of trailing module variables to fine-tune;
        0 (or a negative value) fine-tunes none.
      tf_hub: handle/path of the TF-Hub module.
      output_representation: one of 'pooled_output', 'sequence_output', 'mean_pooling'.
      trainable: whether any module variables are fine-tuned.

    Raises:
      ValueError: if `output_representation` is not one of the supported values.
    '''

    _OUTPUT_REPRESENTATIONS = ('pooled_output', 'sequence_output', 'mean_pooling')

    def __init__(self, n_fine_tune_layers=10, tf_hub = None, output_representation = 'pooled_output', trainable = False, **kwargs):

        print('__init__ is called')

        # Fail early: an unknown value would otherwise only surface inside call().
        if output_representation not in self._OUTPUT_REPRESENTATIONS:
            raise ValueError(
                'output_representation must be one of {}, got {!r}'.format(
                    self._OUTPUT_REPRESENTATIONS, output_representation))

        super(AlbertLayer, self).__init__(**kwargs)

        self.n_fine_tune_layers = n_fine_tune_layers
        self.is_trainble = trainable
        # NOTE(review): hard-coded hidden size; presumably matches albert_base — confirm for other modules.
        self.output_size = 768
        self.tf_hub = tf_hub
        self.output_representation = output_representation
        # Assigned after the base initializer so that the base class's own
        # default for this flag cannot overwrite it.
        self.supports_masking = True

    def build(self, input_shape):
        """Instantiate the hub module and register its variables with Keras."""
        print('build is called')

        self.albert = hub.Module(
            self.tf_hub,
            trainable=self.is_trainble,
            name="{}_module".format(self.name)
        )

        variables = list(self.albert.variable_map.values())
        if self.is_trainble:
            # Drop variables that do not feed the selected output.
            trainable_vars = [var for var in variables if "/cls/" not in var.name]

            if self.output_representation in ("sequence_output", "mean_pooling"):
                # The pooler is unused when working from the token sequence.
                trainable_vars = [var for var in trainable_vars if "/pooler/" not in var.name]

            # Keep only the trailing n variables. A plain [-n:] slice would
            # select *all* of them for n == 0, so handle that case explicitly.
            # NOTE(review): "last n" follows the iteration order of variable_map — confirm it is depth order.
            if self.n_fine_tune_layers > 0:
                trainable_vars = trainable_vars[-self.n_fine_tune_layers:]
            else:
                trainable_vars = []

            for var in trainable_vars:
                self._trainable_weights.append(var)

            # Everything not selected above is tracked as non-trainable.
            for var in self.albert.variables:
                if var not in self._trainable_weights:
                    self._non_trainable_weights.append(var)

        else:
            for var in variables:
                self._non_trainable_weights.append(var)

        super(AlbertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run the module on [input_ids, input_mask, segment_ids] and return the selected representation."""
        print('call is called')
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        albert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.albert(inputs=albert_inputs, signature="tokens", as_dict=True)

        if self.output_representation == "pooled_output":
            return result["pooled_output"]

        if self.output_representation == "sequence_output":
            return result["sequence_output"]

        if self.output_representation == "mean_pooling":
            sequence = result["sequence_output"]
            mask = tf.cast(input_mask, tf.float32)
            # Sum only the unmasked token vectors, then divide by the number of
            # unmasked tokens (epsilon guards an all-zero mask).
            summed = tf.reduce_sum(sequence * tf.expand_dims(mask, axis=-1), axis=1)
            counts = tf.reduce_sum(mask, axis=1, keepdims=True) + 1e-10
            return summed / counts

        # Reachable only if the attribute was changed after construction.
        raise ValueError(
            'Unsupported output_representation: {!r}'.format(self.output_representation))

    def compute_mask(self, inputs, mask=None):
        """Propagate input_mask (as bool) for sequence output; pooled outputs carry no mask."""
        if self.output_representation == 'sequence_output':
            return K.cast(inputs[1], dtype="bool")
        return None

    def compute_output_shape(self, input_shape):
        """Return (batch, seq_len, output_size) for sequence output, else (batch, output_size)."""
        if self.output_representation == "sequence_output":
            return (input_shape[0][0], input_shape[0][1], self.output_size)
        else:
            return (input_shape[0][0], self.output_size)


Using TensorFlow backend.


In [14]:
import keras

In [31]:
def build_model(max_seq_length, tf_hub, n_classes, n_fine_tune, dropout_rate=0.3, learning_rate=0.00008):
    """Build and compile the ALBERT classifier.

    The network is: AlbertLayer (mean pooling, fine-tuning enabled) ->
    Dropout -> Dense(256, sigmoid) -> Dropout -> Dense(64, sigmoid) ->
    Dense(n_classes, softmax). The model summary is printed as a side effect.

    Args:
      max_seq_length: length of each of the three integer input sequences.
      tf_hub: handle/path of the TF-Hub ALBERT module.
      n_classes: number of output classes.
      n_fine_tune: passed to AlbertLayer as `n_fine_tune_layers`.
      dropout_rate: rate used by both Dropout layers.
      learning_rate: Adam learning rate.

    Returns:
      The compiled Keras model (sparse categorical cross-entropy loss).
    """
    in_id = keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    albert_inputs = [in_id, in_mask, in_segment]

    albert_output = AlbertLayer(
        n_fine_tune_layers=n_fine_tune,
        tf_hub=tf_hub,
        output_representation='mean_pooling',
        trainable=True,
    )(albert_inputs)

    # Classification head.
    hidden = keras.layers.Dropout(dropout_rate)(albert_output)
    hidden = keras.layers.Dense(256, activation='sigmoid')(hidden)
    hidden = keras.layers.Dropout(dropout_rate)(hidden)
    hidden = keras.layers.Dense(64, activation='sigmoid')(hidden)
    pred = keras.layers.Dense(n_classes, activation='softmax')(hidden)

    model = keras.models.Model(inputs=albert_inputs, outputs=pred)
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )
    model.summary()

    return model

#     return z_imbd

def initialize_vars(sess):
    """Run the local-variable, global-variable and table initializers in `sess`
    (in that order), then register `sess` as the Keras backend session."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_init_op in initializer_factories:
        sess.run(make_init_op())
    K.set_session(sess)

In [32]:
# Number of trailing ALBERT variables to fine-tune, and the size of the softmax head.
n_fine_tune_layers = 48
n_classes = len(label_encoder.classes_)
print('Num Class : ', n_classes)

model = build_model(
    max_seq_length=max_seq_length,
    tf_hub=albert_path,
    n_classes=n_classes,
    n_fine_tune=n_fine_tune_layers,
)

# Run the initializers in `sess` and hand the session to Keras.
initialize_vars(sess)

Num Class :  6
__init__ is called
build is called
self._spec
call is called
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 128)          0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
albert_layer_2 (AlbertLayer)    (None, 768)          11812272    input_ids[0][0]                  
                                                                 input_masks[0][0]          

In [33]:
# model.trainable_weights

In [34]:
class PaddingInputExample(object):
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """

class InputExample(object):
    """Container for one example of a sequence-classification task."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the example's fields as attributes.

        Args:
          guid: unique id for the example.
          text_a: untokenized text of the first sequence; the only text
            needed for single-sequence tasks.
          text_b: (optional) untokenized text of the second sequence, for
            sequence-pair tasks.
          label: (optional) label of the example; expected for train and dev
            examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module(tf_hub, spm_model_file='albert_base_1/assets/30k-clean.model'):
    """Build a FullTokenizer from the vocab file and casing info of a Hub module.

    Args:
      tf_hub: handle/path of the TF-Hub module exposing a "tokenization_info" signature.
      spm_model_file: path of the SentencePiece model handed to FullTokenizer.
        NOTE(review): the default is a relative path, presumably from the
        extracted albert_base_1 archive — confirm it exists where the notebook runs.

    Returns:
      A FullTokenizer configured with the module's vocab file and lower-casing flag.
    """
    # Build the module in a throwaway graph so that repeated calls do not keep
    # adding copies of the module to the default graph.
    with tf.Graph().as_default():
        albert_module = hub.Module(tf_hub)
        tokenization_info = albert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [
                    tokenization_info["vocab_file"],
                    tokenization_info["do_lower_case"],
                ]
            )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case, spm_model_file=spm_model_file)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one `InputExample` into fixed-length model inputs.

    Args:
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
      example: an `InputExample`, or a `PaddingInputExample` placeholder.
      max_seq_length: length of every returned list.

    Returns:
      (input_ids, input_mask, segment_ids, label). For a padding example all
      three lists are zeros and the label is 0.
    """
    if isinstance(example, PaddingInputExample):
        return [0] * max_seq_length, [0] * max_seq_length, [0] * max_seq_length, 0

    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    pieces = tokenizer.tokenize(example.text_a)
    if len(pieces) > budget:
        pieces = pieces[:budget]

    tokens = ["[CLS]"] + list(pieces) + ["[SEP]"]
    # Single-sequence input: every position belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # 1 marks a real token, 0 marks padding.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to max_seq_length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a collection of `InputExample`s into model-ready numpy arrays.

    Args:
      tokenizer: tokenizer passed through to `convert_single_example`.
      examples: iterable of examples to convert.
      max_seq_length: length every example is truncated/padded to.

    Returns:
      Tuple (input_ids, input_masks, segment_ids, labels) of numpy arrays;
      `labels` is reshaped to a single column.
    """
    # Imported locally: the module-level `tqdm_notebook` alias is deprecated
    # and emits a warning asking for `tqdm.notebook.tqdm` instead.
    from tqdm.notebook import tqdm as notebook_tqdm

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in notebook_tqdm(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels).reshape(-1, 1),
    )

def convert_text_to_examples(texts, labels):
    """Pair each text with its label as an `InputExample`.

    Args:
      texts: iterable whose items are either a string or a sequence of string
        fragments (e.g. one row of an (N, 1) array); fragments are joined
        with single spaces.
      labels: iterable of labels, consumed in lockstep with `texts`.

    Returns:
      List of `InputExample`s with `guid` and `text_b` set to None.
    """
    examples = []
    for text, label in zip(texts, labels):
        # Joining a bare string would insert a space between every character,
        # so only join when the item is a sequence of fragments.
        text_a = text if isinstance(text, str) else " ".join(text)
        examples.append(
            InputExample(guid=None, text_a=text_a, text_b=None, label=label)
        )
    return examples

In [35]:
# !tar -zxvf albert_base_1.tar.gz

In [36]:
# # Instantiate tokenizer
# # tokenizer = create_tokenizer_from_hub_module(albert_path)
# tokenizer = FullTokenizer('albert_base_1/30k-clean.vocab', do_lower_case=True )

In [37]:
# print(tokenizer.tokenize("hi I like cat and dog"))

In [38]:
# Build the tokenizer from the hub module and check it on a sample sentence.
tokenizer = create_tokenizer_from_hub_module(tf_hub=albert_path)
print(tokenizer.tokenize("hi I like cat and dog, nice to meet you"))

self._spec
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:loading sentence piece model


INFO:tensorflow:loading sentence piece model


['▁hi', '▁', 'I', '▁like', '▁cat', '▁and', '▁dog', ',', '▁nice', '▁to', '▁meet', '▁you']


In [39]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module(albert_path)
# tokenizer = FullTokenizer('albert_base_1/30k-clean.vocab', do_lower_case=True )
# print('hi')

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_label)
val_examples = convert_text_to_examples(val_text, val_label)

# Convert to features
(train_input_ids, train_input_masks, train_segment_ids, train_labels 
) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
(val_input_ids, val_input_masks, val_segment_ids, val_labels
) = convert_examples_to_features(tokenizer, val_examples, max_seq_length=max_seq_length)

self._spec
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:loading sentence piece model


INFO:tensorflow:loading sentence piece model
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Converting examples to features', max=55528.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Converting examples to features', max=6941.0, style=Progr…




In [40]:
from keras.callbacks import EarlyStopping

BATCH_SIZE = 32
MONITOR = 'val_sparse_categorical_accuracy'
print('BATCH_SIZE is {}'.format(BATCH_SIZE))

# Stop after one epoch without improvement in the monitored metric and
# restore the weights of the best epoch.
e_stopping = EarlyStopping(
    monitor=MONITOR,
    mode='max',
    patience=1,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [e_stopping]

history = model.fit(
    x=[train_input_ids, train_input_masks, train_segment_ids],
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=10,
    verbose=1,
    callbacks=callbacks,
    validation_data=([val_input_ids, val_input_masks, val_segment_ids], val_labels),
)

BATCH_SIZE is 32


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 55528 samples, validate on 6941 samples
Epoch 1/10
Epoch 2/10
Restoring model weights from the end of the best epoch
Epoch 00002: early stopping


In [41]:
# Featurize the held-out test split and score the trained model on it.
test_examples = convert_text_to_examples(test_text, test_label)

(test_input_ids, test_input_masks, test_segment_ids, test_labels
) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

# Reuse the batch-size constant from training instead of repeating the literal.
model.evaluate([test_input_ids, test_input_masks, test_segment_ids], test_labels, batch_size=BATCH_SIZE)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Converting examples to features', max=6941.0, style=Progr…




[0.5982288722688982, 0.7722229957580566]

In [42]:
# The prediction set has no labels, so zeros are passed as placeholders.
pred_examples = convert_text_to_examples(texts=pred_text, labels=np.zeros(len(pred_text)))

In [43]:
# Tokenize and pad the prediction examples to max_seq_length.
pred_input_ids, pred_input_masks, pred_segment_ids, pred_labels = convert_examples_to_features(
    tokenizer, pred_examples, max_seq_length=max_seq_length
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Converting examples to features', max=17353.0, style=Prog…




In [44]:
# Model outputs for every prediction example (one row per example).
prediction = model.predict(
    x=[pred_input_ids, pred_input_masks, pred_segment_ids],
    verbose=1,
)



In [45]:
# Map each row's highest-scoring column index back to its class name.
preds = label_encoder.classes_[prediction.argmax(axis=1)]

In [46]:
# Peek at the first 30 predicted class names.
preds[:30]

array(['dogs', 'dogs', 'dogs', 'cats', 'cats', 'cats', 'dogs', 'dogs',
       'dogs', 'dogs', 'dogs', 'cats', 'dogs', 'dogs', 'dogs', 'dogs',
       'dogs', 'fish aquatic pets', 'dogs', 'cats', 'dogs', 'cats',
       'cats', 'dogs', 'cats', 'dogs', 'fish aquatic pets', 'cats',
       'cats', 'cats'], dtype=object)