In [1]:
# Widen the notebook's main container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
# NOTE(review): the `.container` selector presumably targets the classic
# Notebook UI -- confirm it still has an effect in the front end being used.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [8]:
!pip -q install emoji pandas transformers[sentencepiece] tensorflow_hub

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import TFAutoModel
from transformers import BertweetTokenizer
import tensorflow_hub as hub

# BERTweet Embeddings
### BERTweet: A pre-trained language model for English Tweets
Source: https://arxiv.org/abs/2005.10200

In [17]:
# Load the pretrained BERTweet tokenizer (with its tweet normalization enabled)
# and the matching TensorFlow model from the "vinai/bertweet-base" checkpoint.
bertweet_tokenizer = BertweetTokenizer.from_pretrained(
    "vinai/bertweet-base",
    normalization=True,
)
bertweet_model = TFAutoModel.from_pretrained("vinai/bertweet-base")

# The model is only used to produce embeddings here, so mark it non-trainable.
bertweet_model.trainable = False

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [2]:
# Read the Kaggle train and test splits; the embedding cells further down
# read the 'text' column of both frames.
train, test = (
    pd.read_csv('../data/external/kaggle/train.csv'),
    pd.read_csv('../data/external/kaggle/test.csv'),
)

In [12]:
# Show how the BERTweet tokenizer handles emoji.
# This uses `bertweet_tokenizer`: the bare name `tokenizer` is not defined
# anywhere in the visible notebook, so the original cell raises NameError on a
# fresh kernel (Restart & Run All).
bertweet_tokenizer.tokenize('😃😊🥺😉😍😘🥰🇫🇷')

[':grinning_face_with_big_eyes:',
 ':smiling_face_with_smiling_eyes:',
 ':pleading_face:',
 ':winking_face:',
 ':smiling_face_with_heart-eyes:',
 ':face_blowing_a_kiss:',
 ':smiling_face_with_3_hearts:',
 ':regional_indicator_symbol_letter_f:',
 ':regional_indicator_symbol_letter_r:']

In [4]:
def huggingface_embedder(df, col, tokenizer, model, batch_size=512):
    """Embed a text column with a Hugging Face tokenizer/model pair.

    Every value of ``df[col]`` is tokenized in a single call (padded and
    truncated, TensorFlow tensors, no token-type ids). The resulting ids and
    attention masks are then fed to ``model`` in chunks of ``batch_size``.

    Args:
        df: Object indexable by ``col`` whose result exposes ``.values``
            (a pandas DataFrame in this notebook).
        col: Name of the column holding the texts.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        model: Callable accepting ``[input_ids, attention_mask]`` and returning
            an object that exposes a ``pooler_output`` tensor.
        batch_size: Number of rows sent through the model per forward pass.

    Returns:
        The per-batch ``pooler_output`` arrays stacked row-wise with
        ``np.vstack``.
    """
    encoded = tokenizer(
        list(df[col].values),
        padding=True,
        truncation=True,
        return_tensors='tf',
        return_token_type_ids=False,
    )

    # Slice ids and masks together so each batch keeps them aligned.
    batches = tf.data.Dataset.from_tensor_slices(
        (encoded['input_ids'], encoded['attention_mask'])
    ).batch(batch_size)

    pooled_chunks = [
        model([ids, mask]).pooler_output.numpy()
        for ids, mask in batches
    ]
    return np.vstack(pooled_chunks)

def tf_hub_embedder(df, col, model, batch_size=512):
    """Embed a text column with a callable TF-Hub style model.

    Args:
        df: Object indexable by ``col`` whose result exposes ``.values``
            (a pandas DataFrame in this notebook).
        col: Name of the column holding the texts.
        model: Callable applied to each batch; its result must expose
            ``.numpy()``.
        batch_size: Number of rows passed to ``model`` per call.

    Returns:
        The per-batch outputs stacked row-wise with ``np.vstack``.
    """
    batches = tf.data.Dataset.from_tensor_slices(df[col].values).batch(batch_size)
    return np.vstack([model(chunk).numpy() for chunk in batches])

In [18]:
%%time
train_bertweet_embeddings = huggingface_embedder(train, 'text', bertweet_tokenizer, bertweet_model)
test_bertweet_embeddings = huggingface_embedder(test, 'text', bertweet_tokenizer, bertweet_model)

CPU times: user 16.1 s, sys: 3.85 s, total: 20 s
Wall time: 17.2 s


# Universal Sentence Encoder
### Encoder of greater-than-word length text trained on a variety of data. (https://tfhub.dev/google/universal-sentence-encoder/4)
Source: https://arxiv.org/abs/1803.11175

![](https://www.gstatic.com/aihub/tfhub/universal-sentence-encoder/example-similarity.png)

In [3]:
# Load Universal Sentence Encoder v4 from TF Hub.
universal_sentence_encoder = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 180.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 350.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 520.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 700.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 880.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


In [7]:
%%time
train_use4_embeddings = tf_hub_embedder(train, 'text', universal_sentence_encoder)
test_use4_embeddings = tf_hub_embedder(test, 'text', universal_sentence_encoder)

CPU times: user 2.1 s, sys: 338 ms, total: 2.44 s
Wall time: 1.7 s


In [11]:
# Persist the Universal Sentence Encoder features as .npy files.
# `np.save` opens the target path itself; the names already end in `.npy`,
# so no extension is appended.
np.save('../data/features/train_use4_embeddings.npy', train_use4_embeddings)
np.save('../data/features/test_use4_embeddings.npy', test_use4_embeddings)

# nnlm-en-dim128-with-normalization
### Text embedding based on feed-forward Neural-Net Language Models with pre-built OOV trained on English Google News 200B corpus.


### Maps from text to 128-dimensional embedding vectors. (https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2)
Source: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [3]:
# Load the 128-dimensional NNLM English text embedding (with normalization) from TF Hub.
nnlm_en_128_norm = hub.load(
    "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2'.
INFO:absl:Downloading https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2: 180.00MB
INFO:absl:Downloading https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2: 360.00MB
INFO:absl:Downloaded https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2, Total size: 483.55MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2'.


In [5]:
%%time
train_nnlm_en_128_norm_embeddings = tf_hub_embedder(train, 'text', nnlm_en_128_norm)
test_nnlm_en_128_norm_embeddings = tf_hub_embedder(test, 'text', nnlm_en_128_norm)

CPU times: user 394 ms, sys: 62.4 ms, total: 457 ms
Wall time: 342 ms


In [9]:
# Persist the NNLM features as .npy files.
# `np.save` opens the target path itself; the names already end in `.npy`,
# so no extension is appended.
np.save(
    '../data/features/train_nnlm_en_128_norm_embeddings.npy',
    train_nnlm_en_128_norm_embeddings,
)
np.save(
    '../data/features/test_nnlm_en_128_norm_embeddings.npy',
    test_nnlm_en_128_norm_embeddings,
)