# Based on Hugging Face Transformers

https://github.com/huggingface/transformers

# Preprocessing

In [1]:
import transformers as ppb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
import torch
import math

Using TensorFlow backend.


In [None]:
# Encoder selection: model class, tokenizer class and checkpoint name.
# BERT (cased) is the active configuration.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-cased'

# To switch to DistilBERT, replace the three assignments above with:
#model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-cased')

# Instantiate the tokenizer first, then the encoder, from the same checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Both splits are tab-separated files without a header row, so columns are
# addressed by position (0 and 1) in the cells that follow.
df = pd.read_csv(
    "../datasets/SENT/train.tsv",
    sep="\t",
    header=None,
)
test_df = pd.read_csv(
    "../datasets/SENT/test.tsv",
    sep="\t",
    header=None,
)

In [None]:
# Column 0 is passed to the tokenizer as text; column 1 is kept as the label.
# max_length=128 caps every encoded sequence (special tokens included).
y = np.array(df[1])
X = df[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True, max_length=128)
)
del df  # raw frame is no longer needed once ids and labels are extracted

y_test = np.array(test_df[1])
X_test = test_df[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True, max_length=128)
)
del test_df

In [None]:
# One-hot encode the training labels: LabelEncoder maps each distinct label to
# an integer index, to_categorical expands the indices into indicator rows.
encoder = LabelEncoder()
encoder.fit(y)
n_classes = len(encoder.classes_)

y = to_categorical(encoder.transform(y), num_classes=n_classes)

# One-hot encode the test labels with the encoder fitted on the training set.
# num_classes is pinned to the training label count: without it to_categorical
# sizes the matrix from the largest index present, so a test split missing the
# highest class would yield fewer columns than the training matrix.
# y_test itself is left as raw labels; the accuracy cell compares against it.
y_oh = to_categorical(encoder.transform(y_test), num_classes=n_classes)

In [None]:
def GetEmbeddings(tokenizedBatch, bert_model=None):
    """Return one sentence embedding per tokenized sequence in the batch.

    Sequences are right-padded with 0 to the longest length in the batch, run
    through the encoder without gradient tracking, and the hidden state at
    position 0 of each sequence is returned (the [CLS] position when the
    sequences were encoded with add_special_tokens=True).

    Args:
        tokenizedBatch: pandas Series (anything exposing ``.values``) whose
            elements are lists of token ids.
        bert_model: optional callable ``f(input_ids, attention_mask=...)``
            whose first output is the last hidden state. Defaults to the
            notebook-level ``model``.

    Returns:
        numpy array with one row per input sequence.

    Raises:
        ValueError: if the batch contains no sequences.
    """
    encoder_model = model if bert_model is None else bert_model

    sequences = list(tokenizedBatch.values)
    if not sequences:
        raise ValueError("GetEmbeddings received an empty batch")

    max_len = max(len(seq) for seq in sequences)

    padded = np.zeros((len(sequences), max_len), dtype=np.int64)
    # The mask is built from the true lengths rather than from `padded != 0`,
    # so a genuine token whose id happens to be 0 is not masked out.
    mask = np.zeros((len(sequences), max_len), dtype=np.int64)
    for row, seq in enumerate(sequences):
        padded[row, :len(seq)] = seq
        mask[row, :len(seq)] = 1

    input_ids = torch.tensor(padded).to(torch.long)
    attention_mask = torch.tensor(mask)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        last_hidden_states = encoder_model(input_ids, attention_mask=attention_mask)

    # First output is indexed as [batch, position, hidden]; keep position 0.
    features = last_hidden_states[0][:, 0, :].numpy()
    return features

In [None]:
BATCH_SIZE = 1000


def _embed_in_batches(tokenized, batch_size, label):
    """Run GetEmbeddings over `tokenized` in consecutive slices.

    Args:
        tokenized: pandas Series of token-id lists.
        batch_size: number of rows per GetEmbeddings call.
        label: text printed before the "<i> of <n>" progress counter.

    Returns:
        List with one embedding array per batch, in input order.
    """
    n_batches = math.ceil(tokenized.shape[0] / batch_size)
    collected = []
    for batch_no in range(1, n_batches + 1):
        print(label, batch_no, "of", n_batches)
        start = (batch_no - 1) * batch_size
        # .iloc makes the positional slicing explicit, independent of the index.
        collected.append(GetEmbeddings(tokenized.iloc[start:start + batch_size]))
    return collected


# Process Training Set Embeddings
all_embeddings = _embed_in_batches(X, BATCH_SIZE, "Generating Embeddings for Batch:")

# Process Test Set Embeddings
all_embeddings_test = _embed_in_batches(X_test, BATCH_SIZE, "Generating Test Embeddings for Batch:")

In [None]:
# Stack the per-batch arrays row-wise (axis 0 is np.concatenate's default).
all_embeddings = np.concatenate(all_embeddings)

In [None]:
# Stack the per-batch test arrays row-wise (axis 0 is np.concatenate's default).
all_embeddings_test = np.concatenate(all_embeddings_test)

In [None]:
# Persist features and one-hot labels so the training section can run without
# recomputing embeddings. Note that y_test.npy stores the one-hot matrix y_oh,
# not the raw y_test labels.
for npy_path, array in (
    ('../binary/bert_embeddings.npy', all_embeddings),
    ('../binary/y.npy', y),
    ('../binary/bert_embeddings_test.npy', all_embeddings_test),
    ('../binary/y_test.npy', y_oh),
):
    np.save(npy_path, array)

# Train Model

In [None]:
import tensorflow as tf
import tensorflow.keras as keras 
from keras.layers import Input, Lambda, Dense
from keras.models import Model
from keras.callbacks import EarlyStopping
import keras.backend as K

In [None]:
# Reload the cached features and one-hot labels written by the preprocessing
# section. The test labels come back as y_oh (one-hot), not as raw y_test.
all_embeddings, y = (
    np.load('../binary/bert_embeddings.npy'),
    np.load('../binary/y.npy'),
)
all_embeddings_test, y_oh = (
    np.load('../binary/bert_embeddings_test.npy'),
    np.load('../binary/y_test.npy'),
)

In [None]:
def build_model(n_hidden=10, units=1000, n_classes=3, l2_weight=0.001, input_dim=768):
    """Build and compile the dense classifier that sits on top of the embeddings.

    The network is a stack of identical L2-regularised ReLU layers followed by
    a softmax output, compiled with categorical cross-entropy and Adam.

    Changes from the earlier version of this function:
      * the output layer is connected to the LAST hidden layer (the tenth layer
        used to be built but never wired in, so only nine were in the model);
      * the output activation is softmax, which matches one-hot targets and
        categorical_crossentropy (sigmoid scores do not sum to 1);
      * weight files saved from the old nine-layer graph will not load into
        the default model; call build_model(n_hidden=9) for those.

    Args:
        n_hidden: number of hidden Dense layers.
        units: width of every hidden layer.
        n_classes: number of output classes.
        l2_weight: L2 penalty applied to each hidden layer's kernel.
        input_dim: size of the input embedding vector.

    Returns:
        A compiled Keras Model mapping (input_dim,) vectors to class scores.
    """
    # Imported here from the same standalone Keras package that provides
    # Input/Dense/Model, rather than the tensorflow.keras alias named `keras`
    # at notebook level, so layers and regularizers come from one library.
    from keras.regularizers import l2

    embedding = Input(shape=(input_dim,), dtype="float")

    hidden = embedding
    for _ in range(n_hidden):
        hidden = Dense(units, activation='relu', kernel_regularizer=l2(l2_weight))(hidden)

    pred = Dense(n_classes, activation='softmax')(hidden)

    model = Model(inputs=[embedding], outputs=pred)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Instantiate and compile the dense classifier defined by build_model().
model_bert = build_model()

In [None]:
# Print the classifier's layer-by-layer summary.
model_bert.summary()

# Change to training set

In [None]:
# Early stopping watches the TRAINING loss ('loss', not 'val_loss') and waits
# 50 epochs without improvement before stopping.
es = EarlyStopping(
    monitor='loss',
    patience=50,
)
cb_list = [es]

In [None]:
# Train the classifier inside an explicit TensorFlow session (tf.Session and
# the initializers below are TensorFlow 1.x APIs).
with tf.Session() as session:
    # Make Keras use this session for all backend operations.
    K.set_session(session)
    # Variables and lookup tables must be initialised in the new session
    # before fit() runs.
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    # validation_split=0.001 holds out 0.1% of the rows for validation; early
    # stopping (cb_list) can end training before the 1000-epoch ceiling.
    history = model_bert.fit(all_embeddings, y, epochs=1000, batch_size=10000, 
                             validation_split = 0.001, callbacks=cb_list)
    # Save while the session holding the trained variable values is still open.
    model_bert.save_weights('../model/bert_logistic/model_bert_weights.h5')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

acc = history.history['acc']
loss = history.history['loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'g', label='Training Acc')
plt.title('Training and validation Acc')
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.legend()

plt.show()

# Predict on Test Data

In [None]:
import math
import time

In [None]:
# Prediction batch size and the number of batches needed to cover every test
# row (the last batch may be partial, hence the ceiling).
bs = 10000
n_test_rows = len(all_embeddings_test)
batches = math.ceil(n_test_rows / bs)

In [None]:
# Display the test embeddings and the raw test labels.
# NOTE(review): y_test is only created by the preprocessing cells; the reload
# cell restores the test labels as y_oh, so this fails in a fresh session that
# starts from the cached .npy files.
all_embeddings_test, y_test

In [None]:
# # Truncate super long reviews
# new_X_test = [x[:2000] for x in X_test]

In [None]:
# Predict the test set in chunks of `bs` rows, collecting both the raw class
# scores and the decoded labels.
all_preds = []
all_probs = []
    
with tf.Session() as session:
    K.set_session(session)
    # Initialise first, then load: load_weights must come after the
    # initialisers, otherwise they would overwrite the restored values.
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    model_bert.load_weights('../model/bert_logistic/model_bert_weights.h5')

    for i in range(1,batches+1):
        print("Predicting Batch",i)
        # Batch i covers rows (i-1)*bs up to i*bs (exclusive).
        new_text_pr = all_embeddings_test[(i-1)*bs:i*bs]
        preds = model_bert.predict(new_text_pr)
        all_probs.append(preds)
        # argmax picks the highest-scoring class index; inverse_transform maps
        # it back to the original label. `encoder` is the LabelEncoder fitted
        # in the preprocessing section and is not restored from disk.
        preds = encoder.inverse_transform(np.argmax(preds,axis=1))
        all_preds.append(preds)

In [None]:
# Join the per-batch outputs into one array each (axis 0 is the default):
# decoded labels in `results`, raw class scores in `results_probs`.
results = np.concatenate(all_preds)
results_probs = np.concatenate(all_probs)

In [None]:
# Write the class scores as tab-separated columns, one row per test sample.
np.savetxt("../output/bert_logistic/test_results.tsv", results_probs, delimiter="\t")

In [None]:
# Write one predicted label per line. fmt="%s" is required because `results`
# holds the original labels: savetxt's default float format ('%.18e') raises
# on string labels and writes integer labels in scientific notation.
np.savetxt("../output/bert_logistic/test_predictions.tsv", results, delimiter="\t", fmt="%s")

In [None]:
# Fraction of test rows whose predicted label equals the raw label. np.mean
# over the boolean match array replaces the element-by-element built-in sum.
print("Accuracy: ", np.mean(results == y_test))

In [None]:
# Display the raw test labels for inspection.
y_test

In [None]:
# Display the element-wise comparison of predicted labels with the raw test labels.
results == y_test