In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd gdrive/MyDrive/colab_projects/nlp/imdb/nbs/

/content/gdrive/MyDrive/colab_projects/nlp/imdb/nbs


In [3]:
!pip install transformers



In [4]:
!pip install sentencepiece



In [5]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import tensorflow as tf
from transformers import XLNetTokenizer, XLNetConfig, TFXLNetModel

from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [6]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999

In [7]:
df = pd.read_csv("../data/imdb.csv")
df_train = df[:25000]
df_test = df[25000:]

labels_index = {'positive':1, 'negative':0}

In [8]:
model_name = 'xlnet-base-cased'
max_length = 500
config = XLNetConfig.from_pretrained(model_name)
config.output_hidden_states = False
tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path=model_name, config=config)
transformer_model = TFXLNetModel.from_pretrained(model_name, config=config)

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


In [9]:
transformer_model.summary()

Model: "tfxl_net_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFXLNetMainLaye multiple                  116718336 
Total params: 116,718,336
Trainable params: 116,718,336
Non-trainable params: 0
_________________________________________________________________


In [10]:
len(transformer_model.layers)

1

In [11]:
xlnet = transformer_model.layers[0]

In [12]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Flatten
from keras.models import Model

In [13]:
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
xlnet_model = xlnet(inputs)[0]
dense_inter = Flatten()(xlnet_model)
logits = Dense(2)(dense_inter)
model = Model(inputs=inputs, outputs=logits)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 500)]             0         
_________________________________________________________________
transformer (TFXLNetMainLaye TFXLNetModelOutput(last_h 116718336 
_________________________________________________________________
flatten (Flatten)            (None, 384000)            0         
_________________________________________________________________
dense (Dense)                (None, 2)                 768002    
Total params: 117,486,338
Trainable params: 117,486,338
Non-trainable params: 0
_________________________________________________________________


In [14]:
optimizer = Adam(learning_rate=1e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)
loss = CategoricalCrossentropy(from_logits=True)
metric = 'accuracy'
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [15]:
y_train = to_categorical(df_train['sentiment'].map(labels_index))

In [16]:
y_test = to_categorical(df_test['sentiment'].map(labels_index))

In [17]:
tokenizer.pad_token='[PAD]'

In [18]:
X_train = tokenizer(text=df_train['review'].to_list(),
                    add_special_tokens=True,
                    max_length=max_length,
                    truncation=True,
                    padding=True, 
                    return_tensors='tf',
                    return_token_type_ids = False,
                    return_attention_mask = False,
                    verbose = True)

In [19]:
X_test = tokenizer(text=df_test['review'].to_list(),
                    add_special_tokens=True,
                    max_length=max_length,
                    truncation=True,
                    padding=True, 
                    return_tensors='tf',
                    return_token_type_ids = False,
                    return_attention_mask = False,
                    verbose = True)

In [20]:
X_train

{'input_ids': <tf.Tensor: shape=(25000, 500), dtype=int32, numpy=
array([[   0,    0,    0, ...,    9,    4,    3],
       [   0,    0,    0, ...,    9,    4,    3],
       [   0,    0,    0, ...,    9,    4,    3],
       ...,
       [5039, 1444, 2844, ...,   33,    4,    3],
       [   0,    0,    0, ...,    9,    4,    3],
       [   0,    0,    0, ...,    9,    4,    3]], dtype=int32)>}

In [21]:
y_train

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [22]:
model.fit(X_train["input_ids"], y_train, batch_size=4, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f470808af90>

In [24]:
score, acc = model.evaluate(X_test["input_ids"], y_test)
print('Test accuracy:', acc)

Test accuracy: 0.9370399713516235
