In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import nltk
import re
import os
from sklearn import preprocessing
from keras import backend as K
from sklearn.metrics import accuracy_score
from transformers import TFXLNetModel, XLNetTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/ayoub/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
PATH = os.getcwd()
DATA_PATH = PATH + '/data/'
MODEL_PATH = PATH + '/models/'

In [3]:
def xlnet_model():

    word_inputs = tf.keras.Input(shape=(128,), name='word_inputs', dtype='int32')
    xlnet = TFXLNetModel.from_pretrained('xlnet-base-cased')
    xlnet_encodings = xlnet(word_inputs)[0]
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)
    outputs = tf.keras.layers.Dense(10, activation='softmax', name='outputs')(doc_encoding)
    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    return model

In [4]:
def tokenize(text, tokenizer, max_len=128):
    inps = [tokenizer.encode_plus(t, max_length=max_len, truncation=True, padding='max_length', add_special_tokens=True) for t in text]
    inp_tok = np.array([a['input_ids'] for a in inps])
    ids = np.array([a['attention_mask'] for a in inps])
    segments = np.array([a['token_type_ids'] for a in inps])
    return inp_tok, ids, segments

In [5]:
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [6]:
xlnet = xlnet_model()

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


In [7]:
df_train = pd.read_csv(DATA_PATH+'train.csv')
df_train['text'] = df_train['text'].str.replace(r'[^\w\s]+', '')
df_train['text'] = df_train['text'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop]))

df_test = pd.read_csv(DATA_PATH+'test.csv')
df_test['text'] = df_test['text'].str.replace(r'[^\w\s]+', '')
df_test['text'] = df_test['text'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop]))

  df_test['text'] = df_test['text'].str.replace(r'[^\w\s]+', '')


In [None]:
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(df_train['labels'])
y_test = le.fit_transform(df_test['labels'])

y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)
np.save(DATA_PATH+'encoder.npy', le.classes_)

In [9]:
X_train = df_train['text']
X_test = df_test['text']

In [None]:
inp_tok, ids, segments = tokenize(X_train, xlnet_tokenizer)

In [None]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=4, min_delta=0.02, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=1e-6, patience=2, verbose=0, mode='auto', min_delta=0.001, cooldown=0, min_lr=1e-6)
]

In [None]:
xlnet.fit(x=inp_tok, y=y_train, epochs=2, batch_size=8, validation_split=.1, callbacks=callbacks)

In [None]:
xlnet.save_weights(MODEL_PATH+"xlnet.h5")

In [None]:
K.set_value(xlnet.optimizer.learning_rate, 1e-5)
xlnet.fit(x=inp_tok, y=y_train, epochs=1, batch_size=32, validation_split=.1, callbacks=callbacks)

In [None]:
xlnet.save_weights(MODEL_PATH+"xlnet_finetuned.h5")

In [None]:
K.set_value(xlnet.optimizer.learning_rate, 1e-6)
xlnet.fit(x=inp_tok, y=y_train, epochs=1, batch_size=32, validation_split=.1, callbacks=callbacks)

In [None]:
xlnet.save_weights(MODEL_PATH+"xlnet_finetuned.h5")

In [None]:
K.set_value(xlnet.optimizer.learning_rate, 5e-7)
xlnet.fit(x=inp_tok, y=y_train, epochs=1, batch_size=32, validation_split=.1, callbacks=callbacks)

In [None]:
K.set_value(xlnet.optimizer.learning_rate, 1e-7)
xlnet.fit(x=inp_tok, y=y_train, epochs=2, batch_size=32, validation_split=.1, callbacks=callbacks)

In [None]:
xlnet.save_weights(MODEL_PATH+"xlnet_finetuned.h5")

In [10]:
xlnet.load_weights(MODEL_PATH+"xlnet_finetuned.h5")

In [11]:
inp_tok, ids, segments = tokenize(X_test, xlnet_tokenizer)

In [12]:
encoder = preprocessing.LabelEncoder()
encoder.classes_ = np.load(MODEL_PATH+'encoder.npy', allow_pickle=True)
y_test = encoder.transform(df_test['labels'])

In [33]:
preds = xlnet.predict(inp_tok, verbose=True)

2021-12-06 13:37:17.999028: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 160/4295 [>.............................] - ETA: 3:23:56

KeyboardInterrupt: 

In [21]:
preds = np.argmax(preds,axis=1)

In [22]:
accuracy = accuracy_score(y_test, preds)
print(accuracy)

[6 0 7 7 6 1 7 0 7 8 7 9 5 2 6 6 6 7 0 4 7 6 7 8 7 7 0 2 0 0 7 7 5 7 7 6 7
 5 0 1 2 5 8 7 0 1 1 0 7 6]
