In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report

In [2]:
# Sanity check: list the GPUs TensorFlow can see before doing any heavy work.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [19]:
"""
The Corpus of Linguistic Acceptability consists of English acceptability judgments 
drawn from books and journal articles on linguistic theory. 
Each example is a sequence of words 
annotated with whether it is a grammatical English sentence.
"""
# Fetch the CoLA configuration of the GLUE benchmark; the result maps split
# names to datasets and is displayed as this cell's output.
raw_datasets = load_dataset(path='glue', name='cola')
raw_datasets

Downloading and preparing dataset glue/cola (download: 368.14 KiB, generated: 596.73 KiB, post-processed: Unknown size, total: 964.86 KiB) to C:\Users\49397\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=376971.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset glue downloaded and prepared to C:\Users\49397\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [20]:
# Peek at the first training record to see which fields each example carries.
train_split = raw_datasets['train']
train_split[0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [22]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_dataset(dataset):
    """Tokenize the 'sentence' column of one dataset split.

    The tokenizer is called on the whole column at once with padding and
    truncation enabled and NumPy output requested.

    Returns:
        The `.data` mapping of the tokenizer's batch output (array per
        feature name, e.g. 'input_ids' and 'attention_mask').
    """
    sentences = dataset['sentence']
    batch = tokenizer(
        sentences,
        return_tensors='np',
        truncation=True,
        padding=True,
    )
    return batch.data


# Tokenize every available split once, keyed by the split's name.
tokenized_datasets = {
    split_name: tokenize_dataset(split_data)
    for split_name, split_data in raw_datasets.items()
}
tokenized_datasets

{'train': {'input_ids': array([[ 101, 2256, 2814, ...,    0,    0,    0],
         [ 101, 2028, 2062, ...,    0,    0,    0],
         [ 101, 2028, 2062, ...,    0,    0,    0],
         ...,
         [ 101, 2009, 2003, ...,    0,    0,    0],
         [ 101, 1045, 2018, ...,    0,    0,    0],
         [ 101, 2054, 2035, ...,    0,    0,    0]]),
  'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]]),
  'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]])},
 'validation': {'input_ids': array([[  101,  1996, 11279, ...,     0,     0,     0],
         [  101,  1996, 15871, ...,     0,     0,     0],
         [  101,  1996,  6228,

In [23]:
batch_size = 8  # if batch size is too big, it will cause OOM on GPU
num_epochs = 5

# Keras `fit` also runs the last, smaller batch of each epoch, so the number of
# optimizer steps per epoch is ceil(n / batch_size). Flooring here would make
# the schedule reach its end learning rate a few steps before training stops.
num_train_examples = len(tokenized_datasets['train']['input_ids'])
steps_per_epoch = -(-num_train_examples // batch_size)  # ceiling division
num_train_steps = steps_per_epoch * num_epochs

# Decay the learning rate from 5e-5 down to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,  # transformer models benefit from a much lower learning rate than the default for Adam
    end_learning_rate=0.,
    decay_steps=num_train_steps,
)
optimizer = Adam(learning_rate=lr_scheduler)

In [24]:
# The model is compiled with a loss configured for raw logits (from_logits=True)
# and integer class labels (sparse cross-entropy).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Load the pretrained checkpoint with a two-class classification head.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Only let TensorFlow log messages at ERROR level or above from here on.
tf_logging = tf.compat.v1.logging
tf_logging.set_verbosity(tf_logging.ERROR)

In [26]:
# Labels are taken from the raw splits and passed alongside the tokenized inputs.
train_labels = np.array(raw_datasets['train']['label'])
val_labels = np.array(raw_datasets['validation']['label'])

model.fit(
    x=tokenized_datasets['train'],
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(tokenized_datasets['validation'], val_labels),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1d35e2edfd0>

In [40]:
# Run the model over the validation inputs and keep only the 'logits' entry.
val_outputs = model.predict(tokenized_datasets['validation'])
preds = val_outputs['logits']

In [58]:
# Index 0 / 1 of target_names line up with label ids 0 / 1.
label_names = ['nongrammatical', 'grammatical']

y_true = raw_datasets['validation']['label']
y_pred = np.argmax(preds, axis=1)  # predicted class = highest-scoring logit per row

report = classification_report(y_true, y_pred, target_names=label_names)
print(report)

                precision    recall  f1-score   support

nongrammatical       0.78      0.57      0.66       322
   grammatical       0.83      0.93      0.87       721

      accuracy                           0.82      1043
     macro avg       0.80      0.75      0.77      1043
  weighted avg       0.81      0.82      0.81      1043

