In [1]:
 # @title
!pip install datasets
!pip install evaluate
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m

In [2]:
from transformers import BertTokenizer, TFBertModel
from google.colab import drive
from datasets import load_from_disk

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

In [3]:
# Mount Google Drive at /content/gdrive. Every dataset and model path used in
# this notebook lives under /content/gdrive/MyDrive, so this must run first.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Stage 1 data: load the review dataset previously saved to Drive and print a
# summary of its splits and columns as a sanity check.
review_dataset = load_from_disk('/content/gdrive/MyDrive/arrow-review')
print(review_dataset)

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 30000
    })
    validate: Dataset({
        features: ['review', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 10000
    })
})


In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that is fine-tuned in
# the training cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'review' column of a batch of examples.

    Sequences are padded with padding='max_length' and truncated when too
    long, so every encoded example has the same length.

    Args:
        examples: a batch from the dataset; only its 'review' entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    review_texts = examples['review']
    return tokenizer(review_texts, truncation=True, padding='max_length')


# batched=True passes many rows to tokenize_function per call.
tokenized_datasets = review_dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30000
    })
    validate: Dataset({
        features: ['review', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['review', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})


In [None]:
# Stage 1 (intermediate task): fine-tune the stock BERT checkpoint as a
# two-class classifier on the tokenized review data.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
training_args = TrainingArguments(
    output_dir='test_trainer',
    evaluation_strategy='epoch',
)

metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        Whatever the loaded 'accuracy' metric returns for the argmax
        predictions against the labels.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)


# NOTE(review): per-epoch evaluation runs on the 'test' split; the 'validate'
# split that ships with the dataset is not used anywhere in this notebook.
# Confirm that this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2958,0.220292,0.9239
2,0.1839,0.282344,0.9376
3,0.0663,0.332125,0.9385


TrainOutput(global_step=11250, training_loss=0.19913638017442492, metrics={'train_runtime': 9555.8494, 'train_samples_per_second': 9.418, 'train_steps_per_second': 1.177, 'total_flos': 2.36799949824e+16, 'train_loss': 0.19913638017442492, 'epoch': 3.0})

In [None]:
# Persist the model trained on the review (intermediate) task to Drive; the
# sarcasm training cell reloads it from this exact path.
trainer.save_model('/content/gdrive/MyDrive/Intermediate Model/orig_size')

In [4]:
# THIS PART IS FOR AFTER TRAINING ON THE INTERMEDIATE TASK.
sarcasm_dataset = load_from_disk('/content/gdrive/MyDrive/arrow-sarcasm-small-subset')
print(sarcasm_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'comment'],
        num_rows: 30000
    })
    validate: Dataset({
        features: ['label', 'comment'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['label', 'comment'],
        num_rows: 10000
    })
})


In [5]:
# Re-create the 'bert-base-uncased' tokenizer so this stage can run without
# the stage-1 cells having been executed in the same session.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'comment' column of a batch of sarcasm examples.

    Uses padding='max_length' together with truncation, so all encoded
    examples come out at the same length.

    Args:
        examples: a batch from the dataset; only its 'comment' entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    comment_texts = examples['comment']
    return tokenizer(comment_texts, truncation=True, padding='max_length')


# Overwrites any earlier `tokenized_datasets`; from here on it holds the
# sarcasm data. batched=True passes many rows per call.
tokenized_datasets = sarcasm_dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'comment', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30000
    })
    validate: Dataset({
        features: ['label', 'comment', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['label', 'comment', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})


In [None]:
# Stage 2: fine-tune on the sarcasm data, starting from the model saved after
# the intermediate (review) task.
# Alternative starting point, presumably kept for comparison (stock
# checkpoint, no intermediate task):
# model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
model = AutoModelForSequenceClassification.from_pretrained(
    '/content/gdrive/MyDrive/Intermediate Model/orig_size',
    num_labels = 2,
)

# The batch size is a TrainingArguments field. Trainer() has no
# per_device_train_batch_size parameter and raises TypeError if it is passed
# there, so it is configured here instead.
training_args = TrainingArguments(
    output_dir = 'test_trainer',
    evaluation_strategy = 'epoch',
    per_device_train_batch_size = 16,
)

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: a pair that unpacks into (logits, labels).

  Returns:
    The result of the loaded 'accuracy' metric for the argmax predictions
    against the labels.
  """
  logits, labels = eval_pred
  # The predicted class is the index of the largest logit on the last axis.
  predictions = np.argmax(logits, axis = -1)
  return metric.compute(predictions = predictions, references = labels)

# NOTE(review): per-epoch evaluation runs on the 'test' split; the 'validate'
# split that ships with the dataset is not used anywhere in this notebook.
# Confirm that this is intended.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['test'],
    compute_metrics = compute_metrics,
)

trainer.train()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


In [None]:
# Save the model held by the current trainer (the sarcasm-stage trainer when
# the notebook is run top to bottom).
# NOTE(review): the destination is the Drive root itself rather than a
# sub-folder, so the model files land directly in MyDrive. Confirm whether a
# dedicated directory was intended.
trainer.save_model('/content/gdrive/MyDrive')

In [None]:
# Final evaluation pass. Called without arguments, this scores the
# eval_dataset the Trainer was constructed with, which in this notebook is
# tokenized_datasets['test'].
trainer.evaluate()

{'eval_loss': 1.2100824117660522,
 'eval_accuracy': 0.7182,
 'eval_runtime': 331.1956,
 'eval_samples_per_second': 30.194,
 'eval_steps_per_second': 3.774,
 'epoch': 3.0}