In [None]:
!pip install --upgrade datasets fsspec transformers

In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [12]:
dataset = load_dataset("imdb")
trainingDataset = dataset["train"].select(range(1000))
testingDataset = dataset["test"].select(range(500))

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [18]:
trainingDataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [9]:
def tokenizeText(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

In [10]:
def preprocess(ds):
  ds = ds.map(tokenizeText, batched = True, remove_columns = ["text"]) # Removing raw text
  ds = ds.rename_column("label", "labels")
  ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
  return ds

In [None]:
trainingDataset = preprocess(trainingDataset)
testingDataset = preprocess(testingDataset)

In [16]:
trainingDataset[0]

{'labels': tensor(0),
 'input_ids': tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
          2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
          2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
          2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
          1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
          2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
          6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
          1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
          5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
         14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
          1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
          2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
         25430, 1

In [19]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [None]:
# # Finetune on last two layers

# for param in model.bert.embeddings.parameters():
#   param.requires_grad = False;

# # Unfreeze last 2 encoder layers
# for layer in model.bert.encoder.layer[-2:]:
#   for param in layer.parameters():
#     param.requires_grad = True;

In [30]:
from transformers import TrainingArguments
trainingArgs = TrainingArguments(
    output_dir = "./bert-finetuned-imdb",
    num_train_epochs= 1,
    per_device_train_batch_size = 8,
    logging_dir = "./logs",
    learning_rate= 2e-5,
    weight_decay= 0.01,
    report_to= "none"
)

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [25]:
from transformers import Trainer

In [33]:
trainer = Trainer(
    model = model,
    args = trainingArgs,
    train_dataset = trainingDataset,
    eval_dataset = testingDataset
)

trainer.train()

Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=125, training_loss=2.185106131946668e-07, metrics={'train_runtime': 69.092, 'train_samples_per_second': 14.473, 'train_steps_per_second': 1.809, 'total_flos': 131555527680000.0, 'train_loss': 2.185106131946668e-07, 'epoch': 1.0})

In [None]:
trainer.save_model("./bert-finetuned-imdb")
tokenizer.save_pretrained("./bert-finetuned-imdb")

In [35]:
metrics = trainer.evaluate()

In [36]:
metrics

{'eval_loss': 1.1920927533992653e-07,
 'eval_runtime': 7.551,
 'eval_samples_per_second': 66.216,
 'eval_steps_per_second': 8.343,
 'epoch': 1.0}

### Prediction

In [None]:
tokenizer = BertTokenizer.from_pretrained("/content/bert-finetuned-imdb")
model = BertForSequenceClassification.from_pretrained("/content/bert-finetuned-imdb")

In [40]:
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [42]:
text = "This was the worst movie I ever watched."
result = classifier(text)
result

[{'label': 'LABEL_0', 'score': 0.9999998807907104}]

In [44]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
tokenizer.push_to_hub("amxn18/bert-finetuned-imdb-v1")

In [None]:
trainer.push_to_hub("amxn18/bert-finetuned-imdb-v1")