In [3]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification

In [18]:
# Load the IMDB DatasetDict previously saved to disk (train / dev / test splits,
# each with 'text' and 'label' columns, as shown in the next cell's output).
imdb = load_from_disk("../data/imdb")

In [19]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 24000
    })
})

In [20]:
# Pretrained checkpoint name; the same value is used later to build the model,
# so the tokenizer and the model weights come from the same checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [21]:
# --- Tokenization ---
max_sequence_length = 128        # reviews are truncated / padded to this many tokens

# --- Optimisation ---
batch_size = 32                  # used for both train and eval per-device batch size
learning_rate = 2e-5
num_train_epochs = 5

# --- Evaluation / early stopping ---
eval_steps = 100                 # evaluate every `eval_steps` optimisation steps
early_stopping_patience = 10     # passed to EarlyStoppingCallback

# --- Filesystem locations (relative to the notebook) ---
output_dir = "../output/"        # Trainer checkpoints
model_dir = "../models/"         # final exported model

In [22]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of a (batched) example.

    Every sequence is truncated and padded to exactly ``max_sequence_length``
    tokens using the notebook-level ``tokenizer``.

    Args:
        example: mapping with a ``"text"`` entry (a batch of texts when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = example["text"]
    encoded = tokenizer(
        texts,
        max_length=max_sequence_length,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Batch collator built around the same tokenizer. tokenize_function already pads
# every example to max_sequence_length, so inputs arrive here at a fixed length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split of the DatasetDict in batches.
tokenized_datasets = imdb.map(tokenize_function, batched=True)

Loading cached processed dataset at ../data/imdb/train/cache-b131088925272670.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at ../data/imdb/test/cache-62e2edbeddae2500.arrow


In [74]:
# Drop the raw text and rename `label` -> `labels`.
# NOTE: this cell is not idempotent; re-running it fails because "text" is already gone.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [15]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
# )
# dev_dataloader = DataLoader(
#     tokenized_datasets["dev"], batch_size=16, collate_fn=data_collator
# )

In [46]:
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

In [75]:
# Binary sequence classifier built from the pretrained checkpoint.
# Per the load warning below, the checkpoint's `cls.*` pretraining weights are
# unused and some classifier weights are not initialized from the checkpoint,
# so the model has to be fine-tuned before it is useful.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [76]:
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

In [78]:
# def compute_metrics(eval_preds):
#     metric = load_metric("imdb)
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: a pair ``(scores, labels)``; ``scores`` holds per-class values with
            the class dimension on axis 1, ``labels`` the reference class ids.

    Returns:
        dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``,
        each computed by the matching scikit-learn scorer with its defaults.
    """
    scores, references = p
    # Predicted class = index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=references, y_pred=predicted)
        for name, scorer in scorers.items()
    }

In [87]:

# training_args = TrainingArguments('bert-base-uncased-imdb', 
#                                   learning_rate=2e-05,
#                                   per_device_train_batch_size =batch_size, 
#                                   per_device_eval_batch_size =batch_size,

#                           )

# Training configuration; all tunables come from the hyperparameter cell above.
training_args = TrainingArguments(
    output_dir=f"{output_dir}bert-base-uncased-imdb",
    # Optimisation
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate every `eval_steps` steps
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    # Cap the number of checkpoints kept on disk; older ones are deleted
    save_total_limit=1,
    # Pick the best checkpoint by the "f1" value reported by compute_metrics
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [2]:
from transformers import Trainer, EarlyStoppingCallback

In [89]:

# Fine-tuning driver: trains on `train`, evaluates on `dev`, and stops early
# via EarlyStoppingCallback configured with `early_stopping_patience`.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
    ],
)


In [90]:
# Run fine-tuning with the Trainer configured above.
# NOTE(review): the recorded TrainOutput below reports epoch 1.0 / 125 steps while
# num_train_epochs is 5 — the saved output may come from an earlier configuration; confirm.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Runtime,Samples Per Second
100,No log,0.650065,0.646,0.678571,0.538462,0.600451,13.519,73.97


TrainOutput(global_step=125, training_loss=0.6922564086914063, metrics={'train_runtime': 117.4235, 'train_samples_per_second': 1.065, 'total_flos': 3284513340000, 'epoch': 1.0})

In [92]:
# Export the fine-tuned model to `model_dir` so it can be reloaded with from_pretrained.
model.save_pretrained(model_dir+"bert-base-uncased-imdb")

In [94]:
# Run inference on the held-out test split and show the output shapes.
# NOTE(review): the recorded output below shows 1000 rows, which matches the size of
# the `dev` split rather than `test` (24000 rows) — the saved output looks stale; re-run.
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(1000, 2) (1000,)


In [101]:
# Predicted class id per example = index of the largest value along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

In [103]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the predictions against the reference labels.
print(classification_report(predictions.label_ids, preds))

              precision    recall  f1-score   support

           0       0.62      0.75      0.68       506
           1       0.68      0.54      0.60       494

    accuracy                           0.65      1000
   macro avg       0.65      0.64      0.64      1000
weighted avg       0.65      0.65      0.64      1000



In [6]:
# Reload the exported model from disk and display its config as a sanity check.
model_finetuned = BertForSequenceClassification.from_pretrained(model_dir+"bert-base-uncased-imdb")
model_finetuned.config

BertConfig {
  "_name_or_path": "../models/bert-base-uncased-imdb",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.3.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [7]:
from huggingface_hub import create_repo
# Create the model repository on the Hugging Face Hub; the call returns the repo URL.
create_repo("bert-base-uncased-imdb")

'https://huggingface.co/artemis13fowl/bert-base-uncased-imdb'

In [8]:
from huggingface_hub import upload_file

# Upload the exported model's config.json to the root of the Hub repository.
upload_file(
    path_or_fileobj=f"{model_dir}bert-base-uncased-imdb/config.json",
    path_in_repo="config.json",
    repo_id="artemis13fowl/bert-base-uncased-imdb",
)

'https://huggingface.co/artemis13fowl/bert-base-uncased-imdb/blob/main/config.json'

In [11]:
from huggingface_hub import Repository

# Clone the Hub repository into the local `huggingface_repo` directory.
repo = Repository("huggingface_repo", clone_from="artemis13fowl/bert-base-uncased-imdb")

Cloning https://huggingface.co/artemis13fowl/bert-base-uncased-imdb into local empty directory.


In [12]:
# Bring the local clone up to date with the remote before adding files.
repo.git_pull()

In [15]:
# Write everything needed to use the checkpoint from the Hub into the local clone:
# the model weights/config and the tokenizer files. Without the tokenizer files the
# published repo cannot be loaded with AutoTokenizer.from_pretrained(<repo id>).
# `tokenizer` is the one created in the tokenizer cell near the top of the notebook,
# so run the notebook top to bottom (fresh kernel) before executing this cell.
model_finetuned.save_pretrained("huggingface_repo")
tokenizer.save_pretrained("huggingface_repo")


NameError: name 'tokenizer' is not defined

In [16]:
# Stage, commit and push the files saved into the local clone to the Hub.
# Order matters: add -> commit -> push.
repo.git_add()
repo.git_commit("Add bert-base-uncased-imdb")
repo.git_push()

Upload file pytorch_model.bin:   0%|          | 32.0k/418M [00:00<?, ?B/s]

To https://huggingface.co/artemis13fowl/bert-base-uncased-imdb
   bbacc7e..afc6cbd  main -> main



'https://huggingface.co/artemis13fowl/bert-base-uncased-imdb/commit/afc6cbd6de6920ed2987ef3542a31d10dfb3e161'

In [17]:
# Round-trip check: load the model back from the Hub repository (rebinding
# `model_finetuned`) and display its config.
model_finetuned = BertForSequenceClassification.from_pretrained("artemis13fowl/bert-base-uncased-imdb")
model_finetuned.config

Downloading:   0%|          | 0.00/634 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "artemis13fowl/bert-base-uncased-imdb",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.3.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [24]:
# Evaluation-only Trainer around the Hub-loaded model; no TrainingArguments are
# passed, so the Trainer uses its own defaults.
trainer_eval = Trainer(model=model_finetuned)

# Predict on the full test split.
predictions = trainer_eval.predict(tokenized_datasets["test"])

In [26]:
# (These two imports duplicate ones made in earlier cells; harmless on a linear run.)
import numpy as np
from sklearn.metrics import classification_report
# Class id per example = argmax over the last axis, then per-class metrics on the test split.
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(predictions.label_ids, preds))

              precision    recall  f1-score   support

           0       0.54      0.82      0.65     11994
           1       0.62      0.29      0.39     12006

    accuracy                           0.56     24000
   macro avg       0.58      0.56      0.52     24000
weighted avg       0.58      0.56      0.52     24000

