In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, DataCollatorWithPadding

In [2]:
# Load the IMDB dataset that was previously saved to disk; the path is relative
# to the notebook's working directory.
imdb = load_from_disk("../data/imdb")

In [6]:
# The tokenizer is built from the base checkpoint name, while the classifier is
# loaded from a separate fine-tuned repo id later in the notebook.
# NOTE(review): assumes the fine-tuned model shares this checkpoint's vocabulary — confirm.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# --- Tokenisation / batching ---
max_sequence_length = 128  # passed to the tokenizer as max_length
batch_size = 32            # used for both the train and eval per-device batch size

# --- Trainer schedule ---
eval_steps = 100
learning_rate = 2e-5
num_train_epochs = 5
early_stopping_patience = 10  # NOTE(review): not referenced by any cell visible here

# --- Filesystem locations (relative to the notebook) ---
output_dir = "../output/"
model_dir = "../models/"

In [7]:
def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to ``max_sequence_length`` tokens.
    """
    # NOTE(review): padding to a fixed length here appears to leave nothing for
    # DataCollatorWithPadding to pad dynamically — confirm this is intended.
    encoding_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_sequence_length,
    )
    return tokenizer(example["text"], **encoding_options)


# Collator that batches examples with the tokenizer's padding logic.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Run the tokenizer over the dataset, handing examples to it in batches.
tokenized_datasets = imdb.map(function=tokenize_function, batched=True)

Loading cached processed dataset at ../data/imdb/train/cache-7612cc8d777ef0a6.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at ../data/imdb/test/cache-4f03edd16e5500a5.arrow


In [8]:
# Drop the raw text and expose the target column under the name "labels",
# then switch the output format to torch.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
# Display the remaining columns of the train split as a sanity check.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [9]:
# Display the splits with their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    dev: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24000
    })
})

In [10]:
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
# Load the fine-tuned sequence classifier identified by this repo id (the same id
# the upload cells later in the notebook push to).
model_finetuned = BertForSequenceClassification.from_pretrained("artemis13fowl/bert-base-uncased-imdb")

In [11]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import numpy as np

In [12]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: A pair ``(scores, labels)``; the predicted class is the argmax of
            ``scores`` along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)
    # Insertion order of this table fixes the key order of the returned dict.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

In [13]:
# Arguments for the Trainer. Evaluation runs every `eval_steps` steps, and
# `save_steps` is pinned to the same value so a checkpoint is written at each
# evaluation. Without it, saving follows the library default schedule and only
# some evaluated states can be picked as the best model.
training_args = TrainingArguments(
    output_dir + "bert-base-uncased-imdb",
    evaluation_strategy="steps",
    eval_steps=eval_steps,  # evaluate every eval_steps steps
    save_steps=eval_steps,  # checkpoint on the same schedule as evaluation
    save_total_limit=1,  # cap on the number of checkpoints kept on disk
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    metric_for_best_model="f1",  # matches the "f1" key returned by compute_metrics
    load_best_model_at_end=True,
)

In [14]:
# Trainer wrapping the fine-tuned model. No train or eval dataset is attached;
# in the cells visible here it is only used to call predict().
trainer_eval = Trainer(
    model=model_finetuned,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [15]:
# Inspect the test split before predicting on it.
tokenized_datasets["test"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 24000
})

In [16]:
# Evaluate on a random sample of 1000 test examples. The shuffle is seeded so the
# same sample, and therefore the same report, is produced on every run.
eval_sample = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
predictions = trainer_eval.predict(eval_sample)
print(predictions.predictions.shape, predictions.label_ids.shape)
# Predicted class = index of the highest score along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(predictions.label_ids, preds))

(1000, 2) (1000,)
              precision    recall  f1-score   support

           0       0.91      0.86      0.89       511
           1       0.86      0.92      0.89       489

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000



In [None]:
from huggingface_hub import create_repo

# exist_ok=True keeps this cell re-runnable: without it the call fails once the
# repository has already been created.
create_repo("bert-base-uncased-imdb", exist_ok=True)

In [None]:
from huggingface_hub import upload_file

# Location of the config file under model_dir, built from the same pieces as before.
config_path = "".join((model_dir, "bert-base-uncased-imdb", "/config.json"))

# Upload that file to the repository as "config.json".
upload_file(
    path_or_fileobj=config_path,
    path_in_repo="config.json",
    repo_id="artemis13fowl/bert-base-uncased-imdb",
)

In [None]:
from huggingface_hub import Repository

# Clone the model repository into a local working directory so files can be
# committed to it with git.
repo = Repository(
    local_dir="huggingface_repo1",
    clone_from="artemis13fowl/bert-base-uncased-imdb",
)

In [None]:
# Pull from the remote into the local clone before adding new files.
repo.git_pull()

In [None]:
# Write the model files into "huggingface_repo1", the directory the repository was cloned into.
model_finetuned.save_pretrained("huggingface_repo1")

In [None]:
# Stage the changes in the clone, commit them, and push the commit to the remote.
repo.git_add()
repo.git_commit("Add bert-base-uncased-imdb")
repo.git_push()