In [1]:
!pip install -q --upgrade --use-feature=2020-resolver datasets sklearn transformers==4.5.0

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [2]:
import datasets

# Directory that contains one saved dataset per split.
STORAGE_PATH = '/storage/imdb'

# Every split is stored in a sub-directory named after the split itself.
train, validation, test = (
    datasets.load_from_disk(f'{STORAGE_PATH}/{split}')
    for split in ('train', 'validation', 'test')
)

In [3]:
from datasets.arrow_dataset import Dataset


def report_target_distribution(dataset: Dataset):
    """Print the share and absolute count of every value in the ``label`` column.

    Labels are listed in the order produced by ``value_counts`` (most frequent
    first), followed by the total number of examples.

    Args:
        dataset: Dataset exposing ``to_pandas()``, ``len()`` and a ``label`` column.
    """
    total = len(dataset)
    label_counts = dataset.to_pandas()['label'].value_counts()

    report = ['--- Target distribution ---\n']
    report.extend(
        f'Label {label} - {count / total * 100:.2f}% ({count}) of examples'
        for label, count in label_counts.items()
    )
    report.append(f'\nTotal size: {total}')
    print('\n'.join(report))
    
# Print the label balance of the validation split.
report_target_distribution(validation)

--- Target distribution ---

Label 1 - 50.41% (4726) of examples
Label 0 - 49.59% (4649) of examples

Total size: 9375


In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and classifier are loaded from the same pretrained checkpoint.
CHECKPOINT = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
from transformers.tokenization_utils import PaddingStrategy


def tokenize(review: str, label: int, max_length: int = 512) -> dict:
    """Encode review text with the module-level ``tokenizer``.

    Sequences are truncated and padded to ``max_length``; token type ids are
    not returned.

    Args:
        review: Text to encode. When called through ``Dataset.map`` with
            ``batched=True`` (as ``preprocess`` does) this is a batch of
            texts rather than a single string.
        label: Unused. It is accepted because ``preprocess`` passes every
            dataset column as a positional input.
        max_length: Length every encoded sequence is truncated/padded to.

    Returns:
        The encoding produced by the tokenizer.
    """
    encoding_options = dict(
        max_length=max_length,
        truncation=True,
        padding=PaddingStrategy.MAX_LENGTH,
        return_token_type_ids=False,
    )
    return tokenizer(text=review, **encoding_options)


def remove_text_column(dataset: Dataset) -> Dataset:
    """Return the result of removing the ``text`` column from ``dataset``."""
    unwanted_columns = ['text']
    return dataset.remove_columns(unwanted_columns)


def preprocess(dataset: Dataset, tokenize: AutoTokenizer) -> Dataset:
    """Tokenize a dataset and expose the encoded columns as torch tensors.

    Args:
        dataset: Dataset whose columns are all passed, in order, as positional
            inputs to ``tokenize``.
        tokenize: Callable applied in batched mode; it returns the new columns
            to add to the dataset.

    Returns:
        The tokenized dataset without the raw ``text`` column. Columns added by
        ``tokenize`` are formatted as torch tensors; the remaining columns are
        still returned because ``output_all_columns=True``.
    """
    original_columns = list(dataset.features.keys())
    dataset = dataset.map(tokenize, input_columns=original_columns, batched=True)

    # Columns created by the tokenizer, kept in a deterministic order.
    encoded_columns = [
        name for name in dataset.features.keys() if name not in original_columns
    ]

    # The original assigned this result to a misspelled name (`datset`), so the
    # raw text column was never actually dropped from the returned dataset.
    dataset = remove_text_column(dataset)
    dataset.set_format('torch', columns=encoded_columns, output_all_columns=True)

    return dataset

In [6]:
# Keep only the first rows of every split (20 / 10 / 20) — presumably a quick
# smoke-test configuration; confirm before a full training run.
train, validation, test = (
    split.select(range(size))
    for split, size in ((train, 20), (validation, 10), (test, 20))
)

In [7]:
# Run the same tokenization pipeline over all three splits.
train, validation, test = (
    preprocess(split, tokenize=tokenize)
    for split in (train, validation, test)
)

Loading cached processed dataset at /storage/imdb/train/cache-db6aaa155697bacc.arrow
Loading cached processed dataset at /storage/imdb/validation/cache-18f6abb5461754cd.arrow
Loading cached processed dataset at /storage/imdb/test/cache-2b1244c5930c2bba.arrow


In [8]:
import numpy as np
from datasets import load_metric

# Accuracy metric object; `compute_metrics` below calls its `.compute` method.
accuracy = load_metric('accuracy')

def compute_metrics(outputs):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        outputs: A pair ``(logits, labels)``; the predicted class for each
            example is the index of the largest logit along the last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes and
        the reference labels.
    """
    logits, labels = outputs
    predicted_classes = np.argmax(logits, axis=-1)
    return accuracy.compute(references=labels, predictions=predicted_classes)

In [14]:
from transformers import Trainer, TrainingArguments
from datetime import datetime

# Day-month-year stamp (two digits each) used to tag the output directory of this run.
DATE = f'{datetime.now():%d-%m-%y}'
# Parent directory for the per-run model output directories.
MODEL_PATH = '/storage/models'


# Output directory for this run, tagged with the run date.
run_directory = f'{MODEL_PATH}/model-{DATE}'

arguments = TrainingArguments(
    output_dir=run_directory,
    learning_rate=2e-5,
    num_train_epochs=3,
    # Evaluate and log at the end of every epoch.
    logging_strategy='epoch',
    evaluation_strategy='epoch',
)

# Train on the training split; evaluation uses the validation split and
# reports the result of `compute_metrics`.
trainer = Trainer(
    args=arguments,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train,
    eval_dataset=validation,
)

In [16]:
# Fine-tune the model, then run prediction on the test split.
# NOTE(review): the AttributeError recorded under this cell ('Dataset' object has
# no attribute 'predict') does not match the code as written, and the execution
# counts jump (8 -> 14 -> 16). Presumably the output is stale from an out-of-order
# run in which `trainer` was bound to a Dataset — re-run from a fresh kernel to confirm.
train_outputs = trainer.train()
test_outputs = trainer.predict(test)

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.3172,0.870007,0.5,0.2647,37.779
2,0.3243,0.870007,0.5,0.263,38.028
3,0.3338,0.870007,0.5,0.2667,37.501


AttributeError: 'Dataset' object has no attribute 'predict'

In [None]:
from sklearn.metrics import classification_report

# NOTE(review): `test_outputs` is whatever `trainer.predict` returned; presumably
# the raw logits live in its `.predictions` field rather than being the object
# itself — confirm before feeding this to `classification_report`.
logits = test_outputs