Before we can use the rest of the notebook, we need to install the dependencies: this example uses transformers. To use TPUs on Colab, we need to install torch_xla and the last line installs accelerate from source.

# Import of Libraries

In [None]:
!pip --version

pip 19.3.1 from /usr/local/lib/python3.7/dist-packages/pip (python 3.7)


Make sure that the notebook is connected to a TPU runtime.

In [None]:
import os

# Fail early, with an actionable hint, when no TPU runtime is attached.
# `.get` is used so that a missing variable trips the assertion (and shows the
# message below) instead of raising a bare KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

We need to install 🤗 Accelerate in a virtual environment. 

In [None]:
! pip install virtualenv

Collecting virtualenv
[?25l  Downloading https://files.pythonhosted.org/packages/03/08/f819421002e85a71d58368f7bffbe0b1921325e0e8ca7857cb5fb0e1f7c1/virtualenv-20.4.7-py2.py3-none-any.whl (7.2MB)
[K     |████████████████████████████████| 7.2MB 5.3MB/s 
Collecting distlib<1,>=0.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/87/26/f6a23dd3e578132cf924e0dd5d4e055af0cd4ab43e2a9f10b7568bfb39d9/distlib-0.3.2-py2.py3-none-any.whl (338kB)
[K     |████████████████████████████████| 348kB 35.6MB/s 
Installing collected packages: distlib, virtualenv
Successfully installed distlib-0.3.2 virtualenv-20.4.7


Check virtual environment version.

In [None]:
!virtualenv --version

virtualenv 20.4.7 from /usr/local/lib/python3.7/dist-packages/virtualenv/__init__.py


In [None]:
!virtualenv my_project

created virtual environment CPython3.7.10.final.0-64 in 819ms
  creator CPython3Posix(dest=/content/my_project, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==21.1.2, setuptools==57.0.0, wheel==0.36.2
  activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator


In [None]:
# NOTE(review): every `!` command runs in its own temporary subshell, so this
# activation does not carry over to later cells or to the notebook kernel; the
# `pip install` cells below therefore appear to target the default environment.
# TODO confirm whether the virtualenv is still needed.
!source my_project/bin/activate

Install datasets and transformers, as well as torch_xla (for TPU support) and Accelerate.

In [None]:
! pip install datasets transformers
! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# ! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8.1-cp37-cp37m-linux_x86_64.whl
! pip install git+https://github.com/huggingface/accelerate
# ! pip install accelerate

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/86/27/9c91ddee87b06d2de12f134c5171a49890427e398389f07f6463485723c3/datasets-1.9.0-py3-none-any.whl (262kB)
[K     |████████████████████████████████| 266kB 5.8MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 34.2MB/s 
Collecting fsspec>=2021.05.0
[?25l  Downloading https://files.pythonhosted.org/packages/0e/3a/666e63625a19883ae8e1674099e631f9737bd5478c4790e5ad49c5ac5261/fsspec-2021.6.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 42.9MB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/7d/4f/0a862cad26aa2ed7a7cd87178cbbfa824fc1383e472d63596a0d018374e7/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243kB)
[K     |████████████████████████████

To check transformers is properly installed, we run the following command:

In [None]:
from transformers import pipeline
print(pipeline('sentiment-analysis')('we love you'))



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


[{'label': 'POSITIVE', 'score': 0.9998704791069031}]


Here are all the imports we will need for this notebook.

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

from accelerate import Accelerator, DistributedType
from datasets import load_dataset, load_metric
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

from tqdm.auto import tqdm



# Import of Data

Download data from Kaggle 

In [None]:
import os

# Download Kaggle Dataset via the Kaggle API

kaggle_api = {"username":"aygulzagidullina","key":"da122ee89f2edede46cc7dc5d55d55ba"}

import json
with open('/content/kaggle.json', 'w') as file:
    json.dump(kaggle_api, file)

!chmod 600 /content/kaggle.json
os.environ['KAGGLE_CONFIG_DIR'] = "/content"

!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification

!unzip \*.zip  && rm *.zip

Downloading sample_submission.csv.zip to /content
  0% 0.00/224k [00:00<?, ?B/s]
100% 224k/224k [00:00<00:00, 35.9MB/s]
Downloading test_private_expanded.csv.zip to /content
 32% 5.00M/15.8M [00:00<00:00, 49.8MB/s]
100% 15.8M/15.8M [00:00<00:00, 76.5MB/s]
Downloading toxicity_individual_annotations.csv.zip to /content
100% 64.7M/64.7M [00:00<00:00, 113MB/s]

Downloading identity_individual_annotations.csv.zip to /content
 65% 8.00M/12.3M [00:00<00:00, 75.9MB/s]
100% 12.3M/12.3M [00:00<00:00, 78.3MB/s]
Downloading test.csv.zip to /content
 41% 5.00M/12.1M [00:00<00:00, 43.1MB/s]
100% 12.1M/12.1M [00:00<00:00, 59.4MB/s]
Downloading test_public_expanded.csv.zip to /content
 57% 9.00M/15.9M [00:00<00:00, 24.2MB/s]
100% 15.9M/15.9M [00:00<00:00, 29.4MB/s]
Downloading train.csv.zip to /content
 96% 266M/276M [00:02<00:00, 94.9MB/s]
100% 276M/276M [00:02<00:00, 102MB/s] 
Downloading all_data.csv.zip to /content
 96% 313M/326M [00:03<00:00, 93.2MB/s]
100% 326M/326M [00:03<00:00, 99.8MB/s]
Arch

Load the training dataset. In the next step, 10% of it is held out as a validation set that is used during training.

In [None]:
from datasets import load_dataset

dataset_dict_train = load_dataset('csv', data_files='/content/train.csv')
dataset_train = dataset_dict_train['train']




Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-5ec55c99d77ea78c/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5ec55c99d77ea78c/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.


Split the data: 90% of the rows are kept for training (`train`) and 10% are held out for validation (`test`).

In [None]:
 # A fixed seed keeps the train/validation split identical across kernel restarts.
 dataset_train = dataset_train.train_test_split(test_size=0.1, seed=42)

Copy the 'target' column into a new column named 'labels'.

In [None]:
dataset_train = dataset_train.map(lambda batch: {'labels': batch['target']}, batched = True)

HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=181.0), HTML(value='')))




In [None]:
dataset_train = dataset_train.map(lambda example: {'labels': 1 if example['labels'] >= 0.5 else 0})

HBox(children=(FloatProgress(value=0.0, max=1624386.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=180488.0), HTML(value='')))




Change the float64 type of 'labels' column to int64

In [None]:
from datasets import ClassLabel, Value

new_features = dataset_train['train'].features.copy()

new_features["labels"] = Value('int64') 

dataset_train['train'] = dataset_train['train'].cast(new_features)

HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))




Change the int64 type to ClassLabel for 'labels' column

In [None]:
from datasets import ClassLabel, Value

new_features = dataset_train['train'].features.copy()

new_features["labels"] = ClassLabel(names=['non toxic', 'toxic'])  

dataset_train['train'] = dataset_train['train'].cast(new_features)

HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))




Function to print 2 examples of the data

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=2):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Columns whose feature type is `datasets.ClassLabel` are shown with their
    class names instead of the integer class ids.

    Args:
        dataset: object supporting `len()`, indexing with a list of row
            indices, and a `features` mapping of column name to feature type.
        num_examples: number of rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: if `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which replaces the manual
    # "draw again until the index is unseen" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Map integer class ids to their human-readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


In [None]:
show_random_elements(dataset_train["train"])

Unnamed: 0,article_id,asian,atheist,bisexual,black,buddhist,christian,comment_text,created_date,disagree,female,funny,heterosexual,hindu,homosexual_gay_or_lesbian,id,identity_annotator_count,identity_attack,insult,intellectual_or_learning_disability,jewish,labels,latino,likes,male,muslim,obscene,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,parent_id,physical_disability,psychiatric_or_mental_illness,publication_id,rating,sad,severe_toxicity,sexual_explicit,target,threat,toxicity_annotator_count,transgender,white,wow
0,384250,,,,,,,"I'm with everyone below...how can they not respond/appear ? I know Trump is up to his ass in alligators, but is he so busy he can't order this, or has kelly etal totally walled him off from whats happening...",2017-10-02 19:23:07.211790+00,0,,0,,,,6069473,0,0.0,0.180328,,,non toxic,,0,,,0.163934,,,,,,,,,105,approved,0,0.016393,0.131148,0.245902,0.016393,61,,,0
1,165398,,,,,,,"LOL! So people disapprove of my questions? Yeah, let's not ask questions, that might clarify things, can't have that...",2017-02-01 17:42:47.612270+00,2,,0,,,,933542,0,0.0,0.0,,,non toxic,,0,,,0.0,,,,,,932396.0,,,54,approved,0,0.0,0.0,0.0,0.0,4,,,0


Choose the tokenizer 

In [None]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




Function to clean text: remove stopwords, punctuation, lemmatize etc

In [None]:
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
import nltk
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
import unicodedata
import html

nltk.download('stopwords')
nltk.download('wordnet')

# set of stopwords to be removed from text
stop = set(stopwords.words('english'))

# update stopwords to have punctuation too
stop.update(list(string.punctuation))

def clean_text(text):
    """Normalize a raw comment into a lower-case, space-separated string of lemmas.

    Steps, in order: repair leftover HTML entities and escapes, drop non-ASCII
    characters, drop text in square brackets, drop URLs, drop '@' and '#',
    replace every run of digits with the token "num", drop all remaining
    non-alphabetic characters, remove stopwords, and lemmatize each word first
    with the lemmatizer's default part of speech and then as a verb.

    Args:
        text: raw comment string. `None` (e.g. an empty CSV field) is treated
            as an empty comment.

    Returns:
        The cleaned text; may be an empty string.
    """
    if text is None:
        return ""

    # Remove unwanted html characters
    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
    'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
    '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
    ' @-@ ', '-').replace('\\', ' \\ ')
    # collapse runs of two or more spaces
    text = re.sub(r'  +', ' ', html.unescape(x1))

    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove between square brackets (raw string: '\[' is not a valid str escape)
    text = re.sub(r'\[[^]]*\]', '', text)

    # remove URLs
    text = re.sub(r'http\S+', '', text)

    # remove twitter tags and hashtags
    text = text.replace("@", "").replace("#", "")

    # replace all numbers with "num"
    # This has to happen BEFORE the non-alphabetic strip below; running it
    # afterwards (as before) could never match because no digit was left.
    text = re.sub(r'\d+', ' num ', text)

    # remove all non-alphabetic characters
    text = re.sub(r'[^a-zA-Z ]', '', text)

    # remove stopwords from text (html.unescape may have reintroduced
    # upper-case letters, hence the explicit lower())
    words = [word for word in text.lower().split() if word not in stop]

    # lemmatize words: default part of speech first, then as verbs
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    words = [lemmatizer.lemmatize(word, pos='v') for word in words]

    return " ".join(words).lower()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Apply "clean" function to the 'comment_text' column of the dataset

In [None]:
dataset_train = dataset_train.map(lambda example: {'comment_text': clean_text(example['comment_text'])})

HBox(children=(FloatProgress(value=0.0, max=1624386.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=180488.0), HTML(value='')))




Create tokenized dataset

In [None]:
tokenized_dataset = dataset_train.map(lambda batch: tokenizer(batch["comment_text"], truncation=True, padding="max_length", max_length=128), batched=True, \
                  remove_columns=['id', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', \
                                  'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', \
                                  'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation',\
                                  'physical_disability', 'psychiatric_or_mental_illness', 'target', 'transgender', 'white', 'created_date', \
                                  'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit',\
                                  'identity_annotator_count', 'toxicity_annotator_count'])

HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=181.0), HTML(value='')))




In [None]:
tokenized_dataset["train"].features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': ClassLabel(num_classes=2, names=['non toxic', 'toxic'], names_file=None, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

Set the torch format for the dataset attributes

In [None]:
tokenized_dataset.set_format("torch")

Choose the transformer model

In [None]:
model_checkpoint = "bert-base-uncased"

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading the testing data: Davidson et al 2017 data

In [None]:
import pandas as pd

# Loading t-davidson raw dataset directly from the github

url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'
test_davidson = pd.read_csv(url)

from datasets import load_dataset
from datasets import Dataset

# Convert the pandas DataFrame into a `datasets.Dataset`.
test_davidson = Dataset.from_pandas(test_davidson)

# Drop every row whose `class` is 1.
# NOTE(review): presumably class 1 is "offensive language" in the Davidson
# labelling scheme - confirm against the dataset README.
test_davidson = test_davidson.filter(lambda example: example['class'] != 1)

# Binary target: class 0 -> 1, every remaining class -> 0.
test_davidson = test_davidson.map(lambda example: {'labels': 1 if example['class'] == 0 else 0})

from datasets import ClassLabel, Value

# Re-type `labels` as a ClassLabel so that 0/1 carry the names 'non toxic'/'toxic'.
new_features = test_davidson.features.copy()

new_features["labels"] = ClassLabel(names=['non toxic', 'toxic'])  

test_davidson = test_davidson.cast(new_features)

# Apply the same text cleaning that is applied to the training comments.
test_davidson = test_davidson.map(lambda example: {'tweet': clean_text(example['tweet'])})

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5593.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5593.0), HTML(value='')))




In [None]:
test1_tokenized_dataset = test_davidson.map(lambda batch: tokenizer(batch["tweet"], truncation=True, padding="max_length", max_length=128), batched=True, \
                  remove_columns=['count', 'hate_speech', 'neither', 'offensive_language', 'Unnamed: 0', 'class', 'tweet'])

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [None]:
test1_tokenized_dataset.set_format("torch")

Loading the testing data: HASOC 2019 (the `english_dataset.tsv` file is uploaded manually from the local machine)

In [None]:
from google.colab import files
uploaded = files.upload()

Saving english_dataset.tsv to english_dataset.tsv


In [None]:
import io

test_hasoc = pd.read_csv(io.BytesIO(uploaded['english_dataset.tsv']), sep='\t')

In [None]:
# Convert the pandas DataFrame into a `datasets.Dataset`.
test_hasoc = Dataset.from_pandas(test_hasoc)

# Drop the rows whose `task_2` annotation is 'PRFN' or 'OFFN'.
test_hasoc = test_hasoc.filter(lambda example: example['task_2'] != 'PRFN' and example['task_2'] != 'OFFN')

# Binary target: 'HATE' -> 1, every remaining `task_2` value -> 0.
test_hasoc = test_hasoc.map(lambda example: {'labels': 1 if example['task_2'] == 'HATE' else 0})

# Re-type `labels` as a ClassLabel so that 0/1 carry the names 'non toxic'/'toxic'.
new_features = test_hasoc.features.copy()

new_features["labels"] = ClassLabel(names=['non toxic', 'toxic'])  

test_hasoc = test_hasoc.cast(new_features)

# Apply the same text cleaning that is applied to the training comments.
test_hasoc = test_hasoc.map(lambda example: {'text': clean_text(example['text'])})

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4734.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4734.0), HTML(value='')))




In [None]:
test2_tokenized_dataset = test_hasoc.map(lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128), batched=True, \
                  remove_columns=['task_1', 'task_2', 'task_3', 'text_id', 'text'])

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [None]:
test2_tokenized_dataset.set_format("torch")

In [None]:
def create_dataloaders(train_batch_size=8, eval_batch_size=8, test1_batch_size=8, test2_batch_size=8):
    """Build the DataLoaders for training, validation and the two test sets.

    Only the training loader shuffles its data. The datasets are read from the
    notebook globals `tokenized_dataset`, `test1_tokenized_dataset` and
    `test2_tokenized_dataset`.

    Returns:
        Tuple `(train_dataloader, eval_dataloader, test1_dataloader, test2_dataloader)`.
    """
    # (dataset, batch size, shuffle) - listed in the order the loaders are returned
    loader_specs = (
        (tokenized_dataset["train"], train_batch_size, True),
        (tokenized_dataset["test"], eval_batch_size, False),
        (test1_tokenized_dataset, test1_batch_size, False),
        (test2_tokenized_dataset, test2_batch_size, False),
    )
    return tuple(
        DataLoader(data, shuffle=do_shuffle, batch_size=size)
        for data, size, do_shuffle in loader_specs
    )

In [None]:
train_dataloader, eval_dataloader, test1_dataloader, test2_dataloader = create_dataloaders()

In [None]:
for batch in train_dataloader:
    print({k: v.shape for k, v in batch.items()})
    outputs = model(**batch)
    break

{'attention_mask': torch.Size([8, 128]), 'input_ids': torch.Size([8, 128]), 'labels': torch.Size([8]), 'token_type_ids': torch.Size([8, 128])}


In [None]:
outputs

SequenceClassifierOutput([('loss', tensor(0.6305, grad_fn=<NllLossBackward>)),
                          ('logits', tensor([[-0.0451, -0.2723],
                                   [-0.0321, -0.3555],
                                   [ 0.0321, -0.2354],
                                   [-0.0153, -0.2503],
                                   [-0.0466, -0.1603],
                                   [-0.0643, -0.2403],
                                   [-0.0137, -0.2166],
                                   [ 0.0444, -0.3654]], grad_fn=<AddmmBackward>))])

In [None]:
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.

%%file NewMetric.py

import datasets
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

# TODO: Add BibTeX citation
_CITATION = """  """

# TODO: Add description of the metric here
_DESCRIPTION = """  """


# TODO: Add description of the arguments of the metric here
_KWARGS_DESCRIPTION = """  """

@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NewMetric(datasets.Metric):
    """TODO: Short description of my metric."""

    def _info(self):
        # TODO: Specifies the datasets.MetricInfo object
        return datasets.MetricInfo(
            # This is the description that will appear on the metrics page.
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('int64'),
                'references': datasets.Value('int64'),
            }),
      )

    def _compute(self, predictions, references):

        precision, recall, f1, _ = precision_recall_fscore_support(references, predictions, average='binary')
        acc = accuracy_score(references, predictions)

        try:
            auroc = roc_auc_score(references, predictions)
        except ValueError:
            pass
            auroc = 0.5

        return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auroc': auroc
        } 

Writing NewMetric.py


In [None]:
metric = load_metric('/content/NewMetric.py')

In [None]:
predictions = outputs.logits.detach().argmax(dim=-1)
metric.compute(predictions=predictions, references=batch["labels"])

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.875, 'auroc': 0.5, 'f1': 0.0, 'precision': 0.0, 'recall': 0.0}

In [None]:
# Training configuration read by `training_function`.
hyperparameters = {
    "learning_rate": 1e-5,
    "num_epochs": 3,
    "train_batch_size": 8, # Per-device batch size; the effective batch size is this value x 8 on 8 TPU cores
    "eval_batch_size": 8, # Per-device batch size; the effective batch size is this value x 8 on 8 TPU cores
    "test1_batch_size": 8, # Per-device batch size; the effective batch size is this value x 8 on 8 TPU cores
    "test2_batch_size": 8, # Per-device batch size; the effective batch size is this value x 8 on 8 TPU cores
    "seed": 42,
}

In [None]:
import transformers

def _evaluate(accelerator, model, dataloader, num_samples):
    """Run `model` over `dataloader` and score the predictions with the notebook-level `metric`.

    Args:
        accelerator: the `Accelerator` that prepared `model` and `dataloader`.
        model: prepared sequence-classification model.
        dataloader: prepared evaluation dataloader whose batches contain "labels".
        num_samples: number of examples in the underlying dataset; gathered
            results are truncated to this length.

    Returns:
        The dict returned by `metric.compute`.
    """
    model.eval()
    gathered_predictions = []
    gathered_labels = []

    for batch in dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)

        # We gather predictions and labels from all processes to have them all.
        gathered_predictions.append(accelerator.gather(predictions))
        gathered_labels.append(accelerator.gather(batch["labels"]))

    # Concatenate, then truncate: the prepared dataloader may contain a few
    # extra elements so that every process gets batches of the same size.
    gathered_predictions = torch.cat(gathered_predictions)[:num_samples]
    gathered_labels = torch.cat(gathered_labels)[:num_samples]

    return metric.compute(predictions=gathered_predictions, references=gathered_labels)


def training_function():
    """Fine-tune the classifier with Accelerate and report validation and test metrics.

    Reads the notebook globals `hyperparameters`, `model_checkpoint`,
    `tokenized_dataset`, `test1_tokenized_dataset`, `test2_tokenized_dataset`
    and `metric`. After every epoch the validation metrics are printed; after
    training the metrics on the Davidson et al. 2017 and HASOC test sets are
    printed. Nothing is returned.
    """
    # Initialize accelerator
    accelerator = Accelerator()

    # To have only one message (and not one per process) in the logs of Transformers or Datasets,
    # we set the logging verbosity to INFO for the main process only.
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    train_dataloader, eval_dataloader, test1_dataloader, test2_dataloader = create_dataloaders(
        train_batch_size=hyperparameters["train_batch_size"], eval_batch_size=hyperparameters["eval_batch_size"],
        test1_batch_size=hyperparameters["test1_batch_size"], test2_batch_size=hyperparameters["test2_batch_size"]
    )
    # The seed need to be set before we instantiate the model, as it will determine the random head.
    set_seed(hyperparameters["seed"])

    # Instantiate the model, let Accelerate handle the device placement.
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=hyperparameters["learning_rate"])

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, test1_dataloader, test2_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, test1_dataloader, test2_dataloader
    )

    num_epochs = hyperparameters["num_epochs"]
    # Instantiate learning rate scheduler after preparing the training dataloader as the prepare method
    # may change its length.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        # Optional override; falls back to the previous fixed value of 500.
        num_warmup_steps=hyperparameters.get("num_warmup_steps", 500),
        num_training_steps=len(train_dataloader) * num_epochs,
    )

    # Instantiate a progress bar to keep track of training. Note that we only enable it on the main
    # process to avoid having one progress bar per process.
    progress_bar = tqdm(range(num_epochs * len(train_dataloader)), disable=not accelerator.is_main_process)

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Validation after every epoch.
        eval_metric = _evaluate(accelerator, model, eval_dataloader, len(tokenized_dataset["test"]))

        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)

    # Now we test the model on t-davidson
    eval_metric_1 = _evaluate(accelerator, model, test1_dataloader, len(test1_tokenized_dataset))
    accelerator.print("Davidson et al 2017 data:", eval_metric_1)

    # Now we test the model on hasoc
    eval_metric_2 = _evaluate(accelerator, model, test2_dataloader, len(test2_tokenized_dataset))
    accelerator.print("HASOC data:", eval_metric_2)

In [None]:
%%time

from accelerate import notebook_launcher
 
notebook_launcher(training_function)

Launching a training on 8 TPU cores.


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/tra

HBox(children=(FloatProgress(value=0.0, max=76146.0), HTML(value='')))

epoch 0: {'accuracy': 0.9501185674393865, 'f1': 0.6678717674401448, 'precision': 0.7125314861460957, 'recall': 0.6284801777407485, 'auroc': 0.803245718517844}
epoch 1: {'accuracy': 0.9511324852621781, 'f1': 0.6662125340599455, 'precision': 0.7322186174195159, 'recall': 0.6111226827744219, 'auroc': 0.7958705204220424}
