In [1]:
#!pip install transformers codecarbon --user
!pip install transformers[onnx] codecarbon --user



In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report, f1_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from copy import deepcopy
import torch


from codecarbon import OfflineEmissionsTracker

import onnxruntime as ort

In [3]:
# Emission tracker for the fine-tuning run; its measurements are written to
# emissions_training_ml.csv under the project name below.
train_tracker = OfflineEmissionsTracker(
    project_name="deep_learning_classifier_training",
    output_file="emissions_training_ml.csv",
    cloud_provider="gcp",
    cloud_region="europe-west1",
)

[codecarbon INFO @ 23:49:41] offline tracker init
[codecarbon INFO @ 23:49:41] [setup] RAM Tracking...
[codecarbon INFO @ 23:49:41] [setup] GPU Tracking...
[codecarbon INFO @ 23:49:41] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 23:49:41] [setup] CPU Tracking...
[codecarbon INFO @ 23:49:43] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 23:49:43] >>> Tracker's metadata:
[codecarbon INFO @ 23:49:43]   Platform system: Linux-4.19.0-22-cloud-amd64-x86_64-with-debian-bullseye-sid
[codecarbon INFO @ 23:49:43]   Python version: 3.7.12
[codecarbon INFO @ 23:49:43]   Available RAM : 14.681 GB
[codecarbon INFO @ 23:49:43]   CPU count: 4
[codecarbon INFO @ 23:49:43]   CPU model: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 23:49:43]   GPU count: 1
[codecarbon INFO @ 23:49:43]   GPU model: 1 x Tesla T4


# Load dataset

In [4]:
# Restrict the 20 Newsgroups corpus to four topics and fetch the official
# train and test splits (in that order) with the same category filter.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, categories=categories)
    for split_name in ('train', 'test')
)

# Transformers model

In [5]:
# Bidirectional mappings between class index and newsgroup name, stored in the
# model config so predictions can be reported with readable labels.
id2label = dict(enumerate(newsgroups_train.target_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained tokenizer and a BERT encoder topped with a classification
# head sized to the number of newsgroup categories.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(newsgroups_train.target_names),
    problem_type="single_label_classification",
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class logits, optionally wrapped in a tuple whose first element
            is the logits. ``labels`` is either a one-hot matrix with one
            column per class or a 1-D array of integer class ids.

    Returns:
        dict: ``{"f1": <macro F1>}``.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    # Collapse per-class scores to the index of the highest-scoring class.
    predicted_classes = np.argmax(np.asarray(predictions), axis=-1)

    # One-hot targets are collapsed the same way; labels that are already class
    # ids (1-D) are used as-is instead of failing on a missing axis.
    labels = np.asarray(labels)
    true_classes = np.argmax(labels, axis=-1) if labels.ndim > 1 else labels

    return {"f1": f1_score(y_true=true_classes, y_pred=predicted_classes, average="macro")}

# Torch dataset that tokenizes newsgroup posts on the fly for classification
class NewsGroupDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing raw texts with their classification targets.

    Each item is tokenized lazily in ``__getitem__``, padded or truncated to
    exactly ``max_len`` tokens so that items can be stacked into a batch.

    Args:
        data: Indexable collection of raw text documents.
        target: Indexable collection of targets aligned with ``data``.
        tokenizer: Callable tokenizer following the Hugging Face ``__call__``
            interface (``padding``, ``truncation``, ``return_tensors`` ...).
        max_len: Fixed sequence length every example is padded/truncated to.
    """

    def __init__(self, data, target, tokenizer, max_len=512):
        self.data = data
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of documents in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the document at ``index`` and return model-ready tensors.

        Returns:
            dict: ``data`` (the raw text), ``input_ids`` and ``attention_mask``
            (1-D tensors) and ``labels`` (the target as a ``torch.long`` tensor).
        """
        data = self.data[index]
        target = self.target[index]

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`,
        # and truncation is requested explicitly so documents longer than
        # `max_len` are cut to size rather than relying on a fallback behaviour.
        encoding = self.tokenizer(
            data,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "data": data,
            # The tokenizer returns tensors with a leading batch dimension of 1;
            # flatten to 1-D so batching can stack the items.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(target, dtype=torch.long),
        }

# Create the training arguments for the trainer in the multi-class classification
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=100,
    # Checkpoint on the same cadence as evaluation. With the default
    # save_steps (500) this short run (a few hundred optimizer steps) would
    # finish before any checkpoint is written, leaving
    # `load_best_model_at_end` with nothing to reload.
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [7]:
# Configure the trainer
class MultilabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy over per-class logits.

    Targets are expected as one row per example with one column per class
    (e.g. one-hot rows); they are cast to float before being compared with
    the logits.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``BCEWithLogitsLoss`` between the model logits and the labels.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this extra argument; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Remove the labels so the model's own loss computation is bypassed.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
        )
        return (loss, outputs) if return_outputs else loss
    
# One-hot encode the integer class ids so each target row has one column per
# class, matching the per-class logits used in MultilabelTrainer.compute_loss.
train_dataset = NewsGroupDataset(
    data=newsgroups_train.data,
    target=pd.get_dummies(newsgroups_train.target).values,
    tokenizer=tokenizer,
)
eval_dataset = NewsGroupDataset(
    data=newsgroups_test.data,
    target=pd.get_dummies(newsgroups_test.target).values,
    tokenizer=tokenizer,
)

# Wire the model, hyper-parameters, datasets and metric into the trainer.
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Measure the whole fine-tuning run. The tracker is stopped in `finally` so a
# failed or interrupted training does not leave it running in the background.
train_tracker.start()
try:
    trainer.train()
finally:
    train_tracker.stop()
print(f"Total energy consumed for training: {train_tracker.final_emissions_data.energy_consumed}")

In [None]:
# Emission tracker for the inference benchmark; results go to emissions_predict_deep.csv.
inference_tracker = OfflineEmissionsTracker(
    cloud_provider="gcp",
    cloud_region="europe-west1",
    output_file="emissions_predict_deep.csv",
    project_name="deep_classifier_prediction",
)

N_INFERENCE_RUNS = 10  # number of repeated predictions to measure
predicted_sentence = [newsgroups_test.data[0]]

model.eval()  # disable dropout so the measured passes are real inference passes
inference_tracker.start()
try:
    with torch.no_grad():  # no gradients needed for prediction
        for _ in range(N_INFERENCE_RUNS):
            # The model consumes token tensors, not raw strings: tokenize the
            # text and move the tensors to the model's device before the
            # forward pass. Tokenization is kept inside the loop so the
            # measurement covers prediction from raw text end to end.
            encoded_sentence = tokenizer(
                predicted_sentence, padding=True, truncation=True, return_tensors="pt"
            ).to(model.device)
            model(**encoded_sentence)
finally:
    # Always stop the tracker, even if a forward pass fails.
    inference_tracker.stop()

In [None]:
# Single-example sanity check of the forward pass. The input ids are sent to
# whichever device the model lives on instead of a hard-coded "cuda", so the
# cell also works on CPU-only machines.
tokenized_data = tokenizer(
    newsgroups_test.data[:1], padding=True, truncation=True, return_tensors="pt"
)["input_ids"].to(model.device)

with torch.no_grad():  # prediction only; skip gradient bookkeeping
    sanity_output = model(input_ids=tokenized_data)
sanity_output

In [None]:
# Shell out to nvidia-smi to inspect the current GPU state (e.g. memory in use).
!nvidia-smi