In [68]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
import torch


In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

# Machine Learning Model

In [24]:

# Change target to one hot encoding
target = pd.get_dummies(dataset["target"]).values

train_data, test_data, train_target, test_target = train_test_split(dataset["data"], target, test_size=0.2, random_state=42, shuffle=True)

In [52]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced"))),
])

pipeline.fit(train_data, train_target)

train_predictions = pipeline.predict(train_data)
test_predictions = pipeline.predict(test_data)

print("Train classification report:")
print(classification_report(train_target, train_predictions, target_names=dataset["target_names"]))
print("Test classification report:")
print(classification_report(test_target, test_predictions, target_names=dataset["target_names"]))

Train classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.98      0.98      0.98       389
           comp.graphics       0.97      0.97      0.97       453
 comp.os.ms-windows.misc       0.94      0.96      0.95       459
comp.sys.ibm.pc.hardware       0.95      0.98      0.97       463
   comp.sys.mac.hardware       0.97      0.97      0.97       458
          comp.windows.x       0.98      0.99      0.99       474
            misc.forsale       0.96      0.98      0.97       481
               rec.autos       0.96      0.95      0.95       475
         rec.motorcycles       0.99      0.97      0.98       472
      rec.sport.baseball       0.99      0.97      0.98       486
        rec.sport.hockey       0.99      0.97      0.98       487
               sci.crypt       1.00      0.98      0.99       476
         sci.electronics       0.97      0.97      0.97       462
                 sci.med       1.00      0.97 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Transformers model

In [81]:
# Fine tune a transformers model for text classification
# Path: test_classification.ipynb
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import numpy as np


id2label = {id: label for id, label in enumerate(dataset["target_names"])}
label2id = {label: id for id, label in enumerate(dataset["target_names"])}
# Load the transformers model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", id2label=id2label, label2id=label2id,
    problem_type="multi_class_classification", num_labels=len(dataset["target_names"]))


loading configuration file config.json from cache at /Users/benjamin.breton@loreal.com/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/benjamin.breton@loreal.com/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b

RuntimeError: PyTorch is not linked with support for mps devices

In [64]:
model.forward()

  0%|          | 0/3396 [1:39:16<?, ?it/s]
  0%|          | 0/3396 [30:26<?, ?it/s]
  0%|          | 0/3396 [08:34<?, ?it/s]


ValueError: You have to specify either input_ids or inputs_embeds

In [74]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return f1_score(y_true=labels, y_pred=predictions, average="macro")

# Create the train dataset class for classification
class NewsGroupDataset(torch.utils.data.Dataset):
    def __init__(self, data, target, tokenizer, max_len=512):
        self.data = data
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data = self.data[index]
        target = self.target[index]

        encoding = self.tokenizer.encode_plus(
            data,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "data": data,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(target, dtype=torch.long),
        }

# Create the training arguments for the trainer in the multi-class classification
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=NewsGroupDataset(data=train_data, target=train_target, tokenizer=tokenizer),
    eval_dataset=NewsGroupDataset(data=test_data, target=test_target, tokenizer=tokenizer),
    compute_metrics=compute_metrics,

)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [75]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), 
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [76]:
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=NewsGroupDataset(data=train_data, target=train_target, tokenizer=tokenizer),
    eval_dataset=NewsGroupDataset(data=test_data, target=test_target, tokenizer=tokenizer),
    compute_metrics=compute_metrics,

)

In [79]:
trainer.train()

***** Running training *****
  Num examples = 9051
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1698
  Number of trainable parameters = 109497620
  0%|          | 7/1698 [02:29<10:02:56, 21.39s/it]
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: data. If data are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


KeyboardInterrupt: 