In [1]:
from transformers import pipeline

In [2]:
classifier = pipeline(task='sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [8]:
classifier("i liked it or not. I am not sure. No way! Maybe I loved it! Yes i like it")

[{'label': 'POSITIVE', 'score': 0.9991247057914734}]

In [9]:
from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt", # for pytorch
)
pt_batch

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102],
        [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [14]:
from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto")

In [16]:
{**pt_batch}

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
          58263, 13299,   119,   102],
         [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
            102,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [21]:
pt_outputs = pt_model(**pt_batch)
pt_outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.6222, -2.7745, -0.8967,  2.0137,  3.3064],
        [ 0.0064, -0.1258, -0.0503, -0.1655,  0.1329]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
from torch import nn

pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


# Trainer Class

In [23]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output/",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

In [25]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [26]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")  # doctest: +IGNORE_RESULT

Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 792956.69 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 473987.92 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 482791.07 examples/s]


In [34]:
def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])

In [35]:
dataset = dataset.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 8530/8530 [00:00<00:00, 58881.52 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 63211.34 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 60428.00 examples/s]


In [36]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)  # doctest: +SKIP

In [39]:
trainer.train()

Step,Training Loss
500,0.4421
1000,0.3904
1500,0.2643
2000,0.2639


TrainOutput(global_step=2134, training_loss=0.33386932809216574, metrics={'train_runtime': 211.7711, 'train_samples_per_second': 80.559, 'train_steps_per_second': 10.077, 'total_flos': 195974132394480.0, 'train_loss': 0.33386932809216574, 'epoch': 2.0})

In [40]:
trained_model = trainer.model
trained_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [42]:
import torch 

text = "Hugging Face makes NLP easy!"
inputs = tokenizer(text, return_tensors="pt")
device = torch.device("mps" if torch.cuda.is_available() else "cpu")
trainer.model.to(device)

inputs = {k: v.to(device) for k, v in inputs.items()}

# Run inference
with torch.no_grad():
    outputs = trainer.model(**inputs)

# Get predicted logits
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()

print(f"Predicted class: {predicted_class}")

Predicted class: 1


In [43]:
logits

tensor([[-2.0775,  2.2594]])

In [49]:
from transformers import pipeline, AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# Create a pipeline using the trained model
nlp_pipeline = pipeline("sentiment-analysis", model=trainer.model, tokenizer=tokenizer)

# Perform inference
text = "I dont wanna do it"
result = nlp_pipeline(text)

print(result)  # Output: [{'label': 'LABEL_1', 'score': 0.98}]


Device set to use mps:0


[{'label': 'LABEL_0', 'score': 0.9871347546577454}]
