In [1]:
import transformers
import torch
import datasets
import pandas as pd
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
MODEL_NAME = "bert-base-uncased"
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
model = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(id2label), id2label=id2label, label2id=label2id)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# ner = transformers.pipeline("ner", model=model, tokenizer=tokenizer)
# print(ner("Hugging Face is creating a tool that democratizes AI."))


In [16]:
df = pd.read_csv(f"data/yelp_labelled.txt", sep="\t", header=None, names=["text", "label"])
print(df.shape)
df.head(2)


(1000, 2)


Unnamed: 0,text,label
0,Wow... Loved this place.,1
1,Crust is not good.,0


In [17]:
train_x, test_x, train_y, test_y = train_test_split(
    df["text"].values, df["label"].values, test_size=0.2, random_state=42
)
train_dataset = datasets.Dataset.from_dict({"text": train_x, "label": train_y})
test_dataset = datasets.Dataset.from_dict({"text": test_x, "label": test_y})
train_dataset = train_dataset.map(
    lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=16),
    batched=True,
)
test_dataset = test_dataset.map(
    lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=16),
    batched=True,
)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map: 100%|██████████| 800/800 [00:00<00:00, 5278.43 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 4627.58 examples/s]


In [18]:
print(train_dataset[0])

{'text': 'The worst was the salmon sashimi.', 'label': 0, 'input_ids': [101, 1996, 5409, 2001, 1996, 11840, 24511, 27605, 1012, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}


In [19]:
print(tokenizer.decode(train_dataset[0]["input_ids"], skip_special_tokens=True))
print([tokenizer.decode(id) for id in train_dataset[0]["input_ids"]])

the worst was the salmon sashimi.
['[CLS]', 'the', 'worst', 'was', 'the', 'salmon', 'sash', '##imi', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=16)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=5,
    max_steps=10,
)
# play with these parameters to see how they affect the training
# e.g., change max_steps to 1000, increase batch_size, etc.
# learn about these parameters here: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=lambda p: {"accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()},
)
trainer.train()
trainer.save_model("./model")




Step,Training Loss,Validation Loss,Accuracy
5,No log,0.682736,0.535
10,No log,0.664371,0.65


In [28]:
pipe = transformers.pipeline("sentiment-analysis", model="./model", tokenizer=tokenizer, )
print(pipe("I love using Hugging Face transformers!"))
print(pipe("I hate using Hugging Face transformers!"))

Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.564267635345459}]
[{'label': 'NEGATIVE', 'score': 0.6015160083770752}]


In [33]:
pipe = transformers.pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, )
print(pipe("I love using Hugging Face transformers!"))
print(pipe("I hate using Hugging Face transformers!"))

Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.5653225183486938}]
[{'label': 'NEGATIVE', 'score': 0.5658664703369141}]


In [34]:
text = "I love using Hugging Face transformers!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
predicted_class = id2label[predicted_class_id]
print(f"Text: {text}\nPredicted class ID: {predicted_class_id}\nPredicted class: {predicted_class}")


Text: I love using Hugging Face transformers!
Predicted class ID: 0
Predicted class: NEGATIVE


In [35]:
logits

tensor([[ 0.0581, -0.2047]], grad_fn=<AddmmBackward0>)