In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
dataset = load_dataset("G:/AI/AI2/2-dataset/data/data.py",name="default")
dataset

In [3]:
import torch

tokenizer = AutoTokenizer.from_pretrained('G:/Model/bert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
tokenized_datasets = dataset.map(tokenize, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22500
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [22]:
model = AutoModelForSequenceClassification.from_pretrained('G:/Model/bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at G:/Model/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=32,  # *** 梯度累加 ***
    gradient_checkpointing=True,     # *** 梯度检查点 ***
    optim="adafactor",               # *** adafactor优化器 *** 
    per_device_eval_batch_size=8,
    logging_steps=10,
    num_train_epochs=1,
    weight_decay=0.01,
    report_to="none",
    load_best_model_at_end=True,
)

CLASS_NAME = {0: "negative", 1: "positive"}

In [31]:
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [32]:
trainer.train()

  0%|          | 0/87 [00:00<?, ?it/s]

{'loss': 0.6237, 'learning_rate': 1.770114942528736e-05, 'epoch': 0.11}
{'loss': 0.4835, 'learning_rate': 1.540229885057471e-05, 'epoch': 0.23}
{'loss': 0.3975, 'learning_rate': 1.310344827586207e-05, 'epoch': 0.34}
{'loss': 0.336, 'learning_rate': 1.0804597701149427e-05, 'epoch': 0.46}
{'loss': 0.2949, 'learning_rate': 8.505747126436782e-06, 'epoch': 0.57}
{'loss': 0.2697, 'learning_rate': 6.206896551724138e-06, 'epoch': 0.68}
{'loss': 0.263, 'learning_rate': 3.908045977011495e-06, 'epoch': 0.8}
{'loss': 0.2705, 'learning_rate': 1.6091954022988506e-06, 'epoch': 0.91}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.24436816573143005, 'eval_runtime': 52.9884, 'eval_samples_per_second': 47.18, 'eval_steps_per_second': 5.907, 'epoch': 0.99}
{'train_runtime': 1937.0624, 'train_samples_per_second': 11.616, 'train_steps_per_second': 0.045, 'train_loss': 0.35667962726505326, 'epoch': 0.99}


TrainOutput(global_step=87, training_loss=0.35667962726505326, metrics={'train_runtime': 1937.0624, 'train_samples_per_second': 11.616, 'train_steps_per_second': 0.045, 'train_loss': 0.35667962726505326, 'epoch': 0.99})

In [33]:
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')

('./sentiment_model\\tokenizer_config.json',
 './sentiment_model\\special_tokens_map.json',
 './sentiment_model\\vocab.txt',
 './sentiment_model\\added_tokens.json',
 './sentiment_model\\tokenizer.json')

In [34]:
test_reviews = [
    "I absolutely loved this movie! The storyline was captivating and the acting was top-notch. A must-watch for everyone.",
    "This movie was a complete waste of time. The plot was predictable and the characters were poorly developed.",
    "An excellent film with a heartwarming story. The performances were outstanding, especially the lead actor.",
    "I found the movie to be quite boring. It dragged on and didn't really go anywhere. Not recommended.",
    "A masterpiece! The director did an amazing job bringing this story to life. The visuals were stunning.",
    "Terrible movie. The script was awful and the acting was even worse. I can't believe I sat through the whole thing.",
    "A delightful film with a perfect mix of humor and drama. The cast was great and the dialogue was witty.",
    "I was very disappointed with this movie. It had so much potential, but it just fell flat. The ending was particularly bad.",
    "One of the best movies I've seen this year. The story was original and the performances were incredibly moving.",
    "I didn't enjoy this movie at all. It was confusing and the pacing was off. Definitely not worth watching."
]


id2_label = {0: "nagetive！", 1: "positive！"}
model.eval()
for sen in test_reviews:
    with torch.inference_mode():
        inputs = tokenizer(sen, return_tensors="pt")
        inputs = {k: v.cuda() for k, v in inputs.items()}
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=-1)
        print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：I absolutely loved this movie! The storyline was captivating and the acting was top-notch. A must-watch for everyone.
模型预测结果:positive！
输入：This movie was a complete waste of time. The plot was predictable and the characters were poorly developed.
模型预测结果:nagetive！
输入：An excellent film with a heartwarming story. The performances were outstanding, especially the lead actor.
模型预测结果:positive！
输入：I found the movie to be quite boring. It dragged on and didn't really go anywhere. Not recommended.
模型预测结果:nagetive！
输入：A masterpiece! The director did an amazing job bringing this story to life. The visuals were stunning.
模型预测结果:positive！
输入：Terrible movie. The script was awful and the acting was even worse. I can't believe I sat through the whole thing.
模型预测结果:nagetive！
输入：A delightful film with a perfect mix of humor and drama. The cast was great and the dialogue was witty.
模型预测结果:positive！
输入：I was very disappointed with this movie. It had so much potential, but it just fell flat. The ending w

In [35]:
from transformers import pipeline

model.config.id2label = id2_label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

In [36]:
for sen in test_reviews:
    print(pipe(sen))

[{'label': 'positive！', 'score': 0.942859411239624}]
[{'label': 'nagetive！', 'score': 0.9542161822319031}]
[{'label': 'positive！', 'score': 0.9366816282272339}]
[{'label': 'nagetive！', 'score': 0.9638954997062683}]
[{'label': 'positive！', 'score': 0.9409827589988708}]
[{'label': 'nagetive！', 'score': 0.9593591690063477}]
[{'label': 'positive！', 'score': 0.9369943737983704}]
[{'label': 'nagetive！', 'score': 0.9648497104644775}]
[{'label': 'positive！', 'score': 0.9420958757400513}]
[{'label': 'nagetive！', 'score': 0.9416175484657288}]
