In [17]:
from datasets import load_dataset

# No `split` is requested here; later cells index the result by split name
# ("train" and "test").
dataset = load_dataset(path="yelp_review_full")

In [18]:
# Load the tokenizer for the "Merged_yml" checkpoint.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("Merged_yml")

# Batches are later assembled by a padding collator, which needs a pad token.
# If the checkpoint's tokenizer does not define one, reuse EOS (the same id the
# model config is given as its pad id further down). No-op when a pad token
# is already configured.
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

In [19]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Inputs longer than 1024 tokens are truncated; no padding is applied here.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by
            ``dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=1024)

In [20]:
# Tokenize the dataset in batches; `preprocess_function` receives many
# examples per call because of `batched=True`.
tokenized_yelp = dataset.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [21]:
from transformers import DataCollatorWithPadding

# Padding is deferred to batch-assembly time (tokenization above does not pad).
data_collator = DataCollatorWithPadding(tokenizer)

In [22]:
import evaluate

# Metric object used by `compute_metrics` to score predictions against labels.
accuracy = evaluate.load(path="accuracy")

In [23]:
import numpy as np
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is reduced
            with ``argmax`` over axis 1, so it is presumably a 2-D array of
            per-class scores (confirm against the model's output).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [24]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a 5-way classification head
# (presumably one class per Yelp star rating; confirm against the dataset labels).
model = AutoModelForSequenceClassification.from_pretrained(
    "Merged_yml", num_labels=5
)

# The model's pad id must match the id the tokenizer/collator actually pads
# with; decoder-style classifiers (the 50257x768 embedding suggests GPT-2)
# rely on it to find the last real token of each sequence. Use the tokenizer's
# pad id when it has one and fall back to EOS only when it does not.
model.config.pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)

# Keep the embedding matrix the same size as the tokenizer vocabulary.
# Left as the final expression so the cell displays the resulting embedding.
model.resize_token_embeddings(len(tokenizer))

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding(50257, 768)

In [25]:
# Shuffle each split with a fixed seed, then keep a fixed-size prefix.
small_train_dataset = (
    tokenized_yelp["train"]
    .shuffle(seed=42)
    .select(range(300_000))
)
small_eval_dataset = (
    tokenized_yelp["test"]
    .shuffle(seed=42)
    .select(range(50_000))
)

In [26]:
# No TrainingArguments and no train_dataset are passed: this Trainer is built
# with library defaults and is only given an evaluation set.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    eval_dataset=small_eval_dataset,
)


In [27]:
# Run evaluation on the held-out subset and keep the returned metrics.
eval_result = trainer.evaluate(small_eval_dataset)

In [29]:
# Show the metrics returned by trainer.evaluate(...) in the previous cell.
print(eval_result)

{'eval_loss': 0.7602638006210327, 'eval_accuracy': 0.67204, 'eval_runtime': 586.3478, 'eval_samples_per_second': 85.274, 'eval_steps_per_second': 10.659}
