In [33]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("stanfordnlp/imdb")
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [26]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [46]:
#raw_datasets['train'] = raw_datasets['train'].select(range(23000, 24000))
raw_datasets['train'] = raw_datasets['train'].shuffle(seed=42).select(range(1000))
#raw_datasets['test'] = raw_datasets['test'].select(range(200))
raw_datasets['test'] = raw_datasets['test'].shuffle(seed=42).select(range(200))
raw_datasets['validate'] = raw_datasets['test'].shuffle(seed=42).select(range(200))

In [53]:
raw_datasets['validate'] = raw_datasets['test'].shuffle(seed=37).select(range(200))

In [35]:
train_labels = raw_datasets['train']['label']
print("Unique labels:", set(train_labels))
print("Label counts:", {label: train_labels.count(label) for label in set(train_labels)})

Unique labels: {0, 1}
Label counts: {0: 512, 1: 488}


In [37]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map: 100%|██████████| 1000/1000 [00:00<00:00, 12305.06 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 12024.95 examples/s]


In [54]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map: 100%|██████████| 200/200 [00:00<00:00, 10449.97 examples/s]


In [38]:
train_dataset = raw_datasets["train"]
print(train_dataset[2:4])

{'text': ['George P. Cosmatos\' "Rambo: First Blood Part II" is pure wish-fulfillment. The United States clearly didn\'t win the war in Vietnam. They caused damage to this country beyond the imaginable and this movie continues the fairy story of the oh-so innocent soldiers. The only bad guys were the leaders of the nation, who made this war happen. The character of Rambo is perfect to notice this. He is extremely patriotic, bemoans that US-Americans didn\'t appreciate and celebrate the achievements of the single soldier, but has nothing but distrust for leading officers and politicians. Like every film that defends the war (e.g. "We Were Soldiers") also this one avoids the need to give a comprehensible reason for the engagement in South Asia. And for that matter also the reason for every single US-American soldier that was there. Instead, Rambo gets to take revenge for the wounds of a whole nation. It would have been better to work on how to deal with the memories, rather than suppress

In [39]:
print(train_dataset[800:1000]['label'])

[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1]


In [40]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [41]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [43]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=375, training_loss=0.2925295613606771, metrics={'train_runtime': 750.786, 'train_samples_per_second': 3.996, 'train_steps_per_second': 0.499, 'total_flos': 368950682500416.0, 'train_loss': 0.2925295613606771, 'epoch': 3.0})

TrainOutput(global_step=375, training_loss=0.2925295613606771, metrics={'train_runtime': 750.786, 'train_samples_per_second': 3.996, 'train_steps_per_second': 0.499, 'total_flos': 368950682500416.0, 'train_loss': 0.2925295613606771, 'epoch': 3.0})
{'accuracy': {'accuracy': 0.845}, 'precision': {'precision': 0.8155339805825242}, 'recall': {'recall': 0.875}, 'f1': {'f1': 0.8442211055276382}}

In [36]:
# Get some test examples
test_examples = raw_datasets["test"].select(range(5))  # First 5 test examples

# Make predictions
predictions = trainer.predict(tokenized_datasets["test"].select(range(5)))
predicted_labels = predictions.predictions.argmax(-1)

# Print results
for i, example in enumerate(test_examples):
    actual_label = "Positive" if example["label"] == 1 else "Negative"
    predicted_label = "Positive" if predicted_labels[i] == 1 else "Negative"
    
    print(f"Review: {example['text'][:100]}...")  # First 100 characters
    print(f"Actual: {actual_label}")
    print(f"Predicted: {predicted_label}")
    print("-" * 50)

Review: <br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining Kin...
Actual: Positive
Predicted: Negative
--------------------------------------------------
Review: This is the latest entry in the long series of films with the French agent, O.S.S. 117 (the French a...
Actual: Positive
Predicted: Negative
--------------------------------------------------
Review: This movie was so frustrating. Everything seemed energetic and I was totally prepared to have a good...
Actual: Negative
Predicted: Negative
--------------------------------------------------
Review: I was truly and wonderfully surprised at "O' Brother, Where Art Thou?" The video store was out of al...
Actual: Positive
Predicted: Negative
--------------------------------------------------
Review: This movie spends most of its time preaching that it is the script that makes the movie, but apparen...
Actual: Negative
Predicted: Negative
--------------------------------------------------


In [44]:
# Get some test examples
test_examples = raw_datasets["test"].select(range(5))  # First 5 test examples

# Make predictions
predictions = trainer.predict(tokenized_datasets["test"].select(range(5)))
predicted_labels = predictions.predictions.argmax(-1)

# Print results
for i, example in enumerate(test_examples):
    actual_label = "Positive" if example["label"] == 1 else "Negative"
    predicted_label = "Positive" if predicted_labels[i] == 1 else "Negative"
    
    print(f"Review: {example['text'][:100]}...")  # First 100 characters
    print(f"Actual: {actual_label}")
    print(f"Predicted: {predicted_label}")
    print("-" * 50)

Review: <br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining Kin...
Actual: Positive
Predicted: Positive
--------------------------------------------------
Review: This is the latest entry in the long series of films with the French agent, O.S.S. 117 (the French a...
Actual: Positive
Predicted: Positive
--------------------------------------------------
Review: This movie was so frustrating. Everything seemed energetic and I was totally prepared to have a good...
Actual: Negative
Predicted: Negative
--------------------------------------------------
Review: I was truly and wonderfully surprised at "O' Brother, Where Art Thou?" The video store was out of al...
Actual: Positive
Predicted: Positive
--------------------------------------------------
Review: This movie spends most of its time preaching that it is the script that makes the movie, but apparen...
Actual: Negative
Predicted: Negative
--------------------------------------------------


In [None]:
predictions = trainer.predict(tokenized_datasets["validate"])
print(predictions.predictions.shape, predictions.label_ids.shape)


(200, 2) (200,)


In [49]:
print(predictions)

PredictionOutput(predictions=array([[ 2.6657157e+00, -2.9747741e+00],
       [-2.9453065e+00,  2.2767632e+00],
       [ 2.9884570e+00, -3.2128837e+00],
       [ 2.7949965e+00, -3.0002401e+00],
       [-2.8481870e+00,  2.1684809e+00],
       [-2.9810469e+00,  2.2943504e+00],
       [ 3.0110753e+00, -3.2257469e+00],
       [ 2.7964444e+00, -3.0052059e+00],
       [ 2.7993841e+00, -3.0574789e+00],
       [ 2.4118650e+00, -2.5639350e+00],
       [ 5.0808156e-01, -8.0128598e-01],
       [-2.9015496e+00,  2.2048423e+00],
       [-2.1912918e+00,  1.6404479e+00],
       [-2.9122851e+00,  2.2361152e+00],
       [ 5.1250839e-01, -9.1401166e-01],
       [ 2.8740833e+00, -3.0308948e+00],
       [-2.9463322e+00,  2.2631781e+00],
       [-1.9188563e+00,  1.4704189e+00],
       [ 1.0887568e+00, -1.3652998e+00],
       [-2.5365207e+00,  1.9733919e+00],
       [ 2.6415060e+00, -2.9479024e+00],
       [-2.7221725e+00,  2.0173438e+00],
       [ 2.9029100e+00, -3.0746820e+00],
       [ 2.9352109e+00, -3.1

In [56]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [51]:
print(preds)

[0 1 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0
 0 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1
 0 0 1 0 1 1 0 1 0 1 0 0 1 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 1 0 0 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 0 1
 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1
 1 1 0 1 0 1 1 1 1 0 1 0 0 1 1]


In [None]:
import evaluate

trainer.train()

predictions = trainer.predict(tokenized_datasets["validate"])
print(predictions.predictions.shape, predictions.label_ids.shape)


preds = np.argmax(predictions.predictions, axis=-1)

# Load multiple metrics for binary classification
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Compute all metrics
results = {
    "accuracy": accuracy.compute(predictions=preds, references=predictions.label_ids),
    "precision": precision.compute(predictions=preds, references=predictions.label_ids),
    "recall": recall.compute(predictions=preds, references=predictions.label_ids),
    "f1": f1.compute(predictions=preds, references=predictions.label_ids)
}

print(results)


Downloading builder script: 4.20kB [00:00, 5.67MB/s]
Downloading builder script: 7.56kB [00:00, 6.18MB/s]
Downloading builder script: 7.38kB [00:00, 5.66MB/s]
Downloading builder script: 6.79kB [00:00, 5.58MB/s]

{'accuracy': {'accuracy': 0.845}, 'precision': {'precision': 0.8155339805825242}, 'recall': {'recall': 0.875}, 'f1': {'f1': 0.8442211055276382}}





In [60]:
# Compute all metrics
preds = np.argmax(predictions.predictions, axis=-1)

results = {
    "accuracy": accuracy.compute(predictions=preds, references=predictions.label_ids),
    "precision": precision.compute(predictions=preds, references=predictions.label_ids),
    "recall": recall.compute(predictions=preds, references=predictions.label_ids),
    "f1": f1.compute(predictions=preds, references=predictions.label_ids)
}

print(results)

{'accuracy': {'accuracy': 0.845}, 'precision': {'precision': 0.8155339805825242}, 'recall': {'recall': 0.875}, 'f1': {'f1': 0.8442211055276382}}


In [59]:

# Get predictions
predictions = trainer.predict(tokenized_datasets["test"])
predicted_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Find wrong predictions
wrong_indices = []
for i, (pred, true) in enumerate(zip(predicted_labels, true_labels)):
    if pred != true:
        wrong_indices.append(i)

print(f"Found {len(wrong_indices)} wrong predictions out of {len(true_labels)}")

# Show first 5 wrong predictions
for i in wrong_indices[:5]:
    example = raw_datasets["test"][i]
    pred_label = "Positive" if predicted_labels[i] == 1 else "Negative"
    true_label = "Positive" if true_labels[i] == 1 else "Negative"
    
    print(f"\nExample {i}:")
    print(f"Text: {example['text'][:200]}...")  # First 200 chars
    print(f"True label: {true_label}")
    print(f"Predicted: {pred_label}")
    print("-" * 50)


Found 31 wrong predictions out of 200

Example 24:
Text: Purchased this film for one dollar and figured I could never go wrong, my big mistake was watching it. Enjoyed the acting of Ice-T and the rapping which gave lots of class to this film about Los Angel...
True label: Negative
Predicted: Positive
--------------------------------------------------

Example 30:
Text: A truly masterful piece of filmmaking. It managed to put me to sleep and to boggle my mind. So boring that it induces sleep and yet so ludicrous that it made me wonder how stuff like this gets made. A...
True label: Negative
Predicted: Positive
--------------------------------------------------

Example 31:
Text: I used to always love the bill because of its great script and characters, but lately i feel as though it has turned into an emotional type of soap. If you look at promotional pictures/posters of the ...
True label: Negative
Predicted: Positive
--------------------------------------------------

Example 32:
Text

In [69]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    
    # Learning rate tuning
    learning_rate=3e-5,  # Try 1e-5, 2e-5, 3e-5, 5e-5
    lr_scheduler_type="cosine",  # Try "linear", "cosine", "polynomial"
    
    # Training dynamics
    num_train_epochs=3,  # Try 3-10 epochs
    warmup_ratio=0.1,    # Warm up learning rate
    weight_decay=0.01,   # Regularization
    
    # Batch size (adjust based on GPU memory)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    
    # Early stopping and saving
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_accuracy",
    #greater_is_better=True,
)


In [70]:
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    processing_class=tokenizer
)

In [71]:


trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0265,0.898974
2,0.0239,0.984288
3,0.0075,0.981252


TrainOutput(global_step=189, training_loss=0.01933136062016563, metrics={'train_runtime': 807.2904, 'train_samples_per_second': 3.716, 'train_steps_per_second': 0.234, 'total_flos': 391830286012032.0, 'train_loss': 0.01933136062016563, 'epoch': 3.0})

{'accuracy': {'accuracy': 0.845}, 'precision': {'precision': 0.8651685393258427}, 'recall': {'recall': 0.8020833333333334}, 'f1': {'f1': 0.8324324324324325}

TrainOutput(global_step=375, training_loss=0.2925295613606771, metrics={'train_runtime': 750.786, 'train_samples_per_second': 3.996, 'train_steps_per_second': 0.499, 'total_flos': 368950682500416.0, 'train_loss': 0.2925295613606771, 'epoch': 3.0})
{'accuracy': {'accuracy': 0.845}, 'precision': {'precision': 0.8155339805825242}, 'recall': {'recall': 0.875}, 'f1': {'f1': 0.8442211055276382}}

In [72]:

predictions = trainer.predict(tokenized_datasets["validate"])
print(predictions.predictions.shape, predictions.label_ids.shape)


preds = np.argmax(predictions.predictions, axis=-1)



(200, 2) (200,)


In [73]:
# Load multiple metrics for binary classification
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Compute all metrics
results = {
    "accuracy": accuracy.compute(predictions=preds, references=predictions.label_ids),
    "precision": precision.compute(predictions=preds, references=predictions.label_ids),
    "recall": recall.compute(predictions=preds, references=predictions.label_ids),
    "f1": f1.compute(predictions=preds, references=predictions.label_ids)
}

print(results)


{'accuracy': {'accuracy': 0.845}, 'precision': {'precision': 0.8651685393258427}, 'recall': {'recall': 0.8020833333333334}, 'f1': {'f1': 0.8324324324324325}}
