In [1]:
# Colab convenience: clone repo if needed
from pathlib import Path

if not Path("pyproject.toml").exists():
    if not Path("pjatk_zum").exists():
        !git clone https://github.com/beep1000101/pjatk_zum.git
    else:
        print("Repo folder already present: pjatk_zum")
else:
    print("Already in repo root (pyproject.toml found)")

# Colab convenience: cd into repo folder if we cloned it
from pathlib import Path

if Path("pyproject.toml").exists():
    print("Already in repo root")
elif Path("pjatk_zum").exists():
    %cd pjatk_zum
    !python data_ingestion/sentiment_embeddings/run.py
else:
    raise FileNotFoundError("Could not find repo root (pyproject.toml) or ./pjatk_zum")

Repo folder already present: pjatk_zum
/content/pjatk_zum
[sentiment_embeddings] Downloading: https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[sentiment_embeddings] Cache file: .cache/sentiment_embeddings/aclImdb_v1.tar.gz
[sentiment_embeddings] Extracting into: .cache/sentiment_embeddings/raw
Wrote provenance: .cache/sentiment_embeddings/provenance.json


In [3]:
%pip install evaluate -q

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from pathlib import Path
from enum import StrEnum

import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import evaluate

import pandas as pd
from utils.paths import CACHE_PATH

In [3]:
data_path = CACHE_PATH / "sentiment_embeddings"
raw_data_path = data_path / "raw" / "aclImdb"
test_directory_path = raw_data_path / "test"
train_directory_path = raw_data_path / "train"

In [4]:
class Sentiment(StrEnum):
    POS = "pos"
    NEG = "neg"

def _load_data(sentiment: Sentiment, dataset_path: Path):
    if sentiment == Sentiment.POS:
        sentiment_value = 1
    elif sentiment == Sentiment.NEG:
        sentiment_value = 0
    else:
        raise ValueError()

    files = []
    path_to_data = dataset_path / sentiment
    file_paths = path_to_data.glob("*.txt")
    for file_path in file_paths:
        with open(file=file_path) as text_file:
            files.append(text_file.read())

    sentiment_list = [sentiment_value] * len(files)

    return files, sentiment_list

def load_dataframe(sentiment: Sentiment, dataset_path: Path):
    columns = ["text", "sentiment_value"]
    df = pd.DataFrame(dict(zip(columns, _load_data(sentiment=sentiment, dataset_path=dataset_path))))
    return df

def load_dataset(dataset_path: Path):
    dataset_df = pd.concat([load_dataframe(sentiment=sentiment, dataset_path=dataset_path) for sentiment in Sentiment])
    return dataset_df


In [5]:
df_train_full = load_dataset(dataset_path=train_directory_path)

In [6]:
df_train_full

Unnamed: 0,text,sentiment_value
0,"As Jack Nicholson's directorial debut, Drive, ...",1
1,Orson Welles manages to knock me on my ass wit...,1
2,Most war films made in the US during WWII were...,1
3,I watched Lion king more times that all my fri...,1
4,There is not much more I can say about this mo...,1
...,...,...
12495,Notice I have given this 1 star if the option ...,0
12496,"For all of the hype about this film, I kept an...",0
12497,Yeah i saw the rough cuts. The unedited sex sc...,0
12498,Hammerhead is a combination between the mad sc...,0


In [7]:
train_df, val_df = train_test_split(
    df_train_full, test_size=0.2, random_state=42, stratify=df_train_full["sentiment_value"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Trainer expects label column named "labels"
train_ds = train_ds.rename_column("sentiment_value", "labels")
val_ds = val_ds.rename_column("sentiment_value", "labels")

cols = ["input_ids", "attention_mask", "labels"]
train_ds.set_format(type="torch", columns=cols)
val_ds.set_format(type="torch", columns=cols)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

# transformers==5 uses `eval_strategy` (not `evaluation_strategy`)
args = TrainingArguments(
    output_dir="outputs/sentiment_distilbert",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=2,
    seed=42,
    data_seed=42,
    remove_unused_columns=True,
    fp16=False,
    bf16=False,
    use_cpu=False,
 )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
 )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2783,0.238642,0.9056,0.905519
2,0.1718,0.270515,0.9134,0.913366


TrainOutput(global_step=2500, training_loss=0.23995217475891115, metrics={'train_runtime': 943.5632, 'train_samples_per_second': 42.392, 'train_steps_per_second': 2.65, 'total_flos': 2649285879026880.0, 'train_loss': 0.23995217475891115, 'epoch': 2.0})

In [9]:
trainer.evaluate()

{'eval_loss': 0.2705150246620178,
 'eval_accuracy': 0.9134,
 'eval_f1': 0.9133660360207616,
 'eval_runtime': 32.6417,
 'eval_samples_per_second': 153.178,
 'eval_steps_per_second': 9.589,
 'epoch': 2.0}

In [28]:
import torch

In [30]:
test_df = load_dataset(dataset_path=test_directory_path)
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True).rename(columns={"sentiment_value": "labels"})).map(tokenize, batched=True)
test_ds.set_format(type="torch", columns=cols)

print(f"Type of test_ds: {type(test_ds)}")
print(f"Value of test_ds: {test_ds}")
if test_ds is None:
    print("test_ds is None!")
elif isinstance(test_ds, Dataset):
    print(f"test_ds successfully created with {len(test_ds)} rows.")
else:
    print("test_ds is an unexpected type or value.")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Type of test_ds: <class 'datasets.arrow_dataset.Dataset'>
Value of test_ds: Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 25000
})
test_ds successfully created with 25000 rows.


In [10]:
!zip -r sentiment_distilbert_best.zip outputs/sentiment_distilbert/best


zip error: Nothing to do! (try: zip -r sentiment_distilbert_best.zip . -i outputs/sentiment_distilbert/best)


In [26]:
test_ds

In [31]:
# --- Evaluation: Run after training ---
from notebooks.sentiment_embeddings.helpers import run_inference, compute_metrics, find_hf_model_dir
import json


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Load test set (reuse your data loading logic)
labels = ["neg", "pos"]

# Run inference and compute metrics
all_preds, all_labels = run_inference(model, tokenizer, device, test_ds)
metrics = compute_metrics(all_labels, all_preds, labels)

# Save metrics
with open('metrics.json', 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)
metrics

Running inference on 25000 samples...


AttributeError: 'Dataset' object has no attribute 'iterrows'

# Task
Replace the code in cell `9f6ecb1a` with the following:
```python
import torch
from torch.utils.data import DataLoader
import numpy as np
import json

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Create DataLoader for test_ds using the existing collator
# collator is DataCollatorWithPadding(tokenizer=tokenizer) from a previous cell
test_dataloader = DataLoader(test_ds, batch_size=16, collate_fn=collator, shuffle=False)

all_preds = []
all_labels = []

model.eval() # Set model to evaluation mode
with torch.no_grad():
    for batch in test_dataloader:
        # Move batch items to device, excluding 'labels' for input
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert to numpy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Compute metrics using the previously loaded evaluate objects 'accuracy' and 'f1'
accuracy_score = accuracy.compute(predictions=all_preds, references=all_labels)["accuracy"]
f1_score = f1.compute(predictions=all_preds, references=all_labels, average="macro")["f1"]

metrics = {
    "test_accuracy": accuracy_score,
    "test_f1_macro": f1_score
}

print("Evaluation Metrics:")
print(f"  Accuracy: {metrics['test_accuracy']:.4f}")
print(f"  F1 (macro): {metrics['test_f1_macro']:.4f}")

# Save metrics
with open('metrics.json', 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved to metrics.json")
```

## Modify Evaluation Cell

### Subtask:
Replace the existing code in cell `9f6ecb1a` to remove the `helpers` imports. Implement a manual inference loop using a `torch.utils.data.DataLoader` for `test_ds` to collect predictions and true labels. Then, use the previously loaded `evaluate` metrics objects (`accuracy` and `f1`) to compute and display the final evaluation metrics. Finally, save these metrics to `metrics.json`.


**Reasoning**:
The previous code failed because the `run_inference` function from `helpers.py` expected a pandas DataFrame with `iterrows()`, but `test_ds` is a Hugging Face `Dataset` object. To fix this, I will replace the cell content to implement a manual inference loop using a `torch.utils.data.DataLoader` for `test_ds`, collect predictions and true labels, and then use the already loaded `accuracy` and `f1` metric objects to compute and save the evaluation metrics.



In [32]:
import torch
import json
from torch.utils.data import DataLoader
import numpy as np

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Create a DataLoader for the test dataset
test_dataloader = DataLoader(test_ds, batch_size=16, collate_fn=collator, shuffle=False)

# Initialize lists to store predictions and true labels
all_preds = []
all_labels = []

print(f"Running inference on {len(test_ds)} samples...")
model.eval()
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

        if (i + 1) % 100 == 0 or (i + 1) == len(test_dataloader):
            print(f"Processed batch {i+1}/{len(test_dataloader)}", flush=True)

# Compute metrics
metrics = {
    "accuracy": accuracy.compute(predictions=all_preds, references=all_labels)["accuracy"],
    "f1": f1.compute(predictions=all_preds, references=all_labels, average="macro")["f1"],
}

# Save metrics
with open('metrics.json', 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

print("\nEvaluation Metrics:")
print(metrics)

Running inference on 25000 samples...
Processed batch 100/1563
Processed batch 200/1563
Processed batch 300/1563
Processed batch 400/1563
Processed batch 500/1563
Processed batch 600/1563
Processed batch 700/1563
Processed batch 800/1563
Processed batch 900/1563
Processed batch 1000/1563
Processed batch 1100/1563
Processed batch 1200/1563
Processed batch 1300/1563
Processed batch 1400/1563
Processed batch 1500/1563
Processed batch 1563/1563

Evaluation Metrics:
{'accuracy': 0.91124, 'f1': 0.9112193800262784}
