In [None]:
import os

# Export CUDA_LAUNCH_BLOCKING=1 into this process's environment.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [None]:
# 1) Clean out conflicting installs
!pip uninstall -y deepspeed torch torchvision torchaudio transformers tokenizers accelerate datasets evaluate simpletransformers
!pip cache purge -y

# 2) Install CUDA 12.1 PyTorch build compatible with DeepSpeed 0.13.5
!pip install -U torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121

# 3) Install the NLP/HPC stack with compatible versions
!pip install -U deepspeed==0.13.5 transformers==4.44.2 datasets==2.19.1 accelerate==0.33.0 evaluate==0.4.2 bitsandbytes==0.43.1

# 4) Hard restart the runtime so new libs load
import os; os.kill(os.getpid(), 9)



[0mFound existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
Found existing installation: accelerate 1.11.0
Uninstalling accelerate-1.11.0:
  Successfully uninstalled accelerate-1.11.0
Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
[0m
Usage:   
  pip3 cache dir
  pip3 cache info
  pip3 cache list [<pattern>] [--format=[huma

Collecting deepspeed==0.13.5
  Downloading deepspeed-0.13.5.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.19.1
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate==0.33.0
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting evaluate==0.4.2
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting hjson (from deepspeed==0.13.5)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting ninja (f

In [None]:
import torch, transformers, deepspeed, inspect

# Report the versions that actually got loaded after the runtime restart.
for _label, _value in (
    ("Torch:", torch.__version__),
    ("Transformers:", transformers.__version__),
    ("DeepSpeed:", deepspeed.__version__),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(_label, _value)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Confirm TrainingArguments supports modern args
from transformers import TrainingArguments
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
print("Has evaluation_strategy:", "evaluation_strategy" in _ta_params)


[2025-10-23 12:13:16,422] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Torch: 2.3.1+cu121
Transformers: 4.44.2
DeepSpeed: 0.13.5
CUDA available: True
GPU: Tesla T4
Has evaluation_strategy: True


In [None]:
# Quietly remove cudf-cu12 and pylibcudf-cu12 from the runtime.
# NOTE(review): presumably done to avoid dependency conflicts with the pinned torch stack — confirm.
!pip -q uninstall -y cudf-cu12 pylibcudf-cu12

In [None]:
!pip -q install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
import os, sys

# Hard-restart the runtime: send signal 9 to this kernel's own process so that
# freshly installed packages are the ones imported afterwards.
_kernel_pid = os.getpid()
os.kill(_kernel_pid, 9)

In [None]:
# --- Imports & setup ---
import os, time, json, random, math, glob
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)

# Metrics via scikit-learn (no HF evaluate)
from sklearn.metrics import accuracy_score, f1_score

# Reproducibility: seed Python, NumPy and torch (plus every CUDA device when present).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into accuracy and macro-F1.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with plain-float values under the keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    acc = accuracy_score(gold, predicted)
    # macro-F1 weights every class equally regardless of its frequency
    macro_f1 = f1_score(gold, predicted, average="macro")
    return {"accuracy": float(acc), "f1": float(macro_f1)}

def now():
    """Return the current timestamp from time.time() (seconds, as a float)."""
    return time.time()

def device_info():
    """Describe the first accelerator as {"gpu": <name>, "mem_gb": <total GiB as text>}.

    Returns {"gpu": "CPU", "mem_gb": "NA"} when torch sees no CUDA device.
    Otherwise NVML (pynvml) is tried first; if it is missing or fails, the
    values are read from torch's device properties instead, and only if that
    fails as well is "mem_gb" reported as "?".
    """
    if not torch.cuda.is_available():
        return {"gpu":"CPU","mem_gb":"NA"}
    try:
        import pynvml
        pynvml.nvmlInit()
        try:
            h = pynvml.nvmlDeviceGetHandleByIndex(0)
            name = pynvml.nvmlDeviceGetName(h)
            # Some pynvml releases return bytes and others str; an unconditional
            # .decode() raised on str and pushed every run into the "?" fallback.
            if isinstance(name, bytes):
                name = name.decode()
            mem = pynvml.nvmlDeviceGetMemoryInfo(h).total / (1024**3)
        finally:
            # Balance nvmlInit() so the NVML handle is not leaked.
            pynvml.nvmlShutdown()
        return {"gpu":name, "mem_gb": f"{mem:.1f}"}
    except Exception:
        # Best-effort fallback: torch can report the name and total memory itself.
        try:
            props = torch.cuda.get_device_properties(0)
            return {"gpu": props.name, "mem_gb": f"{props.total_memory / (1024**3):.1f}"}
        except Exception:
            return {"gpu": torch.cuda.get_device_name(0), "mem_gb":"?"}

# Quick sanity print of what torch can see in this session.
_cuda_ok = torch.cuda.is_available()
print("CUDA available:", _cuda_ok)
if _cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))



CUDA available: True
GPU: Tesla T4


In [None]:
def load_imdb(tokenizer, max_len=128):
    """Load the "imdb" dataset, tokenize it, and expose the target as "labels".

    Every split is tokenized with truncation at `max_len`; the raw "text"
    column is dropped and "label" is renamed to "labels".
    """
    def _encode(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_len)

    raw = load_dataset("imdb")
    encoded = raw.map(_encode, batched=True, remove_columns=["text"])
    return encoded.rename_column("label", "labels")

def load_sentiment140(tokenizer, max_len=128, train_samples=200_000, test_samples=10_000):
    """Load "sentiment140" as a binary task and return (dataset, num_labels).

    Rows whose "sentiment" is neither 0 nor 4 are dropped, and the remaining
    values are remapped 0 -> 0 and 4 -> 1 so that the label ids are contiguous
    and match `num_labels`. The previous version kept the raw 0/2/4 values and
    returned only the dataset, while `train_run` unpacks `(ds, num_labels)`.

    Args:
        tokenizer: callable tokenizer applied to the "text" column; it is not mutated.
        max_len: truncation length passed to the tokenizer.
        train_samples / test_samples: upper bounds on the shuffled subsample
            kept from the "train" / "test" splits.

    Returns:
        (ds, 2): the tokenized dataset dict with a "labels" column, and the class count.
    """
    ds = load_dataset("sentiment140")

    # Keep only negative (0) and positive (4) rows.
    ds = ds.filter(lambda ex: ex["sentiment"] in (0, 4))

    def map_labels(ex):
        # {0, 4} -> {0, 1}
        return {"labels": 0 if ex["sentiment"] == 0 else 1}
    ds = ds.map(map_labels)

    def tok(ex):
        # max_length is passed explicitly, so there is no need to overwrite
        # tokenizer.model_max_length on the caller's tokenizer object.
        return tokenizer(ex["text"], truncation=True, max_length=max_len)
    drop_cols = [c for c in ds["train"].column_names if c != "labels"]
    ds = ds.map(tok, batched=True, remove_columns=drop_cols)

    # Subsample for speed in Colab; adjust up later for full runs
    ds["train"] = ds["train"].shuffle(SEED).select(range(min(train_samples, len(ds["train"]))))
    ds["test"]  = ds["test"].shuffle(SEED).select(range(min(test_samples,  len(ds["test"]))))
    return ds, 2

def load_amazon_reviews_multi_en(tokenizer, max_len=128, train_samples=100_000, test_samples=10_000):
    """Load the "en" config of "amazon_reviews_multi" as a 5-class task.

    "stars" (1..5) becomes "labels" (0..4) and "review_body" becomes the text
    that is tokenized with truncation at `max_len`. The "train" and "test"
    splits are then shuffled with SEED and capped at `train_samples` /
    `test_samples` rows.
    """
    def _to_example(row):
        row["labels"] = int(row["stars"])-1
        row["text"]   = row["review_body"]
        return row

    def _encode(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_len)

    raw = load_dataset("amazon_reviews_multi", "en")
    unwanted = [c for c in raw["train"].column_names if c not in ["text","labels"]]
    ds = raw.map(_to_example, remove_columns=unwanted)
    ds = ds.map(_encode, batched=True, remove_columns=["text"])

    for split, cap in (("train", train_samples), ("test", test_samples)):
        shuffled = ds[split].shuffle(SEED)
        ds[split] = shuffled.select(range(min(cap, len(shuffled))))
    return ds


In [None]:
from pathlib import Path
import inspect, transformers

def train_run(
    dataset_name="imdb",
    model_name="bert-base-uncased",
    output_dir="runs/imdb_baseline_fp16",
    max_len=128,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    lr=2e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=1,
    fp16=True,
    deepspeed_json=None
):
    """Fine-tune a sequence classifier on one dataset and write a one-row summary CSV.

    Args:
        dataset_name: "imdb", "sentiment140" or "amazon_multi".
        model_name: checkpoint name for the tokenizer and the classifier.
        output_dir: Trainer output directory; "summary.csv" is written there too.
        max_len: tokenizer truncation length.
        per_device_train_batch_size / per_device_eval_batch_size,
        num_train_epochs, lr, warmup_ratio, gradient_accumulation_steps, fp16:
            forwarded to TrainingArguments.
        deepspeed_json: optional DeepSpeed config path forwarded as `deepspeed`.

    Returns:
        dict with the run configuration, the wall-clock training time, the
        training throughput in samples/second, and every numeric metric from
        the final `trainer.evaluate()` call.

    Raises:
        ValueError: if `dataset_name` is not one of the supported names.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    if dataset_name == "imdb":
        ds = load_imdb(tokenizer, max_len)
        num_labels = 2

    elif dataset_name == "sentiment140":
        # Requires the loader variant that returns (dataset, num_labels).
        ds, num_labels = load_sentiment140(
            tokenizer,
            max_len=max_len,
            train_samples=200_000,
            test_samples=10_000,
        )

    elif dataset_name == "amazon_multi":
        ds = load_amazon_reviews_multi_en(tokenizer, max_len)
        num_labels = 5

    else:
        raise ValueError("Unknown dataset_name")

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8 if fp16 else None)

    TA = transformers.TrainingArguments
    params = inspect.signature(TA.__init__).parameters

    kw = dict(
        output_dir=output_dir,
        learning_rate=lr,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        weight_decay=0.01,
        fp16=fp16,
        report_to="none",
    )

    extra = {
        "save_strategy": "epoch",
        "logging_strategy": "steps",
        "logging_steps": 50,
        "metric_for_best_model": "f1",
        "greater_is_better": True,
        "deepspeed": deepspeed_json
    }
    # The per-epoch evaluation switch is called `eval_strategy` in newer
    # transformers releases and `evaluation_strategy` in older ones.
    eval_key = next((k for k in ("eval_strategy", "evaluation_strategy") if k in params), None)
    if eval_key is not None:
        extra[eval_key] = "epoch"
        # Reloading the best checkpoint only makes sense when evaluation runs on
        # the same schedule as saving, so request it together with the eval
        # strategy instead of unconditionally.
        extra["load_best_model_at_end"] = True
    for k, v in extra.items():
        if k in params:
            kw[k] = v

    args = TA(**kw)

    # Newer Trainer versions take the tokenizer as `processing_class`;
    # older ones only know `tokenizer`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kw = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kw: tokenizer}
    )

    t0 = now()
    trainer.train()
    t1 = now()
    eval_out = trainer.evaluate()

    total_train_examples = len(ds["train"])
    seconds = t1 - t0
    # Every epoch processes the whole training split, so the number of samples
    # seen is examples * epochs; dividing examples alone under-reports
    # multi-epoch runs by a factor of `num_train_epochs`.
    samples_seen = total_train_examples * num_train_epochs
    throughput = samples_seen / seconds if seconds > 0 else float("nan")

    info = device_info()
    results = {
        "dataset": dataset_name,
        "model": model_name,
        "gpu": info["gpu"],
        "gpu_mem_gb": info["mem_gb"],
        "epochs": num_train_epochs,
        "batch_train": per_device_train_batch_size,
        "batch_eval": per_device_eval_batch_size,
        "grad_accum": gradient_accumulation_steps,
        "seq_len": max_len,
        "fp16": fp16,
        "deepspeed": bool(deepspeed_json),
        "train_time_sec": round(seconds, 2),
        "throughput_sps": round(throughput, 2),
    }
    # Copy over the numeric evaluation metrics only.
    for k, v in eval_out.items():
        if isinstance(v, (int, float)):
            results[k] = float(v)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    pd.DataFrame([results]).to_csv(os.path.join(output_dir, "summary.csv"), index=False)
    print("RESULTS:", results)
    return results


In [None]:
# --- Optional: DeepSpeed ZeRO-2 config ---
# The precision, batch-size and accumulation entries are set to "auto" so the
# Hugging Face Trainer integration fills them in from TrainingArguments. With
# hard-coded numbers, any train_run(...) call that used a different batch size,
# accumulation step count or fp16 setting would disagree with this file.
os.makedirs("ds_cfg", exist_ok=True)
ds_cfg = {
  "fp16": {"enabled": "auto"},
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": True,
    "overlap_comm": True,
    "reduce_scatter": True,
    "contiguous_gradients": True
  },
  "gradient_accumulation_steps": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}
with open("ds_cfg/zero2.json","w") as f:
    json.dump(ds_cfg, f, indent=2)
print("Saved DeepSpeed config at ds_cfg/zero2.json")



Saved DeepSpeed config at ds_cfg/zero2.json


In [None]:
# Baseline run: BERT-base on IMDB with fp16 and no DeepSpeed config.
_imdb_baseline_kwargs = dict(
    dataset_name="imdb",
    model_name="bert-base-uncased",
    output_dir="runs/imdb_baseline_fp16",
    max_len=128,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    lr=2e-5,
    gradient_accumulation_steps=1,
    fp16=True,
    deepspeed_json=None,
)
r1 = train_run(**_imdb_baseline_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2952,0.290633,0.8816,0.881489
2,0.1718,0.31622,0.89148,0.891475


RESULTS: {'dataset': 'imdb', 'model': 'bert-base-uncased', 'gpu': 'Tesla T4', 'gpu_mem_gb': '?', 'epochs': 2, 'batch_train': 16, 'batch_eval': 32, 'grad_accum': 1, 'seq_len': 128, 'fp16': True, 'deepspeed': False, 'train_time_sec': 528.21, 'throughput_sps': 47.33, 'eval_loss': 0.31622007489204407, 'eval_accuracy': 0.89148, 'eval_f1': 0.8914753865752735, 'eval_runtime': 44.4491, 'eval_samples_per_second': 562.441, 'eval_steps_per_second': 17.593, 'epoch': 2.0}


In [None]:
# === Force CPU for this session ===
# NOTE(review): hiding the GPUs this way presumably only works if it runs before
# torch has initialised CUDA in this kernel — confirm the cell order / restart first.
import os

os.environ.update({
    "CUDA_VISIBLE_DEVICES": "",          # hide GPUs from torch
    "TOKENIZERS_PARALLELISM": "false",
})



In [None]:
from datasets import load_dataset, ClassLabel

def load_sentiment140(tokenizer, max_len=128, train_samples=200_000, test_samples=10_000):
    """Load "sentiment140" as a binary task and return (dataset, num_labels).

    Rows with "sentiment" outside {0, 4} are filtered out, the rest are mapped
    to labels {0, 1} and cast to a two-class ClassLabel. Text is tokenized with
    truncation at `max_len`, the "train"/"test" splits are shuffled (seed 42)
    and capped at `train_samples` / `test_samples`, and the result is exposed
    in torch format with "input_ids", "attention_mask" and "labels".

    Prints the unique labels per split and asserts they are within {0, 1}.
    """
    def _is_polar(row):
        # Keep only negative (0) and positive (4); drop neutral (2)
        return row["sentiment"] in (0, 4)

    def _binarize(row):
        row["labels"] = 0 if row["sentiment"] == 0 else 1  # {0,4} -> {0,1}
        row["text"]   = row["text"]  # no-op re-assignment kept from the original
        return row

    def _encode(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_len)

    ds = load_dataset("sentiment140")
    ds = ds.filter(_is_polar)
    ds = ds.map(_binarize)

    # Declare the label column as an explicit two-class ClassLabel.
    label_feature = ClassLabel(num_classes=2, names=["negative","positive"])
    ds = ds.cast_column("labels", label_feature)

    # Tokenize, dropping every source column except "text" and "labels".
    droppable = [c for c in ds["train"].column_names if c not in ["text","labels"]]
    ds = ds.map(_encode, batched=True, remove_columns=droppable)

    # Subsample for speed (adjust as needed)
    for split, cap in (("train", train_samples), ("test", test_samples)):
        shuffled = ds[split].shuffle(42)
        ds[split] = shuffled.select(range(min(cap, len(shuffled))))

    # Ensure torch format & dtypes
    ds = ds.with_format(type="torch", columns=["input_ids","attention_mask","labels"])

    # Safety check
    utrain = set(ds["train"]["labels"].tolist())
    utest  = set(ds["test"]["labels"].tolist())
    print("Unique train labels:", utrain, "Unique test labels:", utest)
    assert utrain.issubset({0,1}) and utest.issubset({0,1}), "Labels not in {0,1}"
    return ds, 2  # (dataset, num_labels)


In [None]:
from pathlib import Path
import inspect, transformers
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)

def train_run(
    dataset_name="imdb",
    model_name="bert-base-uncased",
    output_dir="runs/imdb_baseline_fp16",
    max_len=128,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    lr=2e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=1,
    fp16=True,
    deepspeed_json=None,
    use_cpu=False,   # <-- NEW FLAG
):
    """Fine-tune a sequence classifier on IMDB (GPU or CPU) and write a summary CSV.

    This variant of `train_run` always trains on IMDB. `dataset_name` is kept
    for signature compatibility and is validated, so a run can no longer be
    recorded under a dataset it was not trained on.

    Args:
        dataset_name: must be "imdb".
        model_name: checkpoint name for the tokenizer and the classifier.
        output_dir: Trainer output directory; "summary.csv" is written there too.
        max_len: tokenizer truncation length.
        per_device_train_batch_size / per_device_eval_batch_size,
        num_train_epochs, lr, warmup_ratio, gradient_accumulation_steps:
            forwarded to TrainingArguments.
        fp16: mixed precision; forced off when `use_cpu` is True.
        deepspeed_json: optional DeepSpeed config path; ignored when `use_cpu` is True.
        use_cpu: run the Trainer on CPU.

    Returns:
        dict with the run configuration, the wall-clock training time, the
        training throughput in samples/second, and every numeric metric from
        the final `trainer.evaluate()` call.

    Raises:
        ValueError: if `dataset_name` is anything other than "imdb".
    """
    if dataset_name != "imdb":
        # Previously any name was accepted, IMDB was trained anyway, and the
        # summary row was labelled with the requested (wrong) dataset.
        raise ValueError(
            f"This train_run variant only supports dataset_name='imdb' (got {dataset_name!r})"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # IMDB dataset (only)
    ds = load_imdb(tokenizer, max_len)
    num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    use_fp16 = fp16 and not use_cpu
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer, pad_to_multiple_of=8 if use_fp16 else None
    )

    TA = transformers.TrainingArguments
    params = inspect.signature(TA.__init__).parameters

    kw = dict(
        output_dir=output_dir,
        learning_rate=lr,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        weight_decay=0.01,
        fp16=use_fp16,
        report_to="none",
    )
    # The CPU switch is `use_cpu` in newer transformers releases; `no_cuda` is
    # the older, deprecated spelling. Use whichever this version accepts.
    kw["use_cpu" if "use_cpu" in params else "no_cuda"] = use_cpu

    ds_config = None if use_cpu else deepspeed_json
    extra = {
        "save_strategy": "epoch",
        "logging_strategy": "steps",
        "logging_steps": 50,
        "metric_for_best_model": "f1",
        "greater_is_better": True,
        "deepspeed": ds_config,
    }
    # The per-epoch evaluation switch is called `eval_strategy` in newer
    # transformers releases and `evaluation_strategy` in older ones.
    eval_key = next((k for k in ("eval_strategy", "evaluation_strategy") if k in params), None)
    if eval_key is not None:
        extra[eval_key] = "epoch"
        # Best-checkpoint reload needs evaluation on the same schedule as saving.
        extra["load_best_model_at_end"] = True
    for k, v in extra.items():
        if k in params:
            kw[k] = v

    args = TA(**kw)

    # Newer Trainer versions take the tokenizer as `processing_class`;
    # older ones only know `tokenizer`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kw = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kw: tokenizer},
    )

    t0 = now()
    trainer.train()
    t1 = now()
    eval_out = trainer.evaluate()

    total_train_examples = len(ds["train"])
    seconds = t1 - t0
    # Samples seen = examples * epochs; dividing examples alone under-reports
    # multi-epoch runs by a factor of `num_train_epochs`.
    samples_seen = total_train_examples * num_train_epochs
    throughput = samples_seen / seconds if seconds > 0 else float("nan")

    info = device_info()
    results = {
        "dataset": dataset_name,
        "model": model_name,
        "gpu": ("CPU" if use_cpu else info["gpu"]),
        "gpu_mem_gb": ("NA" if use_cpu else info["mem_gb"]),
        "epochs": num_train_epochs,
        "batch_train": per_device_train_batch_size,
        "batch_eval": per_device_eval_batch_size,
        "grad_accum": gradient_accumulation_steps,
        "seq_len": max_len,
        "fp16": bool(use_fp16),
        "deepspeed": bool(ds_config),
        "train_time_sec": round(seconds, 2),
        "throughput_sps": round(throughput, 2),
    }
    # Copy over the numeric evaluation metrics only.
    for k, v in eval_out.items():
        if isinstance(v, (int, float)):
            results[k] = float(v)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    pd.DataFrame([results]).to_csv(os.path.join(output_dir, "summary.csv"), index=False)
    print("RESULTS:", results)
    return results




In [None]:
import os

# Export CUDA_LAUNCH_BLOCKING=1 into this process's environment.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

In [None]:
import torch, transformers, inspect

# Report the library versions and the accelerator visible to this session.
for _label, _value in (
    ("Torch:", torch.__version__),
    ("Transformers:", transformers.__version__),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(_label, _value)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Confirm the arg exists in your installed version
_ta_params = inspect.signature(transformers.TrainingArguments.__init__).parameters
has_eval_arg = "evaluation_strategy" in _ta_params
print("TrainingArguments supports `evaluation_strategy`:", has_eval_arg)



Torch: 2.5.1+cu121
Transformers: 4.44.2
CUDA available: True
GPU: Tesla T4
TrainingArguments supports `evaluation_strategy`: True


In [None]:
# Display the summary dict held in `r1` (assigned from an earlier train_run(...) call).
r1

{'dataset': 'imdb',
 'model': 'bert-base-uncased',
 'gpu': 'Tesla T4',
 'gpu_mem_gb': '?',
 'epochs': 2,
 'batch_train': 16,
 'batch_eval': 32,
 'grad_accum': 1,
 'seq_len': 128,
 'fp16': True,
 'deepspeed': False,
 'train_time_sec': 528.21,
 'throughput_sps': 47.33,
 'eval_loss': 0.31622007489204407,
 'eval_accuracy': 0.89148,
 'eval_f1': 0.8914753865752735,
 'eval_runtime': 44.4491,
 'eval_samples_per_second': 562.441,
 'eval_steps_per_second': 17.593,
 'epoch': 2.0}

In [None]:
import pandas as pd, os

# Persist the IMDB baseline summary (`r1`) as a one-row CSV.
os.makedirs("runs/imdb_baseline_fp16", exist_ok=True)
_r1_frame = pd.DataFrame([r1])
_r1_frame.to_csv("runs/imdb_baseline_fp16/summary.csv", index=False)
print("IMDB results saved → runs/imdb_baseline_fp16/summary.csv")


IMDB results saved → runs/imdb_baseline_fp16/summary.csv


In [None]:
# List the contents of the IMDB baseline run directory with human-readable sizes.
!ls -lh runs/imdb_baseline_fp16/

total 12K
drwxr-xr-x 2 root root 4.0K Oct 23 12:37 checkpoint-1563
drwxr-xr-x 2 root root 4.0K Oct 23 12:41 checkpoint-3126
-rw-r--r-- 1 root root  366 Oct 23 13:07 summary.csv


In [None]:
# Locate every CSV file under /content/runs.
!find /content/runs -name "*.csv"

/content/runs/imdb_baseline_fp16/summary.csv


In [None]:
from google.colab import files

# Hand the GPU-baseline summary CSV to the browser as a download.
_gpu_summary_csv = "/content/runs/imdb_baseline_fp16/summary.csv"
files.download(_gpu_summary_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Hide GPUs so PyTorch never touches CUDA (avoids sticky asserts)
import os

os.environ.update({
    "CUDA_VISIBLE_DEVICES": "",          # must be set BEFORE importing torch
    "TOKENIZERS_PARALLELISM": "false",
    "ACCELERATE_USE_CPU": "true",        # (optional) also tell HF libs not to try GPU
})


In [None]:
# ==== IMDB on CPU only (clean + self-contained) ====
# The three variables below are exported before torch is imported further down in this cell.
import os

for _name, _value in (
    ("CUDA_VISIBLE_DEVICES", ""),          # hide any GPU
    ("TOKENIZERS_PARALLELISM", "false"),
    ("ACCELERATE_USE_CPU", "true"),
):
    os.environ[_name] = _value

# ---- knobs you can tweak (keep model/batch same as your GPU run if you want a fair comparison) ----
MODEL_NAME        = "bert-base-uncased"   # checkpoint used for both tokenizer and classifier
BATCH_TRAIN       = 8          # per-device train batch size; set to the SAME value as your GPU run for apples-to-apples
BATCH_EVAL        = 16         # per-device eval batch size
EPOCHS            = 1          # 1 epoch is enough for timing baseline
MAX_LEN           = 128        # tokenizer truncation length; set 64 for faster runs
TRAIN_SAMPLES     = 5000      # cap on shuffled train rows (smaller = faster CPU runs); None = use full train split
TEST_SAMPLES      = 5000       # cap on shuffled test rows; None = full test split
OUTDIR            = "runs/imdb_cpu_baseline"   # Trainer output dir; summary.csv is written here
SEED              = 42         # seed for Python/NumPy/torch RNGs and the dataset shuffles
# ---------------------------------------------------------------------------------------------------

import time, random, numpy as np, pandas as pd, inspect
import torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from sklearn.metrics import accuracy_score, f1_score

# Reproducibility (CPU-only): seed Python, NumPy and torch from the same SEED.
random.seed(SEED)
np.random.seed(SEED)
torch.random.manual_seed(SEED)

def compute_metrics(eval_pred):
    """Score an evaluation pass: accuracy and macro-averaged F1 as plain floats.

    `eval_pred` unpacks into (logits, labels); the prediction for each row is
    the index of its largest logit along the last axis.
    """
    logits, labels = eval_pred
    guesses = logits.argmax(axis=-1)
    metrics = {}
    metrics["accuracy"] = float(accuracy_score(labels, guesses))
    metrics["f1"] = float(f1_score(labels, guesses, average="macro"))
    return metrics

# Load & tokenize IMDB
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
ds = load_dataset("imdb")
# Only "train" and "test" are used below; dropping the 50k-row "unsupervised"
# split avoids tokenizing data that is never read.
ds.pop("unsupervised", None)

# Optional subsetting for faster CPU runs. Done BEFORE tokenizing so that only
# the kept rows are encoded. The seeded shuffle is applied to splits of the same
# length as before, so the selected subset should be unchanged.
if TRAIN_SAMPLES is not None:
    ds["train"] = ds["train"].shuffle(SEED).select(range(min(TRAIN_SAMPLES, len(ds["train"]))))
if TEST_SAMPLES is not None:
    ds["test"]  = ds["test"].shuffle(SEED).select(range(min(TEST_SAMPLES,  len(ds["test"]))))

def _tok(ex):
    """Tokenize a batch of reviews, truncating each to MAX_LEN tokens."""
    return tok(ex["text"], truncation=True, max_length=MAX_LEN)
ds = ds.map(_tok, batched=True, remove_columns=["text"]).rename_column("label","labels")

# Model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Collator that pads each batch using the tokenizer's padding settings.
collator = DataCollatorWithPadding(tokenizer=tok)

# Version-agnostic TrainingArguments (handles the eval_strategy/evaluation_strategy
# and use_cpu/no_cuda renames)
TA = TrainingArguments
params = inspect.signature(TA.__init__).parameters
kw = dict(
    output_dir=OUTDIR,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    learning_rate=2e-5,
    num_train_epochs=EPOCHS,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=50,
    report_to="none",
)
# CPU only: `use_cpu` is the current spelling, `no_cuda` the older deprecated one.
kw["use_cpu" if "use_cpu" in params else "no_cuda"] = True
if "eval_strategy" in params:
    kw["eval_strategy"] = "epoch"
elif "evaluation_strategy" in params:
    kw["evaluation_strategy"] = "epoch"
args = TA(**kw)

# Newer Trainer versions take the tokenizer as `processing_class` and warn on the
# deprecated `tokenizer=` keyword; older ones only know `tokenizer`.
_tokenizer_kw = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kw: tok}
)

# Time the training loop only; the final evaluation is measured separately by the Trainer.
t0 = time.time()
trainer.train()
t1 = time.time()
eval_out = trainer.evaluate()

seconds = t1 - t0
# Samples seen = rows * epochs; dividing rows alone would under-report any run
# with EPOCHS > 1 (identical to the old value while EPOCHS == 1).
samples_seen = len(ds["train"]) * EPOCHS
throughput = samples_seen / seconds if seconds > 0 else float("nan")

res = {
    "dataset": "imdb" if TRAIN_SAMPLES is None else f"imdb_subset_{TRAIN_SAMPLES}",
    "model": MODEL_NAME,
    "mode": "CPU",
    "gpu": "CPU",
    "epochs": EPOCHS,
    "batch_train": BATCH_TRAIN,
    "batch_eval": BATCH_EVAL,
    "seq_len": MAX_LEN,
    "fp16": False,
    "train_time_sec": round(seconds, 2),
    "throughput_sps": round(throughput, 2),
}
# Copy over the numeric evaluation metrics only.
for k, v in eval_out.items():
    if isinstance(v, (int, float)):
        res[k] = float(v)

os.makedirs(OUTDIR, exist_ok=True)
pd.DataFrame([res]).to_csv(f"{OUTDIR}/summary.csv", index=False)
print("CPU RESULTS:", res)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3712,0.331782,0.87,0.869796


CPU RESULTS: {'dataset': 'imdb_subset_5000', 'model': 'bert-base-uncased', 'mode': 'CPU', 'gpu': 'CPU', 'epochs': 1, 'batch_train': 8, 'batch_eval': 16, 'seq_len': 128, 'fp16': False, 'train_time_sec': 9765.92, 'throughput_sps': 0.51, 'eval_loss': 0.33178240060806274, 'eval_accuracy': 0.87, 'eval_f1': 0.8697958190115411, 'eval_runtime': 2088.6191, 'eval_samples_per_second': 2.394, 'eval_steps_per_second': 0.15, 'epoch': 1.0}


In [None]:
# Locate every CSV file under the relative `runs` directory.
!find runs -name "*.csv"

runs/imdb_cpu_baseline/summary.csv


In [None]:
from google.colab import files

# Hand the CPU-baseline summary CSV to the browser as a download.
_cpu_summary_csv = "runs/imdb_cpu_baseline/summary.csv"
files.download(_cpu_summary_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>