In [1]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==3.1.1->mlflow)
  Downloading fastapi-0.116.0-py3-none-any.whl.metadata (28 kB)
Collecting uvicorn<1 (from mlflow-skinny==3.1.1->mlflow)
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloadi

In [2]:
import mlflow
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from datasets import load_dataset
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

mlflow.set_experiment("MLflow Electra Small Sentiment Analysis - Yelp Polarity")

dataset = load_dataset("yelp_polarity")
train_dataset = dataset["train"].shuffle(seed=42).select(range(100000))
val_dataset = dataset["test"].shuffle(seed=42).select(range(25000))

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")

base_model_id = "google/electra-small-discriminator"
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=MAX_LENGTH,
    padding=True,
    truncation=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_and_pad_to_fixed_length(sample):
    tokenized = tokenizer(
        text=sample["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None
    )
    tokenized["labels"] = sample["label"]
    return tokenized

train_dataset = train_dataset.map(tokenize_and_pad_to_fixed_length, batched=True)
val_dataset = val_dataset.map(tokenize_and_pad_to_fixed_length, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    base_model_id,
    num_labels=2
).to("cuda" if torch.cuda.is_available() else "cpu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).cpu().numpy()
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
        "precision": precision_score(labels, predictions, average="weighted"),
        "recall": recall_score(labels, predictions, average="weighted"),
    }

training_args = TrainingArguments(
    report_to="mlflow",
    run_name=f"electra-small-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    output_dir="outputs",
    optim="adamw_torch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    save_strategy="epoch",
    save_total_limit=1,
    bf16=True,
    learning_rate=2e-5,
    lr_scheduler_type="constant",
    num_train_epochs=5,
    logging_steps=100,
    warmup_steps=500,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    ddp_find_unused_parameters=False
)

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence([torch.tensor(mask) for mask in attention_mask], batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    args=training_args,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

model.config.use_cache = False

trainer.train()


2025-07-07 20:36:58.323369: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751920618.532047      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751920618.594905      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025/07/07 20:37:12 INFO mlflow.tracking.fluent: Experiment with name 'MLflow Electra Small Sentiment Analysis - Yelp Polarity' does not exist. Creating a new experiment.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

Train size: 100000
Validation size: 25000


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1412,0.159843,0.9524,0.952389,0.952901,0.9524
2,0.086,0.130421,0.95776,0.957743,0.958409,0.95776
3,0.084,0.135947,0.95868,0.958663,0.959333,0.95868




TrainOutput(global_step=9375, training_loss=0.12139963826497396, metrics={'train_runtime': 3823.5368, 'train_samples_per_second': 130.769, 'train_steps_per_second': 4.087, 'total_flos': 8825892249600000.0, 'train_loss': 0.12139963826497396, 'epoch': 3.0})