# Fine-Tune a Text Classifier with Custom CSV Data + Training Curves
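This notebook fine-tunes DistilBERT (`distilbert-base-uncased`) as a two-class text classifier on custom CSV data (`train.csv`, `val.csv`, `test.csv`, with the input text in a `txt` column), evaluates it with accuracy and F1, and plots the training-loss and evaluation-metric curves.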

## Install Required Libraries

In [None]:
!pip install transformers datasets scikit-learn matplotlib pandas --quiet

## Load and Explore Custom CSV Data

In [None]:
from datasets import load_dataset

# One CSV file per split; the keys become the split names of the loaded dataset.
data_files = dict(train="train.csv", validation="val.csv", test="test.csv")

# Load every split and expose the raw "txt" column under the name "text",
# which is the column the tokenization step reads.
dataset = load_dataset("csv", data_files=data_files).rename_column("txt", "text")

# Show the first training record as a quick sanity check.
print(dataset["train"][0])

## Tokenize the Dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(example):
    """Tokenize the ``text`` field of ``example``.

    The function is applied through ``dataset.map(..., batched=True)``, so
    ``example["text"]`` is handed to the tokenizer as received. Truncation is
    enabled and padding uses the ``"max_length"`` strategy.

    Returns whatever the tokenizer call returns.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Apply the tokenizer to every split.
tokenized_dataset = dataset.map(tokenize, batched=True)

## Load Pretrained Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained DistilBERT checkpoint configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

## Define Evaluation Metrics

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with two entries, ``"accuracy"`` and ``"f1"``, computed from the
        labels and the predicted classes (index of the largest logit along axis 1).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics

## Set Training Arguments

In [None]:
from transformers import TrainingArguments, Trainer

import inspect

# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name was later
# removed. The install cell does not pin a version, so use whichever keyword the
# installed TrainingArguments actually accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,  # training loss is logged every 10 steps
    # Evaluation and checkpointing both run once per epoch; they are kept on the
    # same schedule because the best checkpoint is reloaded at the end of training.
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",  # no external experiment tracker
    **{_eval_strategy_arg: "epoch"},
)

# Train on the "train" split, evaluate on "validation" with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

## Train the Model

In [None]:
# Run fine-tuning with the Trainer built in the previous cell; the value returned
# by train() is shown as this cell's output.
trainer.train()

## Evaluate on Test Set

In [None]:
# Report held-out metrics under a "test_" prefix. With the default "eval" prefix
# these values are appended to trainer.state.log_history as eval_* entries and
# become indistinguishable from the per-epoch validation results (for example
# when that history is plotted).
trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")

## Plot Training Loss and Evaluation Metrics

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

log_history = trainer.state.log_history
log_df = pd.DataFrame(log_history)


def _rows_with(frame, columns):
    """Return the rows of ``frame`` that have a value in every one of ``columns``.

    Columns that were never logged (and ``step``) are added as empty columns
    first, so the result can always be indexed by them; in that case the
    result simply has no rows.
    """
    required = ["step", *columns]
    widened = frame.reindex(columns=frame.columns.union(required, sort=False))
    return widened.dropna(subset=list(columns))


# Plot training loss.
# The log history interleaves loss rows with evaluation/summary rows that carry
# no 'loss' value; plotting the raw column would break the curve at every such
# row, so keep only the rows that actually logged a loss.
loss_rows = _rows_with(log_df, ["loss"])
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(loss_rows['step'], loss_rows['loss'], label='Training Loss')
ax.set_xlabel('Training Steps')
ax.set_ylabel('Loss')
ax.set_title('Training Loss Curve')
ax.legend()
ax.grid(True)
plt.show()

# Plot evaluation accuracy and F1.
# An evaluate() call made after training with the default "eval" prefix is
# appended to the same history at the final step; keeping the first entry per
# step stops such a call from adding a spurious point to the validation curve.
eval_steps = _rows_with(log_df, ["eval_accuracy", "eval_f1"]).drop_duplicates(
    subset="step", keep="first"
)
fig, ax = plt.subplots(figsize=(10, 4))
# Markers keep the curve visible even when only one evaluation was logged.
ax.plot(eval_steps['step'], eval_steps['eval_accuracy'], marker='o', label='Eval Accuracy')
ax.plot(eval_steps['step'], eval_steps['eval_f1'], marker='o', label='Eval F1')
ax.set_xlabel('Training Steps')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics')
ax.legend()
ax.grid(True)
plt.show()

## Save the Fine-Tuned Model

In [None]:
# Write the model and the tokenizer into the same directory.
save_dir = "./custom-sentiment-model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

## Summary
- Fine-tuned DistilBERT on custom CSV data
- Evaluated using accuracy and F1
- Plotted training and evaluation curves
- Saved the trained model