# Fine-Tune a Text Classifier with Custom CSV Data
This notebook shows how to fine-tune a text classifier on a custom dataset stored in CSV files. It assumes each CSV file contains two columns: `txt` (the input text) and `sentiment` (the class label, encoded as the integer 0 or 1).

## Install Required Libraries

In [None]:
!pip install transformers datasets scikit-learn --quiet

## Load and Explore Custom CSV Data

In [None]:
from datasets import load_dataset

# One CSV file per split; load_dataset returns a DatasetDict keyed by split name.
data_files = {
    "train": "train.csv",
    "validation": "val.csv",
    "test": "test.csv",
}
dataset = load_dataset("csv", data_files=data_files)

# Rename the columns to the names the rest of the pipeline expects:
#   txt       -> text   (read by the tokenization step)
#   sentiment -> label  (Trainer discards columns the model does not accept,
#                        except the label column; left as "sentiment" the
#                        targets would be dropped and no loss could be computed)
dataset = dataset.rename_column("txt", "text").rename_column("sentiment", "label")

# NOTE(review): assumes `sentiment` already holds integer class ids (0/1).
# String labels such as "positive"/"negative" must be mapped to ints first.

# Preview an example
dataset["train"][0]

## Tokenize the Dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(example):
    """Tokenize the "text" field of a batch of rows.

    Called through ``map(..., batched=True)``, so ``example`` is a batch:
    ``example["text"]`` is a list of strings rather than a single string.
    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to every split, processing rows in batches.
tokenized_dataset = dataset.map(function=tokenize, batched=True)

## Load Pretrained Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the distilbert-base-uncased checkpoint configured for two output labels.
checkpoint = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

## Define Evaluation Metrics

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    The predicted class for each row is the index of its largest logit
    along axis 1. Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted)
    return metrics

## Training Arguments and Trainer Setup

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate on the validation split at the end of every epoch.
    # This keyword was spelled `evaluation_strategy` before transformers 4.41;
    # the old spelling was later removed, so current releases reject it.
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    # Reloading the best checkpoint requires the save schedule to match the
    # evaluation schedule, hence save_strategy="epoch" as well.
    load_best_model_at_end=True,
    save_strategy="epoch",
)

# Collect the Trainer inputs: the model, the run configuration, the
# tokenized train/validation splits, and the metric callback.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["validation"],
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

## Train the Model

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

## Evaluate on Test Set

In [None]:
# Score the trained model on the test split and display the metrics dict.
test_metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
test_metrics

## Save the Fine-Tuned Model

In [None]:
# Save the model and the tokenizer side by side so the directory can be
# passed to from_pretrained() later.
output_dir = "./custom-sentiment-model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Summary
- Loaded a custom dataset from CSV files
- Fine-tuned DistilBERT for binary sentiment classification
- Evaluated on a separate test set
- Saved the fine-tuned model for reuse or deployment

In [None]:
# NOTE(review): disabled scratch cell. Everything below is commented out and
# does not run. It is unrelated to the sentiment classifier in this notebook:
# it would download the XSum summarization dataset and write each split to a
# TSV file with "text" and "summary" columns. Consider removing this cell or
# moving it to its own notebook.

# from datasets import load_dataset
# import pandas as pd

# # Load the XSum dataset
# xsum = load_dataset("xsum")

# # Extract and convert to DataFrames
# train_df = pd.DataFrame({
#     "text": xsum["train"]["document"],
#     "summary": xsum["train"]["summary"]
# })

# val_df = pd.DataFrame({
#     "text": xsum["validation"]["document"],
#     "summary": xsum["validation"]["summary"]
# })

# test_df = pd.DataFrame({
#     "text": xsum["test"]["document"],
#     "summary": xsum["test"]["summary"]
# })

# # Save all as TSV
# train_df.to_csv("xsum_train.tsv", sep="\t", index=False)
# val_df.to_csv("xsum_val.tsv", sep="\t", index=False)
# test_df.to_csv("xsum_test.tsv", sep="\t", index=False)

# print("✅ Saved: xsum_train.tsv, xsum_val.tsv, xsum_test.tsv")
