Files produced by the cleaning/preprocessing step:

In [None]:
# CSV file names for the three data splits.
TRAIN_CSV, VAL_CSV, TEST_CSV = "train.csv", "val.csv", "test.csv"

# Fraction of the data assigned to each split (train / val / test).
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

## Build / fine-tune BERT

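Texts longer than BERT's 512-token limit are handled with a chunking approach: each text is split into fixed-size chunks that overlap (e.g., a 128-token overlap between consecutive chunks).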

In [None]:
def chunk_text(
    text: str,
    tokenizer,
    max_len=512,
    stride=128
):
    """
    Split a text into overlapping, fixed-length chunks (512 tokens by default).

    The text is tokenized once without special tokens, then a sliding window
    walks over the token ids. Every window is laid out as
    ``[CLS] tokens [SEP]`` and right-padded to ``max_len``, so one window holds
    at most ``max_len - 2`` content tokens and consecutive windows share
    exactly ``stride`` content tokens.

    Args:
        text: Raw input text.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and the attributes ``cls_token_id``, ``sep_token_id`` and
            ``pad_token_id`` (e.g. a Hugging Face BERT tokenizer).
        max_len: Length of every returned sequence, special tokens and
            padding included.
        stride: Number of content tokens shared by two consecutive chunks.

    Returns:
        A list with one dict per chunk. Each dict maps ``"input_ids"``,
        ``"attention_mask"`` and ``"token_type_ids"`` to a ``torch.long``
        tensor of shape ``[1, max_len]``. The list is empty when the text
        produces no tokens.

    Raises:
        ValueError: If ``max_len`` leaves no room for content tokens, or if
            ``stride`` is negative or too large for the window to advance.
    """
    # Local import keeps this cell runnable on its own.
    import torch

    # Room reserved for the leading [CLS] and trailing [SEP].
    num_special = 2
    window = max_len - num_special
    if window <= 0:
        raise ValueError(
            f"max_len must be greater than {num_special}, got {max_len}"
        )
    if not 0 <= stride < window:
        # A step of zero (or less) would never advance and loop forever.
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_len - {num_special}, "
            f"got stride={stride} with max_len={max_len}"
        )
    step = window - stride

    # Tokenize into IDs without truncation so we can see the actual length.
    tokens = list(tokenizer.encode(text, add_special_tokens=False))

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id

    chunk_encodings = []
    start = 0
    while start < len(tokens):
        end = start + window
        # Special tokens are added by hand: the ids are already tokenized, and
        # building the sequence here guarantees no content token is truncated.
        ids = [cls_id] + tokens[start:end] + [sep_id]
        real_len = len(ids)
        pad_len = max_len - real_len

        chunk_encodings.append({
            "input_ids": torch.tensor(
                [ids + [pad_id] * pad_len], dtype=torch.long
            ),
            "attention_mask": torch.tensor(
                [[1] * real_len + [0] * pad_len], dtype=torch.long
            ),
            # Single-segment input: every position belongs to segment 0.
            "token_type_ids": torch.zeros((1, max_len), dtype=torch.long),
        })

        # Stop once the window reached the end of the text; sliding further
        # would only emit a chunk made entirely of already-covered tokens.
        if end >= len(tokens):
            break
        start += step

    return chunk_encodings


We need to create a new DataFrame (or Hugging Face Dataset) that has one row per chunk. Each chunk inherits the same label from the original text.

In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizerFast

import torch

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

df = pd.read_csv("my_data.csv")  # has columns: "text", "label"

# Accepted label spellings -> class id. Integer-coded labels map to themselves
# so that an already-encoded column is not silently rewritten.
LABEL_TO_ID = {"left": 0, "center": 1, "right": 2, 0: 0, 1: 1, 2: 2}

# This will hold expanded rows: one entry per chunk, not per original text
expanded_data = {
    "input_ids": [],
    "attention_mask": [],
    "labels": []
}

skipped_rows = 0

for idx, text, label in tqdm(zip(df.index, df["text"], df["label"]), total=len(df)):
    # An unknown label is a data error; falling back to a real class would
    # train the model on wrong targets without anyone noticing.
    if label not in LABEL_TO_ID:
        raise ValueError(
            f"Row {idx}: unknown label {label!r}; "
            "expected 'left', 'center', 'right' or 0/1/2"
        )
    label_id = LABEL_TO_ID[label]

    # Missing text is read by pandas as NaN (a float), which cannot be tokenized.
    if not isinstance(text, str) or not text.strip():
        skipped_rows += 1
        continue

    # Chunk it
    chunked_encodings = chunk_text(text, tokenizer, max_len=512, stride=128)

    # Add each chunk to expanded dataset; every chunk inherits the text's label
    for enc in chunked_encodings:
        expanded_data["input_ids"].append(enc["input_ids"][0])  # [1,512] -> [512]
        expanded_data["attention_mask"].append(enc["attention_mask"][0])  # [1,512] -> [512]
        expanded_data["labels"].append(label_id)

if skipped_rows:
    print(f"[WARNING] Skipped {skipped_rows} row(s) with missing or empty text.")

# torch.stack raises an opaque error on an empty list, so fail with a clear one.
if not expanded_data["labels"]:
    raise ValueError("No chunks were produced; my_data.csv has no usable text rows.")

# Convert to torch tensors if you want, or Hugging Face Dataset
input_ids_tensor = torch.stack(expanded_data["input_ids"])       # shape [num_chunks,512]
attention_mask_tensor = torch.stack(expanded_data["attention_mask"])
labels_tensor = torch.tensor(expanded_data["labels"], dtype=torch.long)

print("Final expanded dataset shape:", input_ids_tensor.shape, attention_mask_tensor.shape, labels_tensor.shape)


Fine-Tuning BERT on the Chunked Dataset

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Create a TensorDataset: each item is (input_ids, attention_mask, label)
train_dataset = TensorDataset(input_ids_tensor, attention_mask_tensor, labels_tensor)

# Create DataLoader; RandomSampler reshuffles the chunks every epoch
train_batch_size = 8
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=train_batch_size)

# Load the model with 3 labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Setup optimizer. The AdamW shipped with transformers is deprecated in favour
# of the PyTorch implementation, so torch.optim.AdamW is used here.
# NOTE(review): eps and weight_decay are pinned to what the transformers
# implementation used by default so training behaves as before — confirm
# against the installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Example: number of epochs
epochs = 3
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_dataloader) * epochs

# Create learning rate scheduler (linear decay, no warmup)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Training loop (simplified)
for epoch_i in range(epochs):
    print(f"Epoch {epoch_i+1}/{epochs}")
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Batch order matches the TensorDataset: ids, mask, labels.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs.loss

        # Backprop, then advance optimizer and LR schedule, then clear grads
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if step % 100 == 0:
            print(f"  Step {step}: loss = {loss.item():.4f}")


Inference (Production Pipeline)

In [None]:
import torch.nn.functional as F

def classify_long_text(text):
    """
    Classify a text of arbitrary length by averaging per-chunk logits.

    The text is split with ``chunk_text`` (512-token chunks, 128-token
    overlap), each chunk is run through the module-level ``model``, the chunk
    logits are averaged, and a softmax over the average yields the prediction.

    Args:
        text: Raw input text.

    Returns:
        A ``(label, confidence)`` tuple: the predicted label string
        (``"left"``, ``"center"`` or ``"right"``) and its softmax probability.

    Raises:
        ValueError: If the text produces no chunks (e.g. it is empty).
    """
    # 1. Chunk
    chunked_encs = chunk_text(text, tokenizer, max_len=512, stride=128)
    if not chunked_encs:
        # Without this guard the averaging below would divide by zero chunks.
        raise ValueError("Cannot classify text that produces no tokens.")

    # 2. Inference on each chunk.
    # Dropout must be off at inference, otherwise the same text can get
    # different predictions from run to run. The previous mode is restored
    # afterwards so calling this in the middle of training is harmless.
    was_training = model.training
    model.eval()
    try:
        chunk_logits = []
        with torch.no_grad():
            for enc in chunked_encs:
                # Make sure it's on the correct device
                outputs = model(
                    input_ids=enc["input_ids"].to(device),
                    attention_mask=enc["attention_mask"].to(device),
                )
                # Each chunk is a batch of one, so this adds one row of logits.
                chunk_logits.append(outputs.logits)
    finally:
        model.train(was_training)

    # 3. Average logits over chunks, keeping a leading batch dimension of 1
    avg_logits = torch.cat(chunk_logits, dim=0).mean(dim=0, keepdim=True)
    probs = F.softmax(avg_logits, dim=-1)

    # 4. Final label & confidence
    pred_label_id = probs.argmax(dim=-1).item()
    confidence = probs[0, pred_label_id].item()

    # Map ID back to string
    label_map = {0: "left", 1: "center", 2: "right"}
    return label_map[pred_label_id], confidence


## Run Checks on Dataset

## Checking for Extra/Unnamed Columns in Pandas DataFrame

In [1]:
def verify_and_fix_df_columns(df):
    """
    Validate the columns of a raw DataFrame and drop stray index columns.

    - Drop every column whose name starts with 'Unnamed' (in place).
    - Ensure the DataFrame has 'text' and 'label' columns.
    - Print a warning for null text, empty text, or null labels.

    Args:
        df: DataFrame to check. It is modified in place when 'Unnamed'
            columns are dropped.

    Returns:
        The same DataFrame object, after any column drop.

    Raises:
        ValueError: If 'text' or 'label' is missing.
    """
    # 1) Check for 'Unnamed' columns.
    # Column labels are not always strings (e.g. integer headers), so they are
    # converted before the prefix test instead of calling .startswith on them.
    unnamed_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
    if unnamed_cols:
        print(f"[INFO] Found unnamed columns: {unnamed_cols}. Dropping them.")
        df.drop(columns=unnamed_cols, inplace=True)

    # 2) Check required columns
    required_cols = {"text", "label"}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"[ERROR] DataFrame is missing required columns: {missing}")

    # 3) Check for null or empty text, and null labels
    text_col = df["text"]
    if text_col.isnull().any():
        print("[WARNING] Some rows have null text!")
    # Nulls are excluded first so they are not reported twice.
    if text_col.dropna().astype(str).str.strip().eq("").any():
        print("[WARNING] Some rows have empty text!")
    if df["label"].isnull().any():
        print("[WARNING] Some rows have null label!")

    print("[INFO] DataFrame columns verified:", list(df.columns))
    return df


## Checking Column Names After Hugging Face Dataset Conversion

In [3]:
from datasets import Dataset

def check_dataset_columns(ds, expected_cols=None):
    """
    Verify that a Dataset exposes every expected column.

    ds: Hugging Face Dataset object (only its ``column_names`` is read)
    expected_cols: list or set of column names; when omitted or empty,
        ['text', 'label'] is used

    Raises ValueError naming the absent columns; otherwise prints the
    columns that were found.
    """
    wanted = expected_cols or ["text", "label"]  # default
    available = ds.column_names
    absent = [name for name in wanted if name not in available]
    if not absent:
        print("[INFO] Dataset has the required columns:", available)
        return
    raise ValueError(f"[ERROR] Dataset is missing required columns: {absent}")



## Checking Tokenized Dataset for “Missing Keys”

In [4]:
def check_tokenized_dataset(ds):
    """
    Verify that a tokenized dataset carries the columns training needs:
    input_ids, attention_mask and label.

    Raises ValueError naming the absent columns; otherwise prints the
    dataset's columns.
    """
    available = ds.column_names
    absent = [
        name
        for name in ("input_ids", "attention_mask", "label")
        if name not in available
    ]
    if not absent:
        print("[INFO] Tokenized dataset columns are:", available)
        return
    raise ValueError(f"[ERROR] Missing columns after tokenization: {absent}")


## Run Checks

In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast

# 1) Load the three CSV splits
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

# 2) Drop stray columns and make sure 'text' / 'label' are present
train_df, val_df, test_df = map(
    verify_and_fix_df_columns, (train_df, val_df, test_df)
)

# 3) Wrap each DataFrame in a Hugging Face Dataset and re-check its columns
train_ds, val_ds, test_ds = map(Dataset.from_pandas, (train_df, val_df, test_df))

for split_ds in (train_ds, val_ds, test_ds):
    check_dataset_columns(split_ds, ["text", "label"])

# 4) Bundle the splits under their conventional names
dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})

# 5) Tokenize every split
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the "text" field, truncating to at most 512 tokens."""
    texts = example["text"]
    return tokenizer(texts, truncation=True, max_length=512)


# batched=True hands tokenize_function many rows per call.
encoded_dataset = dataset.map(tokenize_function, batched=True)

# 6) Confirm each tokenized split has the columns training needs
for split_name in ("train", "validation", "test"):
    check_tokenized_dataset(encoded_dataset[split_name])


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


[INFO] DataFrame columns verified: ['text', 'label']
[INFO] DataFrame columns verified: ['text', 'label']
[INFO] DataFrame columns verified: ['text', 'label']
[INFO] Dataset has the required columns: ['text', 'label']
[INFO] Dataset has the required columns: ['text', 'label']
[INFO] Dataset has the required columns: ['text', 'label']


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/13888 [00:00<?, ? examples/s]

Map:   0%|          | 0/1736 [00:00<?, ? examples/s]

Map:   0%|          | 0/1738 [00:00<?, ? examples/s]

[INFO] Tokenized dataset columns are: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
[INFO] Tokenized dataset columns are: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
[INFO] Tokenized dataset columns are: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
