In [77]:
import pandas as pd
import json
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support
import ast

In [62]:
def load_data(csv_path, json_path):
    """
    Load data from CSV and JSON files.
    """
    # Load CSV
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
    except Exception as e:
        raise IOError(f"Error reading CSV file: {e}")
    
    # Check for required columns
    if 'id' not in df.columns or 'result' not in df.columns:
        raise ValueError("CSV must contain 'id' and 'result' columns.")
    
    # Load JSON
    try:
        with open(json_path, 'r') as f:
            highlights = json.load(f)
    except Exception as e:
        raise IOError(f"Error reading JSON file: {e}")
    
    return df, highlights

def preprocess_labels(df, highlights):
    """
    Assign binary labels to tokens based on highlights.json.
    """
    processed_data = []
    
    for id, highlighed_indices in highlights.items():
        # Get the 'result' for the current id
        row = df[df['id'] == id]
        if row.empty:
            print(f"Warning: ID {id} not found in CSV.")
            continue
        result_str = row['result'].values[0]
        tokens = result_str.split()
        labels = [1 if idx in highlighed_indices else 0 for idx in range(len(tokens))]
        processed_data.append({
            'id': id,
            'tokens': tokens,
            'labels': labels
        })
    
    return processed_data

def print_sample_tokens(processed_data, num_samples=2):
    """
    Print sample tokens with their labels for verification.
    """
    for sample in processed_data[:num_samples]:
        print(f"ID: {sample['id']}")
        print("Tokens and Labels:")
        for token, label in zip(sample['tokens'], sample['labels']):
            print(f"{token}: {label}")
        print("\n" + "-"*50 + "\n")

def create_huggingface_dataset(processed_data):
    """
    Convert processed data into a HuggingFace Dataset.
    """
    dataset_dict = {
        'id': [item['id'] for item in processed_data],
        'tokens': [item['tokens'] for item in processed_data],
        'labels': [item['labels'] for item in processed_data]
    }
    
    dataset = Dataset.from_dict(dataset_dict)
    return dataset

# Tokenizer and model will be loaded globally for use in functions
tokenizer = None  # Will be initialized in main

def tokenize_and_align_labels(examples):
    """
    Tokenize the inputs and align labels with tokenized outputs.
    Assign labels 0 or 1 to all tokens, including subwords.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=514,  # Adjust as needed
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Special tokens
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
    
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

def compute_metrics(p):
    """
    Compute precision, recall, and F1-score for binary classification.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    # Flatten the predictions and labels
    true_labels = labels.flatten()
    true_predictions = predictions.flatten()
    
    # Remove ignored index (label == -100)
    mask = true_labels != -100
    true_labels = true_labels[mask]
    true_predictions = true_predictions[mask]
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, average='binary', zero_division=0
    )
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [63]:
CSV_PATH = 'data.csv'         # Path to your data.csv
JSON_PATH = 'highlights.json' # Path to your highlights.json

# Step 1: Load data
df, highlights = load_data(CSV_PATH, JSON_PATH)

In [64]:
# Step 2: Preprocess labels
processed_data = preprocess_labels(df, highlights)

# Step 3: Print sample tokens for verification
# print_sample_tokens(processed_data, num_samples=2)

# Step 4: Create HuggingFace Dataset
dataset = create_huggingface_dataset(processed_data)

# Step 5: Split the dataset into train and validation
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

global tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True, add_prefix_space=True)

tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval = eval_dataset.map(tokenize_and_align_labels, batched=True)



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [65]:
tokens = tokenized_train['input_ids'][0]
selected = [tokens[i] for i in range(len(tokens)) if tokenized_train['labels'][0][i] == 1]
print(len(tokenized_train['labels'][0]))
len(tokenized_train['labels'][0])
print(f"Tokens: {selected}")


514
Tokens: [14826, 7, 3800, 3743, 1133, 7852, 2617, 13612, 52, 17, 27, 548, 57, 2754, 5, 539, 13, 81, 1718, 107, 4]


In [66]:
# Decode the tokens to get the original words
decoded_words = tokenizer.decode(selected, skip_special_tokens=True)
print(decoded_words)

 Welcome to Carefree Janitorial Supply we’ve been serving the industry for over 35 years.


In [82]:
# Download configuration from huggingface.co and cache.
model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=2)
for param in model.roberta.parameters():
    param.requires_grad = False

# Unfreeze parameters in the classification head
for param in model.classifier.parameters():
    param.requires_grad = True

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
print(model)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [69]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label for label, pred in zip(label_seq, pred_seq) if label != -100]
        for label_seq, pred_seq in zip(labels, predictions)
    ]
    true_preds = [
        [pred for label, pred in zip(label_seq, pred_seq) if label != -100]
        for label_seq, pred_seq in zip(labels, predictions)
    ]

    flat_true = [label for sublist in true_labels for label in sublist]
    flat_preds = [pred for sublist in true_preds for pred in sublist]

    precision, recall, f1, _ = precision_recall_fscore_support(flat_true, flat_preds, average='binary')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [84]:
# 7. Set up training arguments
training_args = TrainingArguments(
    output_dir="./roberta_token_classifier",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

# 8. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



In [85]:
trainer.train()
    
# Step 12: Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6119183301925659, 'eval_precision': 0.14655172413793102, 'eval_recall': 0.38636363636363635, 'eval_f1': 0.2125, 'eval_runtime': 0.0482, 'eval_samples_per_second': 20.731, 'eval_steps_per_second': 20.731, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6113185882568359, 'eval_precision': 0.1391304347826087, 'eval_recall': 0.36363636363636365, 'eval_f1': 0.20125786163522014, 'eval_runtime': 0.0505, 'eval_samples_per_second': 19.804, 'eval_steps_per_second': 19.804, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6107853651046753, 'eval_precision': 0.1391304347826087, 'eval_recall': 0.36363636363636365, 'eval_f1': 0.20125786163522014, 'eval_runtime': 0.0482, 'eval_samples_per_second': 20.727, 'eval_steps_per_second': 20.727, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6103192567825317, 'eval_precision': 0.13274336283185842, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.19108280254777069, 'eval_runtime': 0.0498, 'eval_samples_per_second': 20.089, 'eval_steps_per_second': 20.089, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6099191308021545, 'eval_precision': 0.13274336283185842, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.19108280254777069, 'eval_runtime': 0.0483, 'eval_samples_per_second': 20.724, 'eval_steps_per_second': 20.724, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.609585165977478, 'eval_precision': 0.13274336283185842, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.19108280254777069, 'eval_runtime': 0.0487, 'eval_samples_per_second': 20.527, 'eval_steps_per_second': 20.527, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6093194484710693, 'eval_precision': 0.13392857142857142, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.1923076923076923, 'eval_runtime': 0.0547, 'eval_samples_per_second': 18.298, 'eval_steps_per_second': 18.298, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6091198325157166, 'eval_precision': 0.13636363636363635, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.1948051948051948, 'eval_runtime': 0.0608, 'eval_samples_per_second': 16.438, 'eval_steps_per_second': 16.438, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6089869737625122, 'eval_precision': 0.13636363636363635, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.1948051948051948, 'eval_runtime': 0.0525, 'eval_samples_per_second': 19.041, 'eval_steps_per_second': 19.041, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6089204549789429, 'eval_precision': 0.13636363636363635, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.1948051948051948, 'eval_runtime': 0.0604, 'eval_samples_per_second': 16.562, 'eval_steps_per_second': 16.562, 'epoch': 10.0}
{'train_runtime': 1.7102, 'train_samples_per_second': 5.847, 'train_steps_per_second': 5.847, 'train_loss': 0.6529894828796386, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.6089204549789429, 'eval_precision': 0.13636363636363635, 'eval_recall': 0.3409090909090909, 'eval_f1': 0.1948051948051948, 'eval_runtime': 0.051, 'eval_samples_per_second': 19.617, 'eval_steps_per_second': 19.617, 'epoch': 10.0}


In [86]:
def infer_highlighted_tokens(text, tokenizer, model, max_length=514, device=None):
    """
    Inference function to predict highlighted tokens in a given text sequence.

    Args:
        text (str): Input text with space-separated tokens.
        tokenizer: HuggingFace tokenizer.
        model: Trained HuggingFace model.
        max_length (int, optional): Maximum token length for the tokenizer. Defaults to 128.
        device (str, optional): Device to run the model on ('cpu' or 'cuda').
                                If None, automatically selects based on availability.

    Prints:
        - Original input text.
        - List of tokens predicted as highlighted.
    """
    # Determine the device to use
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Move model to the appropriate device
    model.to(device)
    model.eval()  # Set model to evaluation mode

    # Preprocess the input text
    tokens = text.split()  # Assuming space-separated tokens
    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
    
    predictions = torch.argmax(logits, dim=2).squeeze().tolist()  # Shape: (seq_length,)
    
    # Convert input_ids back to tokens
    predicted_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    
    # Extract highlighted tokens (labels == 1), skipping special tokens and punctuation
    highlighted_tokens = []
    for token, pred in zip(predicted_tokens, predictions):
        if token in tokenizer.all_special_tokens:
            continue  # Skip special tokens like <s>, </s>, <pad>
        # If you want to skip punctuation, uncomment the following lines:
        # if token in punctuation_marks:
        #     continue  # Skip punctuation tokens
        if pred == 1:
            # Clean the token by removing any leading 'Ġ' or other special characters if present
            clean_token = token.replace('Ġ', '').replace('▁', '')
            highlighted_tokens.append(clean_token)
    
    # Reconstruct the original sequence for display
    original_sequence = ' '.join(tokens)
    
    # Print the results
    print("Original Sequence:")
    print(original_sequence)
    
    print("\nPredicted Highlighted Tokens:")
    if highlighted_tokens:
        print(' '.join(highlighted_tokens))
    else:
        print("No tokens predicted as highlighted.")

In [87]:
infer_highlighted_tokens(' '.join(train_dataset['tokens'][0][:100]), tokenizer, model, device='cpu')

Original Sequence:
** Cleaning Supplies Trash Liners Cleaning Chemicals Equipment Skin Care Paper Products Janitorial Supplies and Equipment Welcome to Carefree Janitorial Supply we’ve been serving the industry for over 35 years. Our trucks service Northwest Louisiana and East Texas, Carefree Janitorial Supply prides itself in providing Quality Products, the Best Service, and the Most Competitive Prices. We are conveniently located just off I-20 at 405 Barksdale Boulevard in Bossier City, Louisiana. Please feel free to browse our online store. If you have any questions or comments contact us 24/7 at wecare@carefreejanitorial.com or for immediate assistance Monday-Friday 8 a.m. to 5 p.m.

Predicted Highlighted Tokens:
Trash ing Skin Paper Jan itor ial free Jan itor ial Supply ve serving over 35 . Our service and East free Jan itor ial Supply itself providing Quality the Best Service Prices We located off 405 ks dale Boulevard Boss Please free browse If have questions or comments 7 @ fre