# Numerical Data Selection Using General-Purpose Models

Authors: Brandin Chase and Bryson Steck

[Reference Paper](https://proceedings.neurips.cc/paper_files/paper/2023/hash/047682108c3b053c61ad2da5a6057b4e-Abstract-Conference.html)

## Imports


In [None]:
!pip install datasets distinctipy matplotlib numpy pandas tensorflow scikit_learn wget torch transformers
import os
import random
import warnings
import time
import distinctipy
import wget
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from collections import defaultdict
from datasets import load_dataset
from tqdm import tqdm
from tensorflow.keras import layers, models, optimizers, datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,  BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from scipy.sparse.linalg import eigsh
from torch.nn import CrossEntropyLoss
from transformers import AdamW
from transformers import get_scheduler
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

## Experimentation Variables

These variables will be used for the experimentation code block at the bottom of the notebook.

In [None]:
# Number of IMDb reviews kept for each of the training and evaluation subsets.
# Larger values give more representative results, at the expense of RAM and time.
# Recommended: 50-200
dataset_samples = 100

# Attention (maintenance) ratios swept by the experiment loop.
# Recommended values: 0.1-0.9
tau_experiment = [0.2, 0.4, 0.6, 0.8]

# Number of clusters (centroids) requested from spectral clustering.
# We found modifying this value does not drastically change the FreeSel algorithm's effectiveness.
# Recommended values: [1, 3, 5, 7]
k_experiment = [3]

# Annotation budgets: how many samples ("images") FreeSel selects for the task model.
# Recommended values: 5%-50% of the dataset_samples variable.
# Computed with integer arithmetic so each budget is an exact whole number of
# samples (10%, 30% and 50% of dataset_samples, rounded down) instead of a float
# that has to be truncated later.
b_experiment = [dataset_samples * percent // 100 for percent in (10, 30, 50)]

## Hand Motion Capture Data - Preprocessing

This code will automatically download the dataset if it doesn't already exist.

[Source dataset](https://archive.ics.uci.edu/dataset/405/motion+capture+hand+postures)

In [None]:
# Download and unpack the hand-posture archive only when the extracted CSV is
# missing. Checking for the CSV (rather than just the directory) lets the cell
# recover from an earlier run that created the folder but did not finish.
postures_csv = os.path.join("motion_capture_data", "Postures.csv")
if not os.path.exists(postures_csv):
    # Download the dataset
    url = "https://archive.ics.uci.edu/static/public/405/motion+capture+hand+postures.zip"
    # Use the path reported by wget instead of assuming the archive's file name.
    archive_path = wget.download(url)

    # Extract the dataset
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('motion_capture_data')

data = pd.read_csv(postures_csv)

# Cells holding the literal '?' are replaced with -inf; all other values are kept as read.
for col in data.columns:
    data[col] = data[col].apply(lambda x: -np.inf if x == '?' else x)

# Column 0 is used as the label; columns 1-10 are used as the features.
X = data.iloc[:, 1:11]
y = data.iloc[:, 0]
print(set(y))

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

## Generic Model Definition


### DBSCAN

Fit the DBSCAN algorithm to the hand motion data and plot the resulting clusters.

In [None]:
# Cluster the training features with DBSCAN and plot the first two feature columns.
db = DBSCAN(eps=0.9, min_samples=10)
db.fit(X_train)

# https://www.geeksforgeeks.org/dbscan-clustering-in-ml-density-based-clustering/
labels = db.labels_
#labels = y_train

# Boolean mask marking which samples DBSCAN reported as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
print("Labels:", set(labels))

# Sorted so the plotting order (and therefore the legend) is deterministic.
unique_labels = sorted(set(labels))

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)

# Plot result
# One distinct color per real cluster; black is reserved for noise (-1).
# Drawing colors from an iterator only for non-noise labels guarantees every
# label is plotted (zipping labels against a shorter color list would silently
# drop one of them).
cluster_colors = iter(distinctipy.get_colors(n_clusters_))
for k in unique_labels:
    col = 'k' if k == -1 else next(cluster_colors)

    class_member_mask = (labels == k)

    # Core samples of this label (no legend entry, to avoid duplicates).
    xy = X_train[class_member_mask & core_samples_mask].values
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k',
             markersize=6)

    # Non-core samples of this label; this call carries the legend entry.
    xy = X_train[class_member_mask & ~core_samples_mask].values
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k',
             markersize=6,
             label=f'Class: {k}')

# Report how many samples were labeled as noise.
noise_counter = int(np.sum(labels == -1))
print("Noise (labeled -1):", noise_counter)
plt.title('number of clusters: %d' % n_clusters_)
plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
plt.show()

### NLP

In [None]:
# 1. Load the Dataset (shuffled with a fixed seed so the subsets are reproducible)
dataset = load_dataset("imdb").shuffle(seed=1337)

# 2. Tokenize the Dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize a batch of examples' "text" field, padded/truncated to the model's max length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def _prepare_split(split_name):
    """Return the first `dataset_samples` rows of a split, tokenized and formatted for PyTorch.

    The subset is taken *before* tokenizing so only the rows that are actually
    used get tokenized, instead of every row of every split.
    """
    subset = dataset[split_name].select(range(dataset_samples))
    subset = subset.map(tokenize_function, batched=True)

    # Rename columns for compatibility with Hugging Face Trainer
    subset = subset.rename_column("label", "labels")

    # Set the format for PyTorch
    subset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return subset

# 3. Split Dataset
train_dataset = _prepare_split("train")
eval_dataset = _prepare_split("test")

# Kept so that `tokenized_dataset["train"]` / `["test"]` lookups still resolve.
tokenized_dataset = {"train": train_dataset, "test": eval_dataset}

print(train_dataset)
print(eval_dataset)

## Setting up the BERT Model

In [None]:
# 4. Load Pre-trained BERT Model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# The Trainer needs a model that returns a loss when "labels" are supplied.
# The bare BertModel encoder has no classification head, so the sequence
# classification variant (2 labels: positive / negative review) is used here.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 5. Set Training Arguments
# NOTE(review): newer transformers releases appear to name the first strategy
# argument `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    report_to="none"  # Disable wandb integration
)

# 6. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# The remaining steps are intentionally left disabled; uncomment to fine-tune
# with the Trainer instead of the custom loop in the Task Model section.

# 7. Train the Model
# trainer.train()

# 8. Evaluate the Model
# trainer.evaluate()

# 9. Save the Model
# trainer.save_model("./bert-finetuned-imdb")
# tokenizer.save_pretrained("./bert-finetuned-imdb")

# 10. Inference
# from transformers import pipeline

# nlp = pipeline("text-classification", model="./bert-finetuned-imdb", tokenizer="./bert-finetuned-imdb")
# result = nlp("This movie was fantastic! The characters and plot were so engaging.")
# print(result)

## FreeSel Functions

In [None]:
def distance_based_selection(semantic_patterns, b, distance_function):
    """
    Implements Algorithm 2: Distance-based Selection.

    Starting from one randomly chosen image, repeatedly samples a semantic
    pattern from the not-yet-selected images with probability proportional to
    its squared distance to the closest already-selected pattern, and adds the
    image owning that pattern to the selection.

    Args:
        semantic_patterns (dict): Dictionary where keys are image IDs, and values are lists of semantic patterns
                                  (e.g., feature vectors) for each image.
        b (int): Total annotation budget size, i.e., the number of images to select. Values larger than the
                 number of available images are capped at that number. As before, a budget below 1 still
                 returns the single randomly chosen seed image.
        distance_function (callable): A function to compute the distance between two semantic patterns
                                      (called with two numpy arrays).

    Returns:
        list: Selected image pool `S_I` (image IDs, in no particular order). Empty when
        `semantic_patterns` is empty.
    """
    all_image_ids = list(semantic_patterns.keys())
    if not all_image_ids:
        return []

    # Asking for more images than exist can never be satisfied; cap the budget
    # so the loop terminates once every image has been selected.
    budget = min(b, len(all_image_ids))

    # Initialize S_I with a random image I_0
    I_0 = random.choice(all_image_ids)
    selected_images = {I_0}

    # Initialize S_K with all semantic patterns in I_0 (tuples so they can be hashed)
    selected_patterns = {tuple(pattern) for pattern in semantic_patterns[I_0]}

    while len(selected_images) < budget:
        # Convert the selected patterns once per round instead of once per comparison.
        selected_arrays = [np.array(chosen) for chosen in selected_patterns]

        distances = []
        candidate_images = []
        for image_id, patterns in semantic_patterns.items():
            if image_id in selected_images:
                continue
            for pattern in patterns:
                candidate = np.array(pattern)
                # Minimum squared distance to S_K; 0.0 when S_K is still empty.
                min_distance = min(
                    (distance_function(candidate, chosen) ** 2 for chosen in selected_arrays),
                    default=0.0,
                )
                distances.append(min_distance)
                candidate_images.append(image_id)

        if not distances:
            # None of the remaining images carries a pattern: pick one uniformly.
            remaining = [image_id for image_id in all_image_ids if image_id not in selected_images]
            sampled_image = random.choice(remaining)
        else:
            weights = np.asarray(distances, dtype=float)
            total = weights.sum()
            if total > 0 and np.isfinite(total):
                # Normalize distances into probabilities
                probabilities = weights / total
            else:
                # All candidates coincide with selected patterns (or the sum is not
                # finite); dividing would yield NaNs, so fall back to uniform sampling.
                probabilities = np.full(len(weights), 1.0 / len(weights))

            # Sample a semantic pattern and take the image that contains it
            sampled_index = np.random.choice(len(probabilities), p=probabilities)
            sampled_image = candidate_images[sampled_index]

        # Add the image to S_I and all of its semantic patterns to S_K
        selected_images.add(sampled_image)
        selected_patterns.update(tuple(pattern) for pattern in semantic_patterns[sampled_image])

    return list(selected_images)


In [None]:
def get_similarity_matrix(attention_map, cls_attention, tau=0.5, d0=2):
    """
    Generate a similarity matrix for text tokens using attention scores.

    Tokens are ranked by their [CLS] attention and the top-ranked ones are kept
    until their cumulative attention exceeds `tau`. The attention between kept
    tokens is retained only for pairs at most `d0` positions apart and is then
    symmetrized.

    Parameters:
    - attention_map (torch.Tensor): Token-to-token attention scores (seq_len x seq_len).
    - cls_attention (torch.Tensor): [CLS] token attention scores (seq_len).
    - tau (float): Maintenance ratio for filtering important regions.
    - d0 (int): Neighborhood threshold (max token distance).

    Returns:
    - similarity_matrix (numpy.ndarray): Filtered similarity matrix of shape
      (seq_len, seq_len). Row/column `r` corresponds to the token with the
      r-th highest [CLS] attention (not to token position `r`); rows and
      columns beyond the number of kept tokens are zero.
    """
    # Step 1: Select top tokens based on CLS attention
    sorted_indices = torch.argsort(cls_attention, descending=True)
    cumulative_attention = torch.cumsum(cls_attention[sorted_indices], dim=0)
    t = torch.sum(cumulative_attention <= tau).item() + 1
    selected_indices = sorted_indices[:t]
    # `t` may exceed seq_len when tau covers all attention; use the real count.
    num_selected = len(selected_indices)

    # Step 2: Initialize the (fixed-size) similarity matrix with zeros
    seq_len = len(cls_attention)
    similarity_matrix = np.zeros((seq_len, seq_len))

    # Step 3: Filter attention map down to the selected tokens
    filtered_attention = attention_map[selected_indices][:, selected_indices].numpy()

    # Step 4: Apply neighborhood constraint. A single vectorized mask replaces
    # the nested Python loop over every pair of selected tokens.
    positions = selected_indices.numpy()
    within_neighborhood = np.abs(positions[:, None] - positions[None, :]) <= d0
    similarity_matrix[:num_selected, :num_selected] = np.where(
        within_neighborhood, filtered_attention, 0.0
    )

    # Step 5: Symmetrize so the matrix can be used as an affinity matrix
    similarity_matrix = (similarity_matrix + similarity_matrix.T) / 2

    return similarity_matrix



In [None]:
def spectral_clustering(data, num_clusters=1):
    """
    Cluster the rows of a precomputed affinity matrix with spectral clustering.

    Parameters:
    - data: Square affinity (similarity) matrix; it is passed to scikit-learn
      with affinity='precomputed', i.e. used as-is rather than derived from features.
    - num_clusters (int): Number of clusters to form.

    Returns:
    - The cluster label assigned to each row of `data`.
    """
    # NOTE(review): n_neighbors presumably has no effect with a precomputed
    # affinity - confirm against the scikit-learn documentation.
    clusterer = SpectralClustering(
        random_state=1337,  # fixed seed so repeated runs give the same labels
        n_neighbors=5,
        affinity='precomputed',
        n_clusters=num_clusters,
    )
    fitted = clusterer.fit(data)
    return fitted.labels_

def euclidean_distance(vec1, vec2):
    """Return the norm (numpy's default: L2 for vectors) of the difference `vec1 - vec2`."""
    difference = vec1 - vec2
    return np.linalg.norm(difference)

def get_cls_attention_map(model, inputs):
    """
    Run one sample through the model and return its last-layer attention,
    averaged over attention heads.

    Parameters:
    - model: Pretrained transformer (e.g., BERT) that returns `attentions`
      when called with output_attentions=True.
    - inputs: Mapping with 'input_ids' and 'attention_mask' tensors for a
      single, un-batched sample.

    Returns:
    - patch_to_patch_attention: Head-averaged attention between all tokens (seq_len x seq_len).
    - cls_attention_avg: Head-averaged attention row of the first ([CLS]) token (seq_len).
    """
    # The model expects a batch axis, so wrap the single sample in a batch of one.
    batched = {
        field: inputs[field].unsqueeze(0)
        for field in ('input_ids', 'attention_mask')
    }

    # Inference only: no gradients are needed for extracting attention.
    with torch.no_grad():
        outputs = model(output_attentions=True, **batched)

    # Last layer attention, shape: (batch_size, num_heads, seq_len, seq_len)
    final_attention = outputs.attentions[-1]

    # Average over heads, then drop the batch axis.
    token_attention = final_attention.mean(dim=1).squeeze(0)

    # Row 0 of every head is the attention paid by the first ([CLS]) token.
    cls_rows = final_attention[:, :, 0, :]
    cls_attention_avg = cls_rows.mean(dim=1).squeeze(0)

    return token_attention, cls_attention_avg

def freesel(dataset, pretrained_transformer, tokenizer, tau, K, b):
    """
    FreeSel pipeline for text-based datasets.

    For every sample, extracts attention from the transformer, builds a
    filtered similarity matrix, clusters it, and stores the resulting label
    vector as that sample's single semantic pattern. The samples to annotate
    are then chosen with distance-based selection.

    Parameters:
    - dataset: Iterable (with a length) of tokenized samples, each providing
      'input_ids' and 'attention_mask'.
    - pretrained_transformer: Pretrained transformer model (e.g., BERT).
    - tokenizer: Tokenizer corresponding to the transformer model (accepted
      for interface completeness; not used inside this function).
    - tau (float): Maintenance ratio for filtering.
    - K (int): Number of clusters (semantic patterns).
    - b (int): Total annotation budget size.

    Returns:
    - list: Positions (indices into `dataset`) of the selected samples.
    """
    semantic_pool = defaultdict(list)

    progress = tqdm(enumerate(dataset), total=len(dataset), desc="Processing")
    for sample_index, sample in progress:
        # Head-averaged token-to-token and [CLS] attention for this sample
        token_attention, cls_scores = get_cls_attention_map(pretrained_transformer, sample)

        # Affinity between the most attended tokens
        affinity = get_similarity_matrix(token_attention, cls_scores, tau=tau)

        # The cluster label vector is the sample's one semantic pattern
        cluster_labels = spectral_clustering(affinity, K)
        semantic_pool[sample_index] = [cluster_labels.tolist()]

    return distance_based_selection(
        semantic_pool,
        b=b,
        distance_function=euclidean_distance,
    )

## Running FreeSel

In [None]:
# Pre-trained BERT encoder (configured to expose its attention maps) and its tokenizer
model_name = "bert-base-uncased"
bert_model = BertModel.from_pretrained(model_name, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Single demonstration run of FreeSel; the spectral-embedding connectivity
# warning is silenced only for the duration of the call.
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        message="Graph is not fully connected, spectral embedding may not work as expected.",
    )
    img_pool = freesel(
        train_dataset,
        bert_model,
        tokenizer,
        tau=0.5,
        K=3,
        b=5,
    )

# Keep only the samples FreeSel picked
simplified_train_dataset = train_dataset.select(img_pool)

## Task Model

In [None]:
def task_model(simplified_train_dataset, eval_dataset, epochs=3, batch_size=16,
               learning_rate=5e-5, device="cpu"):
    """
    Fine-tune a BERT sequence classifier on the selected samples and report
    validation metrics after every epoch.

    Parameters:
    - simplified_train_dataset: Training samples (the FreeSel selection); each
      batch must be a mapping of tensors accepted by the model, including "labels".
    - eval_dataset: Validation samples in the same format.
    - epochs (int): Number of passes over the training data.
    - batch_size (int): Batch size for both training and validation.
    - learning_rate (float): Initial learning rate of the optimizer.
    - device (str): Device the model and batches are moved to.

    Returns:
    - list of dict: One entry per epoch with keys "epoch", "val_loss" and
      "val_accuracy". The same metrics are also printed.

    Relies on the module-level `model_name` to choose the pretrained checkpoint.
    """
    train_dataloader = DataLoader(simplified_train_dataset, shuffle=True, batch_size=batch_size)
    val_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

    # Define the model for binary classification (num_labels=2) positive/negative review
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.to(device)

    # Optimizer and scheduler.
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set explicitly to match that class's defaults.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
    # Derived from `epochs` so the linear schedule always spans the whole run.
    num_training_steps = len(train_dataloader) * epochs
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    history = []
    progress_bar = tqdm(range(num_training_steps))

    # Training Loop
    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Validation step
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                val_loss += outputs.loss.item()  # Accumulate validation loss

                # Get predictions
                preds = torch.argmax(outputs.logits, dim=-1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(batch["labels"].cpu().numpy())

        avg_val_loss = val_loss / len(val_dataloader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        history.append({"epoch": epoch + 1, "val_loss": avg_val_loss, "val_accuracy": val_accuracy})

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(classification_report(all_labels, all_preds))

    # Release the progress bar so consecutive runs do not leave stale bars behind.
    progress_bar.close()

    return history

## Experimentation

In [None]:
# Sweep every (tau, K, budget) combination: tau varies slowest, budget fastest.
sweep = (
    (tau_value, k_value, b_value)
    for tau_value in tau_experiment
    for k_value in k_experiment
    for b_value in b_experiment
)

for tau, k, b in sweep:
    # Time only the FreeSel selection, with the spectral-embedding warning silenced.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding may not work as expected.",
        )
        start_time = time.time()
        img_pool = freesel(train_dataset, bert_model, tokenizer, tau=tau, K=k, b=int(b))
        end_time = time.time()

    # Train and evaluate the task model on the selected samples only.
    simplified_train_dataset = train_dataset.select(img_pool)
    task_model(simplified_train_dataset, eval_dataset)

    print(f"Sample size: {dataset_samples}, tau={tau}, K={k}, b={int(b)}")
    print(f"Time taken in Freesel: {end_time - start_time:.2f} seconds")
    print("------------------------------------")