In [1]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification
import pandas as pd

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
import logging
from transformers import AutoTokenizer

logging.getLogger("mlflow").setLevel(logging.ERROR)

In [2]:
# Label columns of the validation CSV, in the order used for the multi-hot label vector.
categories = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
]

# Report whether a CUDA device is available in this environment.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device : {device}")

device : cuda


In [3]:
# Load the cleaned validation split and count positives per category column.
val_df = pd.read_csv("data/val_cleaned.csv")
total_samples = val_df.shape[0]
categories_distribution = val_df.loc[:, categories].sum()
categories_distribution

Computer Science        1719
Physics                 1203
Mathematics             1124
Statistics              1041
Quantitative Biology     117
Quantitative Finance      50
dtype: int64

In [4]:
def combine_labels(batch):
    """Merge the per-category columns of a batch into one label vector per row.

    `batch` is indexed by each name in the module-level `categories` list.
    The per-category columns are stacked into an array and transposed so the
    category axis comes last.

    Returns:
        dict with a single key "labels" holding the transposed array as
        nested Python lists.
    """
    per_category = np.array([batch[name] for name in categories])
    # Transpose (category, row) -> (row, category) before converting to lists.
    return {"labels": np.transpose(per_category).tolist()}

In [5]:
from datasets import load_dataset, Sequence, Value

# Read the validation CSV (exposed under the default "train" split) and attach
# the combined "labels" column produced by combine_labels.
val_dataset = (
    load_dataset("csv", data_files="data/val_cleaned.csv")["train"]
    .map(combine_labels, batched=True, batch_size=64)
)

# Re-declare "labels" as a float32 sequence and cast the dataset to that schema.
new_features = val_dataset.features.copy()
new_features["labels"] = Sequence(Value("float32"))
val_dataset = val_dataset.cast(new_features)

In [6]:
# Root of the fine-tuned checkpoint; the model and tokenizer are loaded from
# its "model" and "tokenizer" sub-directories respectively.
BEST_MODEL_DIR = "./best_model/distill_bert_layer_5/fine_tune_pos_smoothed"

trained_model = AutoModelForSequenceClassification.from_pretrained(f"{BEST_MODEL_DIR}/model")
trained_model.eval()  # switch the module to evaluation mode for inference
saved_tokenizer = AutoTokenizer.from_pretrained(f"{BEST_MODEL_DIR}/tokenizer")


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

In [28]:
# Per-category decision thresholds saved alongside the fine-tuned model.
fine_tuned_thresholds = pd.read_csv("best_model/distill_bert_layer_5/fine_tune_pos_smoothed/fine_tune_threshold.csv")

# Drop singleton axes, then add a leading axis so the thresholds broadcast
# against a batch of per-category probabilities.
thresholds_tensor = (
    torch.tensor(fine_tuned_thresholds.values, dtype=torch.float32)
    .squeeze()
    .unsqueeze(0)
)
thresholds_tensor.shape

torch.Size([1, 6])

In [12]:
from torch.nn import BCEWithLogitsLoss


def get_loss(batch):
    """Run the trained model on one batch and return per-label loss and predictions.

    Args:
        batch: mapping of column name -> list of values for the batch. Only the
            keys listed in `saved_tokenizer.model_input_names` are fed to the
            model; `batch["labels"]` supplies the targets.

    Returns:
        dict with
          "loss": element-wise BCE-with-logits loss (no reduction), as a numpy array,
          "predicted_label": 0/1 int array, 1 where sigmoid(logit) exceeds the
              matching entry of `thresholds_tensor`.
    """
    # Follow whatever device the model currently lives on.
    model_device = trained_model.device
    inputs = {k: torch.tensor(v).to(model_device) for k, v in batch.items()
              if k in saved_tokenizer.model_input_names}

    with torch.no_grad():
        output = trained_model(**inputs)
        # Keep the logits un-flattened so they broadcast against the
        # thresholds and line up element-wise with the labels.
        logits = output.logits.cpu()
        labels = torch.tensor(batch["labels"], dtype=torch.float32)
        pred_label = (torch.sigmoid(logits) > thresholds_tensor).int().numpy()
        # BCEWithLogitsLoss is a module: configure it first, then call it.
        loss = BCEWithLogitsLoss(reduction="none")(logits, labels).numpy()
    return {"loss": loss,
            "predicted_label": pred_label}

In [10]:
def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, padded/truncated to 512 tokens."""
    return saved_tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


tokenized_dataset = val_dataset.map(_tokenize_batch, batched=True, batch_size=64)


In [1]:
from torch.nn import BCEWithLogitsLoss

# reduction="none" keeps one loss value per (sample, label) instead of a batch mean.
criterion = BCEWithLogitsLoss(reduction="none")


def get_loss(batch):
    """Run the trained model on one batch and return per-label loss and predictions.

    Args:
        batch: mapping of column name -> list of values for the batch. Only the
            keys listed in `saved_tokenizer.model_input_names` are fed to the
            model; `batch["labels"]` supplies the targets.

    Returns:
        dict with
          "loss": element-wise BCE-with-logits loss from `criterion`, as a numpy array,
          "predicted_label": 0/1 int array, 1 where sigmoid(logit) exceeds the
              matching entry of `thresholds_tensor`.
    """
    # Follow whatever device the model currently lives on.
    model_device = trained_model.device
    inputs = {k: torch.tensor(v).to(model_device) for k, v in batch.items()
              if k in saved_tokenizer.model_input_names}

    with torch.no_grad():
        output = trained_model(**inputs)
        # Bring logits back to CPU: thresholds and labels are CPU tensors.
        logits = output.logits.cpu()
        labels = torch.tensor(batch["labels"]).float()
        # logits is already a tensor; re-wrapping it with torch.tensor() only
        # copies it and triggers a UserWarning.
        pred_label = (torch.sigmoid(logits) > thresholds_tensor).int().numpy()
        loss = criterion(logits, labels).numpy()
    return {"loss": loss,
            "predicted_label": pred_label}

# Needs `tokenized_dataset` from the tokenization cell; running this cell on a
# fresh kernel before that one raises NameError.
loss_dataset = tokenized_dataset.map(get_loss, batched=True, batch_size=64)

# Return pandas objects when the dataset is indexed from here on.
loss_dataset.set_format(type="pandas")


NameError: name 'tokenized_dataset' is not defined

In [38]:
# Show the column schema of loss_dataset, including the "loss" and
# "predicted_label" columns added by get_loss.
loss_dataset.features

{'text': Value('large_string'),
 'Computer Science': Value('int64'),
 'Physics': Value('int64'),
 'Mathematics': Value('int64'),
 'Statistics': Value('int64'),
 'Quantitative Biology': Value('int64'),
 'Quantitative Finance': Value('int64'),
 'labels': List(Value('float32')),
 'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8')),
 'loss': List(Value('float32')),
 'predicted_label': List(Value('int32'))}

In [39]:
# Columns needed for the error analysis.
cols = ["text", "labels", "predicted_label", "loss"]
# Take an explicit copy: a new column is added to loss_df later, and assigning
# into a column subset of another frame can raise SettingWithCopyWarning.
loss_df = loss_dataset[:][cols].copy()
loss_df.head()

Unnamed: 0,text,labels,predicted_label,loss
0,Title: Reconstructing Subject-Specific Effect ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1, 0, 0, 1, 0, 0]","[0.796936, 0.044219017, 0.10037255, 2.3919926,..."
1,Title: Spherical polyharmonics and Poisson ker...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0, 0, 1, 0, 0, 0]","[0.025695324, 0.04735899, 0.010436806, 0.00893..."
2,Title: On maximizing the fundamental frequency...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0, 0, 1, 0, 0, 0]","[0.07015228, 0.035301685, 0.018718144, 0.01687..."
3,Title: On the rotation period and shape of the...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.01576376, 0.016030129, 0.019452572, 0.01573..."
4,Title: SPH calculations of Mars-scale collisio...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.010409832, 0.0050907466, 0.010399342, 0.005..."


In [40]:
def get_mean_loss(loss):
    """Return `loss.mean()`, i.e. the average of one row's loss values."""
    return loss.mean()
# Reduce each row's "loss" value to a single scalar so rows can be sorted by it.
loss_df["mean_loss"] = loss_df["loss"].map(get_mean_loss)

In [41]:
# Preview the first rows of loss_df, now including the mean_loss column.
loss_df.head()

Unnamed: 0,text,labels,predicted_label,loss,mean_loss
0,Title: Reconstructing Subject-Specific Effect ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1, 0, 0, 1, 0, 0]","[0.796936, 0.044219017, 0.10037255, 2.3919926,...",0.613495
1,Title: Spherical polyharmonics and Poisson ker...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0, 0, 1, 0, 0, 0]","[0.025695324, 0.04735899, 0.010436806, 0.00893...",0.01715
2,Title: On maximizing the fundamental frequency...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0, 0, 1, 0, 0, 0]","[0.07015228, 0.035301685, 0.018718144, 0.01687...",0.024373
3,Title: On the rotation period and shape of the...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.01576376, 0.016030129, 0.019452572, 0.01573...",0.013393
4,Title: SPH calculations of Mars-scale collisio...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.010409832, 0.0050907466, 0.010399342, 0.005...",0.007002


In [44]:
# Rank samples from highest to lowest mean loss to surface the worst predictions first.
sorted_loss_df = loss_df.sort_values("mean_loss", ascending=False)
sorted_loss_df.head()

Unnamed: 0,text,labels,predicted_label,loss,mean_loss
530,Title: Density estimation on small datasets Ab...,"[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0, 0, 1, 1, 0, 0]","[2.0987656, 0.04851961, 1.4018383, 3.6892424, ...",1.935004
4167,Title: The Physics of Eccentric Binary Black H...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[4.854841, 5.4281616, 0.009446621, 0.007340908...",1.718461
321,Title: Large Magellanic Cloud Near-Infrared Sy...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.0053162575, 5.2972198, 0.010950565, 4.89094...",1.702543
2000,Title: A case study of hurdle and generalized ...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.008554935, 5.4563775, 0.008545876, 4.536570...",1.671222
1100,Title: Detecting Galaxy-Filament Alignments in...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0, 1, 0, 0, 0, 0]","[0.009424686, 5.3178744, 0.006726265, 4.54317,...",1.648524


In [52]:
# Print the text, true categories and predicted categories for the highest-loss rows.
# Bounded by the frame length so a short frame does not raise IndexError.
for i in range(min(10, len(sorted_loss_df))):
    row = sorted_loss_df.iloc[i]
    labels = row["labels"].astype(int)
    predicted_label = row["predicted_label"].astype(int)
    # Pair each flag with its category name instead of hard-coding the label count.
    true_label = [name for name, flag in zip(categories, labels) if flag == 1]
    p_labels = [name for name, flag in zip(categories, predicted_label) if flag == 1]
    print(f"{row['text']}: True Labels: {true_label}: Predicted_labels: {p_labels}")
    print(f"{'='*50}")

Title: Density estimation on small datasets Abstract: How might a smooth probability distribution be estimated, with accurately quantified uncertainty, from a limited amount of sampled data? Here we describe a field-theoretic approach that addresses this problem remarkably well in one dimension, providing an exact nonparametric Bayesian posterior without relying on tunable parameters or large-data approximations. Strong non-Gaussian constraints, which require a non-perturbative treatment, are found to play a major role in reducing distribution uncertainty. A software implementation of this method is provided.: True Labels: ['Computer Science', 'Quantitative Biology']: Predicted_labels: ['Mathematics', 'Statistics']
Title: The Physics of Eccentric Binary Black Hole Mergers. A Numerical Relativity Perspective Abstract: Gravitational wave observations of eccentric binary black hole mergers will provide unequivocal evidence for the formation of these systems through dynamical assembly in d