In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset paths used by the cells below are reachable (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Make the labelled reference-paper folder the working directory.
# NOTE(review): the loader calls further down pass absolute paths, so they do
# not depend on this working directory.
os.chdir('/content/drive/MyDrive/KDSH_2025_Dataset/Reference')

In [None]:
!pip install pdfplumber --quiet

In [None]:
import re
def clean_text(text):
    """Normalise text extracted from a PDF into one whitespace-clean line.

    Steps:
      1. Collapse every run of whitespace (spaces, tabs, newlines) to a
         single space, so page markers split across lines are joined.
      2. Remove "Page <number>" markers.
      3. Collapse whitespace again and strip the ends, because removing a
         marker leaves a double space or a leading/trailing space behind.

    The previous version also applied ``^[A-Za-z0-9,\\s]+$`` to "remove
    headers". Since the text is already a single line at that point, the
    pattern could only match the *entire* document, so it never removed a
    header and instead blanked any document free of punctuation. That step
    is intentionally dropped.

    Args:
        text: Raw extracted text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``''`` when there is nothing to clean.
    """
    if not text:
        return ''

    text = re.sub(r'\s+', ' ', text)       # one line, single-spaced
    text = re.sub(r'Page \d+', '', text)   # drop "Page X" markers
    # Re-collapse the gaps left by the removed markers and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
import os
import pdfplumber

def read_pdfs_from_folder(folder_path):
    """Extract and clean the text of every labelled reference PDF.

    Looks for the sub-folders ``Publishable`` and ``Non-Publishable`` under
    ``folder_path`` and walks each one recursively.

    Args:
        folder_path: Directory that contains the two label sub-folders.

    Returns:
        dict with the keys ``"Publishable"`` and ``"Non-Publishable"``; each
        value is a list with one cleaned text string per PDF found. A missing
        sub-folder is reported with a printed message and leaves its list
        empty.
    """
    extracted_info = {"Publishable": [], "Non-Publishable": []}

    for subfolder in ['Non-Publishable', 'Publishable']:
        subfolder_path = os.path.join(folder_path, subfolder)

        if not os.path.exists(subfolder_path):
            print(f"No Folder named {subfolder}")
            continue

        for root, dirs, files in os.walk(subfolder_path):
            # Sort in place so the traversal (and therefore the order of the
            # returned texts, which a seeded split depends on) is repeatable.
            dirs.sort()
            # Match the extension case-insensitively so "X.PDF" is not skipped.
            pdf_files = sorted(f for f in files if f.lower().endswith('.pdf'))

            for pdf_file in pdf_files:
                file_path = os.path.join(root, pdf_file)

                with pdfplumber.open(file_path) as pdf:
                    # A page without extractable text can yield None/empty;
                    # `or ""` keeps one such page from aborting the whole run.
                    page_texts = [page.extract_text() or "" for page in pdf.pages]

                # Join with a newline so the last word of one page is not
                # fused with the first word of the next.
                extracted_info[subfolder].append(clean_text("\n".join(page_texts)))

    return extracted_info

In [None]:
# Extract and clean every labelled reference paper, grouped by class.
data = read_pdfs_from_folder('/content/drive/MyDrive/KDSH_2025_Dataset/Reference')

# Show only how many papers were loaded per class; echoing `data` itself
# would print the full text of every paper into the notebook output.
{label: len(papers) for label, papers in data.items()}

{'Publishable': ['Detailed Action Identification in Baseball Game Recordings Abstract ThisresearchintroducesMLB-YouTube,anewandcomplexdatasetcreatedfor nuanced activity recognition in baseball videos. This dataset is structured to supporttwotypesofanalysis: oneforclassifyingactivitiesinsegmentedvideos andanotherfordetectingactivitiesinunsegmented,continuousvideostreams. This studyevaluatesseveralmethodsforrecognizingactivities,focusingonhowthey capture the temporal organization of activities in videos. This evaluation starts with categorizing segmented videos and progresses to applying these methods tocontinuousvideofeeds. Additionally,thispaperassessestheeffectivenessof different models in the challenging task of forecasting pitch velocity and type usingbaseballbroadcastvideos. Thefindingsindicatethatincorporatingtemporal dynamicsintomodelsisbeneficialfordetailedactivityrecognition. 1 Introduction Actionrecognition,asignificantproblemincomputervision,findsextensiveuseinsports. Profes-

In [None]:
def read_test_pdfs_from_folder(folder_path):
    """Extract and clean the text of every unlabelled PDF under ``Papers``.

    Args:
        folder_path: Directory that contains the ``Papers`` sub-folder, which
            is walked recursively.

    Returns:
        dict mapping each document name (file name without its extension) to
        its cleaned text, in sorted file-name order within each directory.
        If two PDFs in different sub-directories share a file name, the one
        visited later overwrites the earlier entry. A missing ``Papers``
        folder is reported with a printed message and yields an empty dict.
    """
    extracted_info = {}

    for subfolder in ['Papers']:
        subfolder_path = os.path.join(folder_path, subfolder)

        if not os.path.exists(subfolder_path):
            print(f"No Folder named {subfolder}")
            continue

        for root, dirs, files in os.walk(subfolder_path):
            # Sort in place so documents are visited in a repeatable order.
            dirs.sort()
            # Match the extension case-insensitively so "X.PDF" is not skipped.
            pdf_files = sorted(f for f in files if f.lower().endswith('.pdf'))

            for pdf_file in pdf_files:
                doc_name = os.path.splitext(pdf_file)[0]  # drop the extension
                file_path = os.path.join(root, pdf_file)

                with pdfplumber.open(file_path) as pdf:
                    # A page without extractable text can yield None/empty;
                    # `or ""` keeps one such page from aborting the whole run.
                    page_texts = [page.extract_text() or "" for page in pdf.pages]

                # Newline between pages so words are not fused across pages.
                extracted_info[doc_name] = clean_text("\n".join(page_texts))

    return extracted_info

In [None]:
# Extract the unlabelled papers found under <dataset>/Papers; the result maps
# each document name (file name without extension) to its cleaned text.
test = read_test_pdfs_from_folder('/content/drive/MyDrive/KDSH_2025_Dataset')
# Number of documents that were loaded.
len(test)

135

In [None]:
# Sanity check: the class names present in the extracted reference data.
data.keys()

dict_keys(['Publishable', 'Non-Publishable'])

In [None]:
!pip install transformers datasets --quiet

In [None]:
# Define a custom dataset for PyTorch
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
class TextDataset(Dataset):
    """Dataset that tokenizes one document on every lookup.

    Indexing returns a dict with three tensors: ``input_ids`` and
    ``attention_mask`` (the tokenizer output, flattened) and ``labels``
    (the integer label as a ``torch.long`` tensor). Documents are padded or
    truncated to ``max_length`` tokens by the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # texts and labels are parallel sequences: labels[i] belongs to texts[i].
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        document = str(self.texts[item])
        target = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            document,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tokenizer tensors so each sample is 1-D.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Flatten the per-class dictionary into two parallel lists.
# Label convention: 1 = Publishable, 0 = Non-Publishable.
texts = data["Publishable"] + data["Non-Publishable"]
labels = [1] * len(data["Publishable"]) + [0] * len(data["Non-Publishable"])

# Hold out 20% of the papers for validation. The set is tiny and imbalanced,
# so the split is stratified on the label: without it the validation fold can
# end up containing a single class, which makes its metrics meaningless.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

# Seed torch's global RNG so the randomly initialised classification head and
# the DataLoader shuffling order are repeatable (same seed as the data split).
torch.manual_seed(42)

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create train and validation datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Create data loaders (one document per batch; only training is shuffled)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# Load pre-trained BERT with a 2-class sequence-classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set the optimizer (no learning-rate scheduler is used).
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation;
# eps and weight_decay are set explicitly to the deprecated class's defaults
# so the optimisation settings stay the same.
# NOTE(review): the `AdamW` name in the `from transformers import ...` line is
# no longer used by this cell and can be dropped from that import.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Training loop
def train(model, train_loader, val_loader, optimizer, device, epochs=5):
    """Fine-tune ``model`` and print training/validation metrics every epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...,
            labels=...)`` returning an object with ``.loss`` and ``.logits``.
        train_loader: Iterable of batches, each a dict with the keys
            ``input_ids``, ``attention_mask`` and ``labels``.
        val_loader: Same batch format, used for evaluation only.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over the training data.

    Returns:
        None. Loss, accuracy, the classification report and the confusion
        matrix are printed. The report and matrix describe the validation
        set only.
    """

    def _forward(batch):
        """Move one batch to ``device`` and return (loss, predictions, labels)."""
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, torch.argmax(outputs.logits, dim=-1), labels

    for epoch in range(epochs):
        # ---- Training phase ----
        model.train()
        total_train_loss = 0.0
        correct_train_preds = 0
        seen_train = 0  # samples processed; the accuracy denominator

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()

            loss, preds, labels = _forward(batch)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            correct_train_preds += (preds == labels).sum().item()
            seen_train += labels.size(0)

        # ---- Validation phase ----
        model.eval()
        total_val_loss = 0.0
        correct_val_preds = 0
        # Only validation samples go into the report. Mixing in predictions
        # made during training made the report disagree with the printed
        # validation accuracy.
        val_true, val_pred = [], []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                loss, preds, labels = _forward(batch)

                total_val_loss += loss.item()
                correct_val_preds += (preds == labels).sum().item()
                # Flat Python ints, so any batch size works with sklearn.
                val_pred.extend(preds.cpu().tolist())
                val_true.extend(labels.cpu().tolist())

        # Sample counts come from the data actually seen rather than from
        # notebook globals; max(..., 1) guards against an empty loader.
        avg_train_loss = total_train_loss / max(len(train_loader), 1)
        avg_val_loss = total_val_loss / max(len(val_loader), 1)
        train_accuracy = correct_train_preds / max(seen_train, 1)
        val_accuracy = correct_val_preds / max(len(val_true), 1)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        if val_true:
            # zero_division=0 reports 0.0 for a class that was never predicted
            # instead of emitting an undefined-metric warning on every epoch.
            print(f"Classification Report:\n{classification_report(val_true, val_pred, zero_division=0)}")
            print(f"Confusion Matrix:\n{confusion_matrix(val_true, val_pred)}")

# Train the model for 20 epochs; train() prints the metrics after each epoch.
train(model, train_loader, val_loader, optimizer, device, epochs=20)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/20: 100%|██████████| 12/12 [00:06<00:00,  1.97it/s]
Validation: 100%|██████████| 3/3 [00:01<00:00,  2.86it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, m

Epoch 1/20
Training Loss: 0.7310
Training Accuracy: 0.6667
Validation Loss: 0.6876
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      1.00      0.80        10

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15

Confusion Matrix:
[[ 0  5]
 [ 0 10]]


Training Epoch 2/20: 100%|██████████| 12/12 [00:04<00:00,  2.88it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.86it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/20
Training Loss: 0.6663
Training Accuracy: 0.6667
Validation Loss: 0.6234
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      1.00      0.80        10

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15

Confusion Matrix:
[[ 0  5]
 [ 0 10]]


Training Epoch 3/20: 100%|██████████| 12/12 [00:04<00:00,  2.90it/s]
Validation: 100%|██████████| 3/3 [00:01<00:00,  2.85it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/20
Training Loss: 0.6284
Training Accuracy: 0.6667
Validation Loss: 0.5948
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      1.00      0.80        10

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15

Confusion Matrix:
[[ 0  5]
 [ 0 10]]


Training Epoch 4/20: 100%|██████████| 12/12 [00:05<00:00,  2.20it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.97it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/20
Training Loss: 0.6101
Training Accuracy: 0.6667
Validation Loss: 0.5750
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      1.00      0.80        10

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15

Confusion Matrix:
[[ 0  5]
 [ 0 10]]


Training Epoch 5/20: 100%|██████████| 12/12 [00:04<00:00,  2.90it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.84it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/20
Training Loss: 0.5768
Training Accuracy: 0.6667
Validation Loss: 0.5005
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      1.00      0.80        10

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15

Confusion Matrix:
[[ 0  5]
 [ 0 10]]


Training Epoch 6/20: 100%|██████████| 12/12 [00:05<00:00,  2.31it/s]
Validation: 100%|██████████| 3/3 [00:01<00:00,  2.49it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/20
Training Loss: 0.5333
Training Accuracy: 0.6667
Validation Loss: 0.4996
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      1.00      0.80        10

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15

Confusion Matrix:
[[ 0  5]
 [ 0 10]]


Training Epoch 7/20: 100%|██████████| 12/12 [00:04<00:00,  2.91it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.88it/s]


Epoch 7/20
Training Loss: 0.4159
Training Accuracy: 0.8333
Validation Loss: 0.5640
Validation Accuracy: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.77      1.00      0.87        10

    accuracy                           0.80        15
   macro avg       0.88      0.70      0.72        15
weighted avg       0.85      0.80      0.77        15

Confusion Matrix:
[[ 2  3]
 [ 0 10]]


Training Epoch 8/20: 100%|██████████| 12/12 [00:04<00:00,  2.89it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.37it/s]


Epoch 8/20
Training Loss: 0.3723
Training Accuracy: 0.8333
Validation Loss: 0.3384
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.83      1.00      0.91        10

    accuracy                           0.87        15
   macro avg       0.92      0.80      0.83        15
weighted avg       0.89      0.87      0.86        15

Confusion Matrix:
[[ 3  2]
 [ 0 10]]


Training Epoch 9/20: 100%|██████████| 12/12 [00:05<00:00,  2.14it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.88it/s]


Epoch 9/20
Training Loss: 0.2974
Training Accuracy: 0.9167
Validation Loss: 0.3075
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.91      1.00      0.95        10

    accuracy                           0.93        15
   macro avg       0.95      0.90      0.92        15
weighted avg       0.94      0.93      0.93        15

Confusion Matrix:
[[ 4  1]
 [ 0 10]]


Training Epoch 10/20: 100%|██████████| 12/12 [00:04<00:00,  2.87it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.76it/s]


Epoch 10/20
Training Loss: 0.2175
Training Accuracy: 1.0000
Validation Loss: 0.1933
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 11/20: 100%|██████████| 12/12 [00:04<00:00,  2.51it/s]
Validation: 100%|██████████| 3/3 [00:01<00:00,  2.20it/s]


Epoch 11/20
Training Loss: 0.1627
Training Accuracy: 1.0000
Validation Loss: 0.2382
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 12/20: 100%|██████████| 12/12 [00:04<00:00,  2.83it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.80it/s]


Epoch 12/20
Training Loss: 0.1485
Training Accuracy: 0.9167
Validation Loss: 0.1141
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.91      1.00      0.95        10

    accuracy                           0.93        15
   macro avg       0.95      0.90      0.92        15
weighted avg       0.94      0.93      0.93        15

Confusion Matrix:
[[ 4  1]
 [ 0 10]]


Training Epoch 13/20: 100%|██████████| 12/12 [00:04<00:00,  2.90it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.89it/s]


Epoch 13/20
Training Loss: 0.0863
Training Accuracy: 1.0000
Validation Loss: 0.0979
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 14/20: 100%|██████████| 12/12 [00:06<00:00,  1.97it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.90it/s]


Epoch 14/20
Training Loss: 0.0793
Training Accuracy: 1.0000
Validation Loss: 0.0957
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 15/20: 100%|██████████| 12/12 [00:04<00:00,  2.87it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.86it/s]


Epoch 15/20
Training Loss: 0.0498
Training Accuracy: 1.0000
Validation Loss: 0.0791
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 16/20: 100%|██████████| 12/12 [00:05<00:00,  2.31it/s]
Validation: 100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 16/20
Training Loss: 0.0351
Training Accuracy: 1.0000
Validation Loss: 0.0488
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 17/20: 100%|██████████| 12/12 [00:04<00:00,  2.78it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.97it/s]


Epoch 17/20
Training Loss: 0.0286
Training Accuracy: 1.0000
Validation Loss: 0.0380
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 18/20: 100%|██████████| 12/12 [00:04<00:00,  2.86it/s]
Validation: 100%|██████████| 3/3 [00:01<00:00,  2.89it/s]


Epoch 18/20
Training Loss: 0.0232
Training Accuracy: 1.0000
Validation Loss: 0.0330
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 19/20: 100%|██████████| 12/12 [00:05<00:00,  2.28it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.95it/s]


Epoch 19/20
Training Loss: 0.0177
Training Accuracy: 1.0000
Validation Loss: 0.0289
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]


Training Epoch 20/20: 100%|██████████| 12/12 [00:04<00:00,  2.90it/s]
Validation: 100%|██████████| 3/3 [00:00<00:00,  3.93it/s]

Epoch 20/20
Training Loss: 0.0171
Training Accuracy: 1.0000
Validation Loss: 0.0259
Validation Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        10

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[ 5  0]
 [ 0 10]]





In [None]:
# Prediction function that takes the extracted documents and predicts their publishability
def predict_publishability(model, tokenizer, extracted_info, device, max_length=512):
    """Predict a class label for every document in ``extracted_info``.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...)``
            returning an object with ``.logits``. It is switched to
            evaluation mode.
        tokenizer: Object providing ``encode_plus``.
        extracted_info: Mapping of document name to document text.
        device: Device the encoded tensors are moved to.
        max_length: Token length each document is padded/truncated to. It
            should match the value used when training (``TextDataset`` also
            defaults to 512).

    Returns:
        List of ``(document name, predicted label)`` tuples in the iteration
        order of ``extracted_info``. The label is the index of the largest
        logit.
    """
    model.eval()  # evaluation mode
    predictions = []

    for doc_name, text in extracted_info.items():
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1).item()

        predictions.append((doc_name, preds))

    return predictions

In [None]:
# Switch to /content so that the relative "predictions.csv" written further
# down lands in the Colab runtime's local directory.
os.chdir('/content/')

In [None]:
import pandas as pd

In [None]:
def save_predictions_to_csv(predictions, filename="predictions.csv"):
    """Write (document name, prediction) pairs to a CSV file.

    Args:
        predictions: Sequence of two-item rows: document name and prediction.
        filename: Destination path of the CSV file.

    The file gets the header ``Document Name,Prediction`` and no index
    column; a confirmation line is printed afterwards.
    """
    column_names = ["Document Name", "Prediction"]
    pd.DataFrame(predictions, columns=column_names).to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [None]:
# Classify every extracted test paper. Each entry is (document name, label),
# with the training convention 1 = Publishable, 0 = Non-Publishable.
predictions = predict_publishability(model, tokenizer, test, device)
predictions

[('P062', 1),
 ('P126', 1),
 ('P068', 1),
 ('P008', 1),
 ('P064', 1),
 ('P121', 1),
 ('P018', 1),
 ('P024', 1),
 ('P116', 1),
 ('P090', 1),
 ('P041', 0),
 ('P016', 1),
 ('P038', 1),
 ('P081', 0),
 ('P132', 1),
 ('P130', 0),
 ('P078', 1),
 ('P022', 0),
 ('P026', 0),
 ('P033', 1),
 ('P125', 1),
 ('P133', 1),
 ('P028', 1),
 ('P118', 1),
 ('P048', 1),
 ('P039', 1),
 ('P069', 0),
 ('P070', 1),
 ('P077', 1),
 ('P134', 0),
 ('P047', 0),
 ('P027', 1),
 ('P129', 1),
 ('P105', 0),
 ('P043', 0),
 ('P036', 0),
 ('P032', 0),
 ('P073', 0),
 ('P097', 0),
 ('P086', 0),
 ('P094', 0),
 ('P128', 1),
 ('P104', 1),
 ('P031', 1),
 ('P102', 1),
 ('P002', 0),
 ('P080', 1),
 ('P100', 1),
 ('P053', 0),
 ('P023', 1),
 ('P056', 1),
 ('P066', 1),
 ('P096', 0),
 ('P119', 1),
 ('P075', 1),
 ('P050', 1),
 ('P074', 1),
 ('P007', 1),
 ('P045', 1),
 ('P013', 1),
 ('P060', 1),
 ('P110', 1),
 ('P087', 1),
 ('P005', 1),
 ('P067', 1),
 ('P017', 1),
 ('P029', 1),
 ('P051', 1),
 ('P071', 1),
 ('P108', 1),
 ('P123', 1),
 ('P00

In [None]:
# Persist the predictions as predictions.csv in the current working directory.
save_predictions_to_csv(predictions, filename="predictions.csv")

Predictions saved to predictions.csv
