In [1]:
import os
import re
import torch
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import bibtexparser
from bibtexparser.bparser import BibTexParser

In [2]:
# Directory containing the fine-tuned checkpoint; combined with the weights filename when the model is loaded.
saved_model_dir = "./SavedModels/"

In [3]:
# Tokenizer of the 'distilbert-base-uncased' checkpoint; used to encode the evaluation texts and to decode token ids back to strings.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [4]:
# Load the evaluation CSV; its 'Title_and_Abstract' column holds the text to classify.
eval_dataset = pd.read_csv("./data/title_abstract_eval_dataset.csv")

# Encode every row in one call: values are cast to str first, sequences are cut at
# 512 tokens and padded to a common length.
test_encodings = tokenizer(
    eval_dataset['Title_and_Abstract'].astype(str).tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)

In [5]:
class TestDataset(torch.utils.data.Dataset):
    """Label-free, map-style dataset over a mapping of tokenizer encodings.

    ``encodings`` maps a field name to a per-sample sequence; it must contain
    'input_ids', whose length defines the size of the dataset.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One sample per row of 'input_ids'.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Take row ``idx`` of every field and wrap it in a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample


In [6]:
# Wrap the evaluation encodings so they can be served by a DataLoader.
test_dataset = TestDataset(test_encodings)

In [7]:
# Create the DataLoader for the evaluation set; shuffle=False keeps batches in dataset order.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [8]:
# Instantiate DistilBERT with a 2-label sequence-classification head from the base checkpoint.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Choose device available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): no cell visible in this notebook uses `optimizer`, and `model` is re-created
# before the saved weights are loaded — confirm whether this setup is still needed here.
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def classify_papers(model, dataloader, device, tokenizer):
    """Run inference over ``dataloader`` and tabulate the predicted class for each text.

    Args:
        model: Called as ``model(**inputs)``; the result must expose ``.logits``.
            This function does not move the model to ``device``.
        dataloader: Iterable of dict batches containing at least 'input_ids'.
            A 'labels' entry, if present, is not passed to the model.
        device: Device the batch tensors are moved to before the forward pass.
        tokenizer: Only its ``decode`` method is used, to turn each row of
            'input_ids' back into text.

    Returns:
        pandas.DataFrame with columns 'Title_and_Abstract' (decoded text) and
        'Accepted_for_Full_Text' (argmax over the logits), one row per sample
        in iteration order.
    """
    model.eval()
    decoded_texts, predicted_classes = [], []

    with torch.no_grad():
        for batch in dataloader:
            # The text column is rebuilt from token ids, so it holds the tokenizer's
            # decoded form of each input rather than the raw source string.
            for token_ids in batch['input_ids']:
                decoded_texts.append(tokenizer.decode(token_ids, skip_special_tokens=True))

            model_inputs = {}
            for name, tensor in batch.items():
                if name == 'labels':
                    continue
                model_inputs[name] = tensor.to(device)

            batch_classes = torch.argmax(model(**model_inputs).logits, dim=1)
            predicted_classes.extend(batch_classes.cpu().numpy())

    # Create DataFrame
    return pd.DataFrame({
        'Title_and_Abstract': decoded_texts,
        'Accepted_for_Full_Text': predicted_classes
    })

## Set Seed for Reproducibility

In [10]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs with ``seed`` and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` into ``os.environ``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator: stdlib, NumPy, torch CPU, current CUDA device,
    # and all CUDA devices (multi-GPU).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels, with the benchmark autotuner switched off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [11]:
set_seed()

# Step 1: Build the model that the saved weights will be loaded into
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # Ensure this matches the setup during training
    output_attentions=False,
    output_hidden_states=False,
)

# Step 2: Load the saved model weights
model_path = os.path.join(saved_model_dir, 'DistilBertForSequenceClassification_best_model.pth')
# map_location remaps tensors that were saved on another device (e.g. a GPU) onto the
# device selected earlier, so the checkpoint also loads when `device` fell back to CPU.
model.load_state_dict(torch.load(model_path, map_location=device))

# Step 3: Move to the inference device and switch to evaluation mode
model.to(device)
model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
# Predict a class for every evaluation sample; returns the decoded text next to its predicted label.
results_df = classify_papers(model, test_dataloader, device, tokenizer)

In [13]:
# Show the last 10 predictions as a quick sanity check.
results_df.tail(10)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
10493,fog based integrated nutrient management syste...,0
10494,an automated irrigation system for agriculture...,1
10495,using a compositional function hybridization o...,1
10496,a study on secure network slicing in 5g,0
10497,advancing cattle welfare : ultra low - power h...,1
10498,wireless sensor network based greenhouse monit...,0
10499,agrostring 2. 0 : a distributed - ledger based...,0
10500,performance of routing protocol for low - powe...,0
10501,churn - tolerant leader election protocols,0
10502,holistic technologies for managing internet of...,0


In [14]:
# Split the predictions by predicted label: 1 -> relevant_df, 0 -> not_relevant_df.
relevant_df = results_df[results_df['Accepted_for_Full_Text'] == 1]
not_relevant_df = results_df[results_df['Accepted_for_Full_Text'] == 0]

In [15]:
# (rows, columns) of each split.
relevant_df.shape, not_relevant_df.shape

((2967, 2), (7536, 2))

In [16]:
# Display the rows predicted as label 1.
relevant_df

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
12,novel method for crop growth tracking with dee...,1
21,aiot in agriculture : safeguarding crops from ...,1
23,artificial intelligence in the development of ...,1
29,machine learning ( ml ) algorithms on iot and ...,1
30,conceptualizing a holistic smart dairy farming...,1
...,...,...
10491,iot based precision agri - bot,1
10492,a fog - based smart agriculture system to dete...,1
10494,an automated irrigation system for agriculture...,1
10495,using a compositional function hybridization o...,1


In [21]:
def parse_bib_file_to_dict(file_path):
    """Load a BibTeX file and pair every entry with its cleaned "title abstract" text.

    Args:
        file_path: Path of the BibTeX file; it is opened as UTF-8.

    Returns:
        A list with one dict per entry, in the order of ``bib_database.entries``:
        'combined_text' is the cleaned title and abstract joined by a single space
        (then stripped), 'full_entry' is the parsed entry itself. A missing title
        or abstract is treated as an empty string.
    """
    def _clean(field_value):
        # Drop BibTeX braces, turn newlines into spaces, and trim both ends.
        for old, new in (('{', ''), ('}', ''), ('\n', ' ')):
            field_value = field_value.replace(old, new)
        return field_value.strip()

    with open(file_path, encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(
            bibtex_file,
            parser=BibTexParser(common_strings=True),
        )

    return [
        {
            'combined_text': f"{_clean(entry.get('title', ''))} {_clean(entry.get('abstract', ''))}".strip(),
            'full_entry': entry,  # Store the whole entry for later use
        }
        for entry in bib_database.entries
    ]

def normalize_and_process_text(texts, tokenizer):
    """Round-trip ``texts`` through ``tokenizer`` (encode, then decode) and return the decoded strings.

    Args:
        texts: Texts passed to ``tokenizer`` in a single call.
        tokenizer: Callable returning an object with ``input_ids``; it must also
            provide ``decode``.

    Returns:
        A list with one decoded string per row of ``input_ids``, with special
        tokens skipped.
    """
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    round_tripped = []
    for token_ids in encoded.input_ids:
        round_tripped.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return round_tripped

def find_matches(predicted_texts, processed_extracted_texts, full_entries):
    """Look up the full BibTeX entry behind each predicted text.

    Args:
        predicted_texts: Texts to look up.
        processed_extracted_texts: Candidate texts, positionally aligned with
            ``full_entries``.
        full_entries: List of dicts, each with a 'full_entry' key.

    Returns:
        ``(matched_entries, unmatched_texts)``: the 'full_entry' for every predicted
        text found among the candidates (the first candidate wins when a text occurs
        more than once), and the predicted texts that were not found. Both lists keep
        the order of ``predicted_texts``.
    """
    # Index each text by its FIRST position (the same answer list.index gives), so that
    # every lookup is a dict access instead of a scan over the whole candidate list.
    first_index_by_text = {}
    for index, text in enumerate(processed_extracted_texts):
        first_index_by_text.setdefault(text, index)

    matched_entries = []
    unmatched_texts = []

    for pred_text in predicted_texts:
        index = first_index_by_text.get(pred_text)
        if index is None:
            unmatched_texts.append(pred_text)
        else:
            matched_entries.append(full_entries[index]['full_entry'])

    return matched_entries, unmatched_texts


def entry_to_bibtex(entry):
    """Render one entry dict as BibTeX text.

    Args:
        entry: Mapping with 'ENTRYTYPE' and 'ID' keys; every other key/value pair
            is written as a ``key = {value}`` field, in the mapping's iteration order.

    Returns:
        The entry as a string: a header line, one line per field, and a closing "}" line.
    """
    header = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n"
    # The type and ID already appear in the header, so they are not repeated as fields.
    field_lines = [
        f"  {key} = {{{value}}},\n"
        for key, value in entry.items()
        if key != 'ENTRYTYPE' and key != 'ID'
    ]
    return header + "".join(field_lines) + "}\n"


def save_matched_entries_to_bib(matched_entries, filename):
    """Write every entry in ``matched_entries`` to ``filename`` as BibTeX.

    The file is opened in write mode as UTF-8, so any existing content is replaced.
    Entries are written back to back in the order given.
    """
    with open(filename, 'w', encoding='utf-8') as bib_file:
        bib_file.writelines(entry_to_bibtex(entry) for entry in matched_entries)


## Getting the Relevant Papers After the Model Prediction

In [22]:
# Path of the BibTeX file whose entries are matched against the model predictions.
original_bib_file_dir = "data/Papers_after_duplicates.bib"
# NOTE(review): a later cell parses and tokenizes this same file again into `full_entries` /
# `processed_full_texts`, and the visible `find_matches` calls use those rather than the two
# names below — confirm whether this cell's work is still needed.
extracted_texts = [entry['combined_text'] for entry in parse_bib_file_to_dict(original_bib_file_dir)]
processed_extracted_texts = normalize_and_process_text(extracted_texts, tokenizer)

In [23]:
# Decoded texts of the rows predicted as label 1, as plain strings.
rl_predicted_titles_and_abstracts = relevant_df['Title_and_Abstract'].astype(str).tolist()

In [None]:
# Parsed entries (combined text + original BibTeX entry) from the source .bib file.
full_entries = parse_bib_file_to_dict(original_bib_file_dir)

# Round-trip the combined texts through the tokenizer so they take the same decoded form
# as the texts produced by classify_papers; the list stays aligned with `full_entries`.
processed_full_texts = normalize_and_process_text(
    [entry['combined_text'] for entry in full_entries],
    tokenizer,
)

In [None]:
# Recover the full BibTeX entry for each text predicted as label 1; texts with no exact match are collected separately.
rl_matched_full_entries, rl_unmatched_predicted_texts = find_matches(rl_predicted_titles_and_abstracts, processed_full_texts, full_entries)

In [None]:
# Preview the first five matched entries.
rl_matched_full_entries[:5]

In [None]:
# Write the matched entries for the label-1 predictions to a .bib file (the file is overwritten).
save_matched_entries_to_bib(rl_matched_full_entries, 'relevant_papers_title_and_abstract_screened.bib')

## Getting the Irrelevant Papers After the Model Prediction

In [None]:
# Decoded texts of the rows predicted as label 0, as plain strings.
# The label-0 frame is named `not_relevant_df` where it is created; the previous
# `non_relevant_df` did not exist and raised NameError.
nrl_predicted_titles_and_abstracts = not_relevant_df['Title_and_Abstract'].astype(str).tolist()

In [None]:
# Recover the full BibTeX entry for each text predicted as label 0; texts with no exact match are collected separately.
nrl_matched_full_entries, nrl_unmatched_predicted_texts = find_matches(nrl_predicted_titles_and_abstracts, processed_full_texts, full_entries)

In [None]:
# Write the matched entries for the label-0 predictions to their own .bib file.
# This cell previously re-saved `rl_matched_full_entries` to the relevant-papers file,
# so the irrelevant matches were never written out.
save_matched_entries_to_bib(nrl_matched_full_entries, 'irrelevant_papers_title_and_abstract_screened.bib')