In [109]:
import os
import re
import torch
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from fuzzywuzzy import fuzz

import bibtexparser
from bibtexparser.bparser import BibTexParser



In [2]:
saved_model_dir = "./SavedModels/"

In [3]:
train_val_dataset = pd.read_csv("./data/title_abstract_train_val_dataset.csv")
eval_dataset = pd.read_csv("./data/title_abstract_eval_dataset.csv")

In [4]:
train_val_dataset.tail(10)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
990,Guest Editorial: An End-to-End Machine Learnin...,0
991,Guest Editorial Special Issue on Privacy and S...,0
992,Guest Editorial Special Issue on Emerging Tren...,0
993,Guest Editorial Special Issue on Advanced Cogn...,0
994,Enhancing Smart Agriculture Scenarios with Low...,1
995,High Voltage Discharge Exhibits Severe Effect ...,0
996,Heterogeneous GNN-RL-Based Task Offloading for...,1
997,Optimized Data Fusion With Scheduled Rest Peri...,0
998,FarmEdge: A Unified Edge Computing Framework E...,1
999,"5G Network: Architecture, Protocols, Challenge...",0


In [5]:
eval_dataset.head(5)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
0,Influence of artificial intelligence (AI) on f...,0
1,Software engineering approaches for tinyml bas...,0
2,Artificial intelligence in practice: how 50 su...,0
3,Industry 4.0: Industrial internet of things (I...,0
4,Artificial intelligence and biological misuse:...,0


In [6]:
train_val_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Title_and_Abstract      996 non-null    object
 1   Accepted_for_Full_Text  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [7]:
eval_dataset.drop("Accepted_for_Full_Text", axis = 1, inplace=True)

In [8]:
eval_dataset

Unnamed: 0,Title_and_Abstract
0,Influence of artificial intelligence (AI) on f...
1,Software engineering approaches for tinyml bas...
2,Artificial intelligence in practice: how 50 su...
3,Industry 4.0: Industrial internet of things (I...
4,Artificial intelligence and biological misuse:...
...,...
10498,Wireless Sensor Network Based Greenhouse Monit...
10499,agroString 2.0: A Distributed-Ledger based Sma...
10500,Performance of Routing Protocol for Low-Power ...
10501,Churn-Tolerant Leader Election Protocols


In [9]:
train_data, val_data = train_test_split(train_val_dataset, test_size=0.2, random_state=42)
train_data.shape, val_data.shape

((800, 2), (200, 2))

In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [11]:
train_encodings = tokenizer(train_data['Title_and_Abstract'].astype(str).tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_data['Title_and_Abstract'].astype(str).tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(eval_dataset['Title_and_Abstract'].astype(str).tolist(), truncation=True, padding=True, max_length=512)

In [12]:
class TrainValDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



class TestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])


In [13]:
# Create the PyTorch datasets
train_dataset = TrainValDataset(train_encodings, train_data['Accepted_for_Full_Text'].tolist())
val_dataset = TrainValDataset(val_encodings, val_data['Accepted_for_Full_Text'].tolist())

test_dataset = TestDataset(test_encodings)

In [14]:
# Create DataLoaders for train, validation, and test
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [15]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Choose device available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def train_and_validate(model, train_dataloader, val_dataloader, optimizer, device, epochs=3, saved_models_path=saved_model_dir):
    """Train and validate the model.
    
    Args:
        model (torch.nn.Module): The model to train and validate.
        train_dataloader (DataLoader): DataLoader for training data.
        val_dataloader (DataLoader): DataLoader for validation data.
        optimizer (torch.optim.Optimizer): Optimizer for the model.
        device (torch.device): Device to run the model computation.
        epochs (int): Number of epochs to train the model.
        saved_models_path (str): Directory path where the model will be saved.
    """
    model = model.to(device)
    best_val_accuracy = 0

    # Get the model name from its class
    model_name = model.__class__.__name__

    total_steps = len(train_dataloader) * epochs
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    for epoch in range(epochs):
        # Training Phase
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Validation Phase
        model.eval()
        total_val_loss = 0
        total_correct = 0
        total_examples = 0

        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Validation"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_val_loss += loss.item()

                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)
                total_correct += (predictions == batch['labels']).sum().item()
                total_examples += batch['labels'].size(0)

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = total_correct / total_examples
        print(f"Average validation loss: {avg_val_loss:.4f}")
        print(f"Validation accuracy: {val_accuracy:.4f}")

        # Save the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Construct the filename for saving the model
            save_path = os.path.join(saved_models_path, f"{model_name}_best_model.pth")
            torch.save(model.state_dict(), save_path)
            print(f"Saved improved model at {save_path}")

In [17]:
train_and_validate(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    device=device,
    epochs=10
)

Epoch 1/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.07it/s]


Average training loss: 0.4820


Epoch 1/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.32it/s]


Average validation loss: 0.3848
Validation accuracy: 0.8050
Saved improved model at ./SavedModels/DistilBertForSequenceClassification_best_model.pth


Epoch 2/10 - Training: 100%|██████████| 50/50 [00:23<00:00,  2.08it/s]


Average training loss: 0.2898


Epoch 2/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.25it/s]


Average validation loss: 0.4318
Validation accuracy: 0.7950


Epoch 3/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.06it/s]


Average training loss: 0.1741


Epoch 3/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.20it/s]


Average validation loss: 0.4371
Validation accuracy: 0.8400
Saved improved model at ./SavedModels/DistilBertForSequenceClassification_best_model.pth


Epoch 4/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]


Average training loss: 0.0939


Epoch 4/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.14it/s]


Average validation loss: 0.4127
Validation accuracy: 0.8700
Saved improved model at ./SavedModels/DistilBertForSequenceClassification_best_model.pth


Epoch 5/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.04it/s]


Average training loss: 0.0517


Epoch 5/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.17it/s]


Average validation loss: 0.5200
Validation accuracy: 0.8150


Epoch 6/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.04it/s]


Average training loss: 0.0217


Epoch 6/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.12it/s]


Average validation loss: 0.5418
Validation accuracy: 0.8550


Epoch 7/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0195


Epoch 7/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.11it/s]


Average validation loss: 0.6604
Validation accuracy: 0.8450


Epoch 8/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0135


Epoch 8/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.15it/s]


Average validation loss: 0.7645
Validation accuracy: 0.8350


Epoch 9/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0086


Epoch 9/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.15it/s]


Average validation loss: 0.6567
Validation accuracy: 0.8550


Epoch 10/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0094


Epoch 10/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.12it/s]

Average validation loss: 0.6374
Validation accuracy: 0.8500





In [127]:
def classify_papers(model, dataloader, device, tokenizer):
    """Classify papers using the trained model and return a DataFrame with texts and their classifications."""
    model.eval()
    predictions = []
    texts = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            batch_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in batch['input_ids']]
            texts.extend(batch_texts)
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    # Create DataFrame
    results_df = pd.DataFrame({
        'Title_and_Abstract': texts,
        'Accepted_for_Full_Text': predictions
    })

    return results_df

In [128]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # Ensure this matches the setup during training
    output_attentions=False,
    output_hidden_states=False,
)

# Step 2: Load the saved model weights
model_path = saved_model_dir+'DistilBertForSequenceClassification_best_model.pth'
model.load_state_dict(torch.load(model_path))

model.to(device)
model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [129]:
results_df = classify_papers(model, test_dataloader, device, tokenizer)

In [130]:
results_df.tail(10)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
10493,fog based integrated nutrient management syste...,0
10494,an automated irrigation system for agriculture...,1
10495,using a compositional function hybridization o...,0
10496,a study on secure network slicing in 5g,0
10497,advancing cattle welfare : ultra low - power h...,1
10498,wireless sensor network based greenhouse monit...,0
10499,agrostring 2. 0 : a distributed - ledger based...,0
10500,performance of routing protocol for low - powe...,0
10501,churn - tolerant leader election protocols,0
10502,holistic technologies for managing internet of...,0


In [131]:
relevant_df = results_df[results_df['Accepted_for_Full_Text'] == 1]
not_relevant_df = results_df[results_df['Accepted_for_Full_Text'] == 0]

In [132]:
relevant_df.shape, not_relevant_df.shape

((2261, 2), (8242, 2))

In [133]:
relevant_df['Title_and_Abstract']

12       novel method for crop growth tracking with dee...
21       aiot in agriculture : safeguarding crops from ...
30       conceptualizing a holistic smart dairy farming...
41       plant disease detection : electronic system de...
42       improving deep learning classifiers performanc...
                               ...                        
10484    temperature and humidity control algorithm for...
10491                       iot based precision agri - bot
10492    a fog - based smart agriculture system to dete...
10494    an automated irrigation system for agriculture...
10497    advancing cattle welfare : ultra low - power h...
Name: Title_and_Abstract, Length: 2261, dtype: object

In [134]:
print(relevant_df['Title_and_Abstract'][41])

plant disease detection : electronic system design empowered with artificial intelligence … of diseases in agriculture. with the continuous development of deep learning from one hand, … a smart crop growth monitoring using edge artificial intelligence ( ai ) was developed in [ 7 ] …


In [135]:
def extract_title(text):
    """Extracts and normalizes the title from a given text with ellipsis handling."""
    # Handling different types of ellipsis and spacing issues
    # Normalizing different ellipsis formats to a standard one
    normalized_text = text.replace(' ... ', '...').replace('…', '...')
    
    # Splitting the text at the first occurrence of '...'
    title_part = normalized_text.split('...')[0].strip()
    
    # Normalize the extracted title part
    normalized_title = normalize_title(title_part)
    return normalized_title


def extract_titles(df):
    """Extract titles from text entries in the DataFrame using the updated function."""
    df['Title'] = df['Title_and_Abstract'].apply(extract_title)
    return df

def parse_bib_file_to_dict(file_path):
    with open(file_path, encoding='utf-8') as bibtex_file:
        parser = BibTexParser(common_strings=True)
        bib_database = bibtexparser.load(bibtex_file, parser=parser)
    return bib_database.entries


def normalize_title(title):
    """Normalize a title by removing extra spaces, converting to lowercase, and cleaning special characters."""
    title = re.sub(r'\s+', ' ', title)  # Replace multiple whitespaces with single space
    title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
    return title.lower().strip()



def extract_matching_bib_entries(predicted_titles, original_bib_entries):
    normalized_predicted_titles = [normalize_title(title) for title in predicted_titles]
    matching_entries = []

    for entry in original_bib_entries:
        normalized_bib_title = normalize_title(entry.get('title', ''))
        if normalized_bib_title in normalized_predicted_titles:
            matching_entries.append(entry)

    return matching_entries

In [136]:
extract_titles(relevant_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title_and_Abstract'].apply(extract_title)


Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text,Title
12,novel method for crop growth tracking with dee...,1,novel method for crop growth tracking with dee...
21,aiot in agriculture : safeguarding crops from ...,1,aiot in agriculture safeguarding crops from p...
30,conceptualizing a holistic smart dairy farming...,1,conceptualizing a holistic smart dairy farming...
41,plant disease detection : electronic system de...,1,plant disease detection electronic system des...
42,improving deep learning classifiers performanc...,1,improving deep learning classifiers performanc...
...,...,...,...
10484,temperature and humidity control algorithm for...,1,temperature and humidity control algorithm for...
10491,iot based precision agri - bot,1,iot based precision agri bot
10492,a fog - based smart agriculture system to dete...,1,a fog based smart agriculture system to detec...
10494,an automated irrigation system for agriculture...,1,an automated irrigation system for agriculture...


In [137]:
extract_titles(relevant_df)['Title_and_Abstract']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title_and_Abstract'].apply(extract_title)


12       novel method for crop growth tracking with dee...
21       aiot in agriculture : safeguarding crops from ...
30       conceptualizing a holistic smart dairy farming...
41       plant disease detection : electronic system de...
42       improving deep learning classifiers performanc...
                               ...                        
10484    temperature and humidity control algorithm for...
10491                       iot based precision agri - bot
10492    a fog - based smart agriculture system to dete...
10494    an automated irrigation system for agriculture...
10497    advancing cattle welfare : ultra low - power h...
Name: Title_and_Abstract, Length: 2261, dtype: object

In [160]:
extract_titles(relevant_df)['Title'].tolist()[:20]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title_and_Abstract'].apply(extract_title)


['novel method for crop growth tracking with deep learning model on an edge rail camera',
 'aiot in agriculture  safeguarding crops from pest and disease threats',
 'conceptualizing a holistic smart dairy farming system',
 'plant disease detection  electronic system design empowered with artificial intelligence',
 'improving deep learning classifiers performance via preprocessing and class imbalance approaches in a plant disease detection pipeline',
 'real  time droplet detection for agricultural spraying systems  a deep learning approach',
 'paddy crop disease detection using deep learning techniques',
 'detecting volunteer cotton plants in a corn field with deep learning on uav remote  sensing imagery',
 'toxaicology  the evolving role of artificial intelligence in advancing toxicology and modernizing regulatory science',
 'cottonsense  a high  throughput field phenotyping system for cotton fruit segmentation and enumeration on edge devices',
 'design iot  based smart agriculture to 

In [139]:
main_dir = "/home/jovyan/work/Projects/Edge-AI-Research-Title-Abstract-Screening/data/"
original_bib_file_dir = main_dir + "Papers_after_duplicates.bib"

In [140]:
original_bib_entries = parse_bib_file_to_dict(original_bib_file_dir)

In [141]:

# Example usage
predicted_titles = extract_titles(relevant_df)['Title'].tolist()
matching_bib_entries = extract_matching_bib_entries(predicted_titles, original_bib_entries)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title_and_Abstract'].apply(extract_title)


In [143]:
normalize_title('real - time droplet detection for agricultural spraying systems : a deep learning approach')

'real  time droplet detection for agricultural spraying systems  a deep learning approach'

In [159]:
predicted_titles[-1]

'advancing cattle welfare  ultra low  power health monitoring at the edge'

In [148]:
len(matching_bib_entries), len(predicted_titles)

(1273, 2261)

In [150]:
def extract_matching_and_unmatched_bib_entries(predicted_titles, original_bib_entries):
    matched_entries = []
    unmatched_entries = []

    # Normalize predicted titles for accurate matching
    normalized_predicted_titles = set([normalize_title(title) for title in predicted_titles])

    # Track matched titles to identify unmatched ones later
    matched_titles = set()

    for entry in original_bib_entries:
        normalized_bib_title = normalize_title(entry.get('title', ''))
        if normalized_bib_title in normalized_predicted_titles:
            matched_entries.append(entry)
            matched_titles.add(normalized_bib_title)
        else:
            unmatched_entries.append(entry)

    # Optionally, you can further identify unmatched predicted titles
    unmatched_predicted_titles = normalized_predicted_titles - matched_titles

    return matched_entries, unmatched_entries, list(unmatched_predicted_titles)

# Example usage:
matched_entries, unmatched_entries, unmatched_predicted_titles = extract_matching_and_unmatched_bib_entries(predicted_titles, original_bib_entries)

# Save unmatched entries to a new .bib file if needed
# save_bib_file(unmatched_entries, 'unmatched_entries.bib')


In [155]:
unmatched_predicted_titles[:10]

['',
 'technology based streamlined agro farming techniques',
 'deep learning based weed detection using uav images a comparative study',
 'minute wise frost prediction an approach of recurrent neural networks frost events incur substantial economic losses to farmers these events could induce damage to plants and crops by damaging the cells in this article a recurrent neural network based method automating the frost prediction process is proposed the recurrent neural network based models leveraged in this article include the standard recurrent neural network long short term memory and gated recurrent unit the proposed method aims to increase the prediction frequency from once per 12 24 h for the next day or night events to minute wise predictions for the next hour events to achieve this goal datasets from nsw and act of australia are obtained the experiments are designed considering the scene of deploying the model to the internet of things systems factors such as model processing spee

In [161]:
matched_entries[:10]

[{'annote': 'Query date: 2024-03-27 10:39:16',
  'note': 'Publisher: papers.ssrn.com',
  'author': 'Kum, S. W. and Moon, J. and Oh, S. and Suh, H. K. and Park, H. and {...}',
  'journal': 'Available at SSRN …',
  'abstract': '… With the advent of deep learning technology, recent studies are focusing on crop … agriculture is keep increasing, from conventional image processing to cutting-edge deep learning …',
  'url': 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4583976',
  'title': 'Novel {Method} for {Crop} {Growth} {Tracking} with {Deep} {Learning} {Model} on an {Edge} {Rail} {Camera}',
  'ENTRYTYPE': 'article',
  'ID': 'kum_novel_nodate'},
 {'annote': 'Cited by: 0; All Open Access, Gold Open Access, Green Open Access',
  'note': 'Publisher: mdpi.com\nType: HTML',
  'year': '2023',
  'author': 'Blanco-Carmona, P. and Baeza-Moreno, L. and Hidalgo-Fort, E. and {...}',
  'journal': 'Sensors',
  'abstract': '… machine learning algorithm (Figure 4): Once a month, the server uses t

In [170]:
# from thefuzz import fuzz

# def fuzzy_match_titles(predicted_title, bib_titles):
#     matches = []
#     for bib_title in bib_titles:
#         # Calculate the similarity score using token set ratio
#         score = fuzz.token_set_ratio(predicted_title, bib_title)
#         if score > 75:  # You can adjust the threshold based on trial and error
#             matches.append(bib_title)
#     return matches

In [171]:
fuzzy_matches = fuzzy_match_titles(predicted_titles, original_bib_entries)

In [172]:
len(fuzzy_matches)

5300

In [173]:
fuzzy_matches

[{'annote': 'Query date: 2024-03-27 10:39:16',
  'note': 'Publisher: igi-global.com',
  'year': '2024',
  'author': 'Babu, C. V. S. and Saltonya, M. S. and Ganapathi, S. and {...}',
  'journal': 'Artificial Intelligence of …',
  'abstract': '… Machine learning finds applications across various domains … In agriculture, for instance, machine learning algorithms … AIoT continues to evolve, with emerging trends like edge AI, …',
  'url': 'https://www.igi-global.com/chapter/aiot-revolution/341887',
  'title': '{AIoT} {Revolution}: {Transforming} {Networking} {Productivity} for the {Digital} {Age}',
  'ENTRYTYPE': 'article',
  'ID': 'babu_aiot_2024'},
 {'annote': 'Query date: 2024-03-27 10:39:16',
  'year': '2024',
  'author': 'Jamil, A.',
  'publisher': 'gnosis.library.ucy.ac.cy',
  'abstract': '… in optimizing neural networks, particularly in the dynamic context of Edge-IoT devices. DRL combines deep learning with reinforcement learning, enabling an agent to make decisions …',
  'url': 'h