In [109]:
import os
import re
import torch
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from fuzzywuzzy import fuzz

import bibtexparser
from bibtexparser.bparser import BibTexParser



In [2]:
# Directory for model checkpoints: the default save location of train_and_validate,
# and the location the best checkpoint is read back from before inference.
saved_model_dir = "./SavedModels/"

In [3]:
# train_val_dataset: labelled rows that are later split into training and validation sets.
# eval_dataset: the pool of papers the fine-tuned model is applied to.
train_val_dataset = pd.read_csv("./data/title_abstract_train_val_dataset.csv")
eval_dataset = pd.read_csv("./data/title_abstract_eval_dataset.csv")

In [4]:
train_val_dataset.tail(10)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
990,Guest Editorial: An End-to-End Machine Learnin...,0
991,Guest Editorial Special Issue on Privacy and S...,0
992,Guest Editorial Special Issue on Emerging Tren...,0
993,Guest Editorial Special Issue on Advanced Cogn...,0
994,Enhancing Smart Agriculture Scenarios with Low...,1
995,High Voltage Discharge Exhibits Severe Effect ...,0
996,Heterogeneous GNN-RL-Based Task Offloading for...,1
997,Optimized Data Fusion With Scheduled Rest Peri...,0
998,FarmEdge: A Unified Edge Computing Framework E...,1
999,"5G Network: Architecture, Protocols, Challenge...",0


In [5]:
eval_dataset.head(5)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
0,Influence of artificial intelligence (AI) on f...,0
1,Software engineering approaches for tinyml bas...,0
2,Artificial intelligence in practice: how 50 su...,0
3,Industry 4.0: Industrial internet of things (I...,0
4,Artificial intelligence and biological misuse:...,0


In [6]:
train_val_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Title_and_Abstract      996 non-null    object
 1   Accepted_for_Full_Text  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [7]:
# Remove the label column from the evaluation pool; only the text is used for prediction.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell idempotent:
# running it a second time no longer raises KeyError for the already-dropped column.
eval_dataset = eval_dataset.drop(columns="Accepted_for_Full_Text", errors="ignore")

In [8]:
eval_dataset

Unnamed: 0,Title_and_Abstract
0,Influence of artificial intelligence (AI) on f...
1,Software engineering approaches for tinyml bas...
2,Artificial intelligence in practice: how 50 su...
3,Industry 4.0: Industrial internet of things (I...
4,Artificial intelligence and biological misuse:...
...,...
10498,Wireless Sensor Network Based Greenhouse Monit...
10499,agroString 2.0: A Distributed-Ledger based Sma...
10500,Performance of Routing Protocol for Low-Power ...
10501,Churn-Tolerant Leader Election Protocols


In [9]:
# Hold out 20% of the labelled rows for validation; random_state fixes the shuffle so the
# split is reproducible. No `stratify` argument is passed, so the label ratio of the two
# parts is not explicitly balanced.
train_data, val_data = train_test_split(train_val_dataset, test_size=0.2, random_state=42)
train_data.shape, val_data.shape

((800, 2), (200, 2))

In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [11]:
# Tokenize every split the same way: truncate to at most 512 tokens and pad each split
# to the length of its longest sequence.
# Rows with a missing text are NaN after read_csv; fillna("") stops astype(str) from
# turning them into the literal word "nan", which would be tokenized as real content.
train_encodings = tokenizer(train_data['Title_and_Abstract'].fillna("").astype(str).tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_data['Title_and_Abstract'].fillna("").astype(str).tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(eval_dataset['Title_and_Abstract'].fillna("").astype(str).tolist(), truncation=True, padding=True, max_length=512)

In [12]:
class TrainValDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()``); ``labels`` is an indexable sequence with
    one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one tensor per encoding field, then attach the label under 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample



class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings only (no labels).

    ``encodings`` is a mapping from field name to a per-example sequence and
    must contain an ``'input_ids'`` entry, which determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested example.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample


In [13]:
# Create the PyTorch datasets.
# Train and validation pair each encoding with its 'Accepted_for_Full_Text' label,
# taken in the same row order that was used to tokenize the corresponding split.
train_dataset = TrainValDataset(train_encodings, train_data['Accepted_for_Full_Text'].tolist())
val_dataset = TrainValDataset(val_encodings, val_data['Accepted_for_Full_Text'].tolist())

# The evaluation pool carries no labels, so it only wraps the encodings.
test_dataset = TestDataset(test_encodings)

In [14]:
# Create DataLoaders for train, validation, and test (batches of 16).
# Only the training loader shuffles; validation and test keep dataset order, so
# test predictions come back in the same order as the rows of eval_dataset.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [15]:
# Pretrained DistilBERT with a 2-label sequence-classification head. The head weights are
# newly initialized (see the warning printed below), so the model must be fine-tuned.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Choose device available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The optimizer holds references to the model parameters; train_and_validate moves the
# model to `device` afterwards.
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def train_and_validate(model, train_dataloader, val_dataloader, optimizer, device, epochs=3, saved_models_path=saved_model_dir):
    """Fine-tune the model and checkpoint the weights from its best validation epoch.

    Each epoch runs one pass over ``train_dataloader`` with a linearly decaying
    learning rate (no warm-up), then evaluates on ``val_dataloader``. Whenever the
    validation accuracy improves on the best value seen so far, the model's
    ``state_dict`` is written to
    ``<saved_models_path>/<ModelClassName>_best_model.pth``, overwriting any
    earlier checkpoint with that name.

    Args:
        model (torch.nn.Module): The model to train and validate. It is moved to
            ``device``; calling it on a batch must return an object exposing
            ``.loss`` and ``.logits``.
        train_dataloader (DataLoader): DataLoader for training data. Batches are
            dicts of tensors that include a ``'labels'`` entry.
        val_dataloader (DataLoader): DataLoader for validation data, same batch
            format as ``train_dataloader``.
        optimizer (torch.optim.Optimizer): Optimizer for the model.
        device (torch.device): Device to run the model computation.
        epochs (int): Number of epochs to train the model.
        saved_models_path (str): Directory path where the model will be saved.
            It is created if it does not exist yet.

    Returns:
        None. Losses and accuracy are printed per epoch and the best weights are
        saved to disk.
    """
    model = model.to(device)
    # Start below every reachable accuracy so the first epoch always writes a
    # checkpoint; later cells that reload the best model can rely on the file existing.
    best_val_accuracy = float("-inf")

    # The checkpoint file is named after the model's class.
    model_name = model.__class__.__name__
    save_path = os.path.join(saved_models_path, f"{model_name}_best_model.pth")

    # torch.save does not create missing parent directories, so make sure the
    # target directory exists before the first checkpoint is written.
    if saved_models_path:
        os.makedirs(saved_models_path, exist_ok=True)

    # One scheduler step per optimizer step, decaying linearly to zero over the whole run.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    for epoch in range(epochs):
        # Training Phase
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Validation Phase
        model.eval()
        total_val_loss = 0
        total_correct = 0
        total_examples = 0

        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Validation"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_val_loss += loss.item()

                # Predicted class = index of the largest logit per example.
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)
                total_correct += (predictions == batch['labels']).sum().item()
                total_examples += batch['labels'].size(0)

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = total_correct / total_examples
        print(f"Average validation loss: {avg_val_loss:.4f}")
        print(f"Validation accuracy: {val_accuracy:.4f}")

        # Save the best model (selection is by validation accuracy, not loss)
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)
            print(f"Saved improved model at {save_path}")

In [17]:
# Fine-tune for 10 epochs (overriding the function default of 3); checkpoints go to the
# default saved_models_path. In the log below the last "Saved improved model" line is
# printed in epoch 4 (validation accuracy 0.8700), while validation loss trends upward
# in later epochs.
train_and_validate(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    device=device,
    epochs=10
)

Epoch 1/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.07it/s]


Average training loss: 0.4820


Epoch 1/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.32it/s]


Average validation loss: 0.3848
Validation accuracy: 0.8050
Saved improved model at ./SavedModels/DistilBertForSequenceClassification_best_model.pth


Epoch 2/10 - Training: 100%|██████████| 50/50 [00:23<00:00,  2.08it/s]


Average training loss: 0.2898


Epoch 2/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.25it/s]


Average validation loss: 0.4318
Validation accuracy: 0.7950


Epoch 3/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.06it/s]


Average training loss: 0.1741


Epoch 3/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.20it/s]


Average validation loss: 0.4371
Validation accuracy: 0.8400
Saved improved model at ./SavedModels/DistilBertForSequenceClassification_best_model.pth


Epoch 4/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]


Average training loss: 0.0939


Epoch 4/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.14it/s]


Average validation loss: 0.4127
Validation accuracy: 0.8700
Saved improved model at ./SavedModels/DistilBertForSequenceClassification_best_model.pth


Epoch 5/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.04it/s]


Average training loss: 0.0517


Epoch 5/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.17it/s]


Average validation loss: 0.5200
Validation accuracy: 0.8150


Epoch 6/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.04it/s]


Average training loss: 0.0217


Epoch 6/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.12it/s]


Average validation loss: 0.5418
Validation accuracy: 0.8550


Epoch 7/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0195


Epoch 7/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.11it/s]


Average validation loss: 0.6604
Validation accuracy: 0.8450


Epoch 8/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0135


Epoch 8/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.15it/s]


Average validation loss: 0.7645
Validation accuracy: 0.8350


Epoch 9/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0086


Epoch 9/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.15it/s]


Average validation loss: 0.6567
Validation accuracy: 0.8550


Epoch 10/10 - Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]


Average training loss: 0.0094


Epoch 10/10 - Validation: 100%|██████████| 13/13 [00:02<00:00,  6.12it/s]

Average validation loss: 0.6374
Validation accuracy: 0.8500





In [127]:
def classify_papers(model, dataloader, device, tokenizer, texts=None):
    """Classify papers using the trained model and return a DataFrame with texts and their classifications.

    Args:
        model: Trained classifier. Calling it with a batch's tensors must return an
            object exposing ``.logits``. It is switched to eval mode but is not
            moved to ``device`` here; the caller is expected to have done that.
        dataloader: Iterable of batches, each a dict of tensors containing at least
            ``'input_ids'``. A ``'labels'`` entry, if present, is not passed to the model.
        device (torch.device): Device the batch tensors are moved to.
        tokenizer: Only used when ``texts`` is None, to decode ``input_ids`` back
            into strings via ``tokenizer.decode(..., skip_special_tokens=True)``.
        texts (sequence of str, optional): The original input strings, in the same
            order in which ``dataloader`` yields its examples (so the loader must
            not shuffle). When given, these are stored in the result instead of
            the decoded token ids. Decoded text is a lossy reconstruction of the
            input (the notebook output shows it lowercased and with spaces inserted
            around punctuation, and inputs were truncated at tokenization time), so
            it does not compare equal to the source text.

    Returns:
        pandas.DataFrame: columns ``'Title_and_Abstract'`` (text) and
        ``'Accepted_for_Full_Text'`` (predicted class index), one row per example.

    Raises:
        ValueError: If ``texts`` is given and its length differs from the number
            of predictions produced.
    """
    model.eval()
    predictions = []
    decoded_texts = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            if texts is None:
                # Fall back to reconstructing the text from the token ids.
                decoded_texts.extend(
                    tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['input_ids']
                )
            logits = model(**inputs).logits
            # Predicted class = index of the largest logit per example.
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

    if texts is None:
        output_texts = decoded_texts
    else:
        output_texts = list(texts)
        if len(output_texts) != len(predictions):
            raise ValueError(
                f"texts has {len(output_texts)} items but the dataloader produced "
                f"{len(predictions)} predictions"
            )

    # Create DataFrame
    return pd.DataFrame({
        'Title_and_Abstract': output_texts,
        'Accepted_for_Full_Text': predictions
    })

In [128]:
# Step 1: Rebuild the same architecture that was used for training
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # Ensure this matches the setup during training
    output_attentions=False,
    output_hidden_states=False,
)

# Step 2: Load the saved model weights
# os.path.join works whether or not saved_model_dir ends with a path separator.
model_path = os.path.join(saved_model_dir, 'DistilBertForSequenceClassification_best_model.pth')
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load(model_path, map_location=device))

model.to(device)
model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [129]:
results_df = classify_papers(model, test_dataloader, device, tokenizer)

In [130]:
results_df.tail(10)

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
10493,fog based integrated nutrient management syste...,0
10494,an automated irrigation system for agriculture...,1
10495,using a compositional function hybridization o...,0
10496,a study on secure network slicing in 5g,0
10497,advancing cattle welfare : ultra low - power h...,1
10498,wireless sensor network based greenhouse monit...,0
10499,agrostring 2. 0 : a distributed - ledger based...,0
10500,performance of routing protocol for low - powe...,0
10501,churn - tolerant leader election protocols,0
10502,holistic technologies for managing internet of...,0


In [205]:
# Partition the predictions by predicted class: rows predicted as 1 are treated as
# relevant, rows predicted as 0 as not relevant.
relevant_df = results_df[results_df['Accepted_for_Full_Text'] == 1]
not_relevant_df = results_df[results_df['Accepted_for_Full_Text'] == 0]

In [206]:
relevant_df.shape, not_relevant_df.shape

((2261, 2), (8242, 2))

In [208]:
relevant_df

Unnamed: 0,Title_and_Abstract,Accepted_for_Full_Text
12,"novel method for crop growth tracking with deep learning model on an edge rail camera … with the advent of deep learning technology, recent studies are focusing on crop … agriculture is keep increasing, from conventional image processing to cutting - edge deep learning …",1
21,"aiot in agriculture : safeguarding crops from pest and disease threats … machine learning algorithm ( figure 4 ) : once a month, the server uses the collected environmental data to retrain each machine learning … this work permits the use of edge computing …",1
30,conceptualizing a holistic smart dairy farming system … machine - learning - based pre - processing will be passed over to a data broker handling all information flows of the edge ai … support for artificial intelligence ( ai ) in agriculture as funding …,1
41,"plant disease detection : electronic system design empowered with artificial intelligence … of diseases in agriculture. with the continuous development of deep learning from one hand, … a smart crop growth monitoring using edge artificial intelligence ( ai ) was developed in [ 7 ] …",1
42,improving deep learning classifiers performance via preprocessing and class imbalance approaches in a plant disease detection pipeline … ( cea ) has progressively come to represent the growth of modern agriculture … edge computing devices in our future work to achieve real - time plant disease diagnosis using edge - ai …,1
...,...,...
10484,temperature and humidity control algorithm for poultry farm control systems,1
10491,iot based precision agri - bot,1
10492,a fog - based smart agriculture system to detect animal intrusion,1
10494,an automated irrigation system for agriculture using iot,1


In [281]:
def parse_bib_file_to_dict(file_path):
    """Read a BibTeX file and return one "title abstract" string per entry.

    Despite the name, the result is a list of strings (in file order), not a
    dict. Braces are removed and newlines become spaces in both fields; a
    missing field counts as an empty string. No lowercasing or punctuation
    stripping is applied.
    """
    def _flatten(raw_field):
        # Strip BibTeX braces, fold line breaks into spaces, trim the ends.
        return raw_field.replace('{', '').replace('}', '').replace('\n', ' ').strip()

    with open(file_path, encoding='utf-8') as handle:
        bib_parser = BibTexParser(common_strings=True)
        database = bibtexparser.load(handle, parser=bib_parser)

    combined_texts = []
    for record in database.entries:
        title_text = _flatten(record.get('title', ''))
        abstract_text = _flatten(record.get('abstract', ''))
        # The outer strip removes the separator space when either field is empty.
        combined_texts.append(f"{title_text} {abstract_text}".strip())

    return combined_texts


def normalize_text(text):
    """Normalize a text by collapsing whitespace, removing punctuation, and lowercasing.

    Whitespace is collapsed before punctuation is removed, so a punctuation mark
    that had a space on both sides leaves a double space behind
    (``"cutting - edge"`` becomes ``"cutting  edge"``). The order is kept as-is
    because it matches the normalized strings shown in this notebook's outputs.
    """
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespaces with single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower().strip()


def extract_matching_bib_entries(predicted_texts, original_bib_entries):
    """Extract entries from original BibTeX data where combined title and abstract match the predicted texts.

    Args:
        predicted_texts: Iterable of strings to look for.
        original_bib_entries: Iterable of dict-like BibTeX entries; the
            ``'title'`` and ``'abstract'`` fields are read, and a missing field
            counts as an empty string.

    Returns:
        list: The entries (in their original order) whose normalized
        "title abstract" text equals the normalized form of any predicted text.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # once per BibTeX entry.
    predicted_keys = {normalize_text(text) for text in predicted_texts}

    return [
        entry
        for entry in original_bib_entries
        if normalize_text(entry.get('title', '') + " " + entry.get('abstract', '')) in predicted_keys
    ]



In [282]:
original_bib_file_dir = "data/Papers_after_duplicates.bib"

In [283]:
titles_and_abstracts = parse_bib_file_to_dict(original_bib_file_dir)

In [284]:
predicted_titles_and_abstracts = relevant_df['Title_and_Abstract'].astype(str).tolist()

In [232]:
# NOTE(review): hidden state. `bib_entries` is not defined in any cell shown in this
# notebook, and the next cell reads `matched_entries` even though this assignment is
# commented out. Neither survives Restart Kernel -> Run All; confirm and restore or remove.
# matched_entries = extract_matching_bib_entries(predicted_titles_and_abstracts, bib_entries)

In [234]:
len(matched_entries)

826

In [285]:
titles_and_abstracts[:5]

['Influence of artificial intelligence (AI) on firm performance: the business value of AI-based transformation projects … Transformation: ADNOC and IBM are relying on cutting-edge AI to develop an automated … This case study highlights the advantages of AI in the field of food and agriculture. Note that the …',
 'Software engineering approaches for tinyml based iot embedded vision: A systematic literature review … (Tiny Machine Learning) has enabled the deployment of ML models for embedded vision on extremely lean edge … It is projected that a total of 2.5 billion edge AI devices will ship with a …',
 'Artificial intelligence in practice: how 50 successful companies used AI and machine learning to solve problems … r Where Alphabet has seen breakthrough development in leading edge AI, such as deep learning, by research groups and start-ups, Google has used its financial resources to bring it …',
 'Industry 4.0: Industrial internet of things (IIOT) … Technologies and IoT have the potenti

In [286]:
predicted_titles_and_abstracts[:5]

['novel method for crop growth tracking with deep learning model on an edge rail camera … with the advent of deep learning technology, recent studies are focusing on crop … agriculture is keep increasing, from conventional image processing to cutting - edge deep learning …',
 'aiot in agriculture : safeguarding crops from pest and disease threats … machine learning algorithm ( figure 4 ) : once a month, the server uses the collected environmental data to retrain each machine learning … this work permits the use of edge computing …',
 'conceptualizing a holistic smart dairy farming system … machine - learning - based pre - processing will be passed over to a data broker handling all information flows of the edge ai … support for artificial intelligence ( ai ) in agriculture as funding …',
 'plant disease detection : electronic system design empowered with artificial intelligence … of diseases in agriculture. with the continuous development of deep learning from one hand, … a smart crop 

In [287]:
# NOTE(review): hidden state. Later cells read `normalized_predicted`, but this
# commented-out line is its only visible definition, so they fail on a fresh kernel.
# normalized_predicted = [normalize_text(text) for text in predicted_titles_and_abstracts]

In [289]:
normalized_predicted[:4]

['novel method for crop growth tracking with deep learning model on an edge rail camera  with the advent of deep learning technology recent studies are focusing on crop  agriculture is keep increasing from conventional image processing to cutting  edge deep learning',
 'aiot in agriculture  safeguarding crops from pest and disease threats  machine learning algorithm  figure 4   once a month the server uses the collected environmental data to retrain each machine learning  this work permits the use of edge computing',
 'conceptualizing a holistic smart dairy farming system  machine  learning  based pre  processing will be passed over to a data broker handling all information flows of the edge ai  support for artificial intelligence  ai  in agriculture as funding',
 'plant disease detection  electronic system design empowered with artificial intelligence  of diseases in agriculture with the continuous development of deep learning from one hand  a smart crop growth monitoring using edge a

In [255]:
def match_entries(predicted_list, bib_entries_list):
    """Split predicted texts into those found verbatim among the BibTeX texts and those not found.

    Comparison is exact string equality. Both returned lists keep the order
    (and any duplicates) of ``predicted_list``.

    Returns:
        tuple: ``(matched, unmatched)`` lists of predicted texts.
    """
    # Hash the BibTeX texts once so each lookup is constant time.
    known_texts = frozenset(bib_entries_list)

    found, missing = [], []
    for candidate in predicted_list:
        bucket = found if candidate in known_texts else missing
        bucket.append(candidate)

    return found, missing

# Relies on `normalized_predicted` and `titles_and_abstracts` already existing in the kernel.
# NOTE(review): the predicted side is normalized, but the BibTeX side passed here is
# `titles_and_abstracts`, not a normalized list; match_entries compares by exact equality,
# so confirm both sides were normalized the same way when this cell was run.
matched_entries, unmatched_entries = match_entries(normalized_predicted, titles_and_abstracts)


In [258]:
len(matched_entries)

826

In [263]:
len(unmatched_entries)

1435

In [265]:
matched_entries[:10]

['towards a novel wearable solution for citrus inspection using edge ai  agriculture has benefited from computer vision  use deep learning on an edge ai specialized device for this matter we applied and evaluated the standard metrics for machine learning',
 'autonomous navigation in vineyards with deep learning at the edge  over the past years the agriculture industry is asked to respond  later advancement in deep learning and edge computing  on the other hand as edge ai embedded computational',
 'remote crop sensing with iot and ai on the edge  this edge computing approach utilizing artificial intelligence  and mechanised agriculture based on machine learning and  the self proclaimed edge ai is a concept of balance between',
 'chapter 16  role of artificial intelligence and the internet of things in agriculture the agricultural industry relies on innovative ideas and technological advancements to help increase yields and better allocate resources the late 19th and the 20th centuries b

In [266]:
unmatched_entries[:6]

['novel method for crop growth tracking with deep learning model on an edge rail camera  with the advent of deep learning technology recent studies are focusing on crop  agriculture is keep increasing from conventional image processing to cutting  edge deep learning',
 'aiot in agriculture  safeguarding crops from pest and disease threats  machine learning algorithm  figure 4   once a month the server uses the collected environmental data to retrain each machine learning  this work permits the use of edge computing',
 'conceptualizing a holistic smart dairy farming system  machine  learning  based pre  processing will be passed over to a data broker handling all information flows of the edge ai  support for artificial intelligence  ai  in agriculture as funding',
 'plant disease detection  electronic system design empowered with artificial intelligence  of diseases in agriculture with the continuous development of deep learning from one hand  a smart crop growth monitoring using edge a

In [267]:
def normalize_and_process_text(texts, tokenizer):
    """Normalize each text, then round-trip it through the tokenizer (encode, then decode).

    The texts are encoded with truncation to 512 tokens and padding, and every
    encoded sequence is decoded back to a string with special tokens skipped.

    Returns:
        list: One decoded string per input text, in input order.
    """
    # normalize_text is not defined in this function; it must already exist in the kernel.
    cleaned = list(map(normalize_text, texts))
    encoded = tokenizer(cleaned, truncation=True, padding=True, max_length=512, return_tensors="pt")

    round_tripped = []
    for token_ids in encoded.input_ids:
        round_tripped.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return round_tripped

processed_extracted_texts = normalize_and_process_text(titles_and_abstracts, tokenizer)

In [272]:
processed_predicted_texts = normalize_and_process_text(normalized_predicted, tokenizer)

In [269]:
processed_extracted_texts[:5]

['influence of artificial intelligence ai on firm performance the business value of aibased transformation projects transformation adnoc and ibm are relying on cuttingedge ai to develop an automated this case study highlights the advantages of ai in the field of food and agriculture note that the',
 'software engineering approaches for tinyml based iot embedded vision a systematic literature review tiny machine learning has enabled the deployment of ml models for embedded vision on extremely lean edge it is projected that a total of 25 billion edge ai devices will ship with a',
 'artificial intelligence in practice how 50 successful companies used ai and machine learning to solve problems r where alphabet has seen breakthrough development in leading edge ai such as deep learning by research groups and startups google has used its financial resources to bring it',
 'industry 40 industrial internet of things iiot technologies and iot have the potential to transform agriculture in many on

In [273]:
matched_entries, unmatched_entries = match_entries(processed_predicted_texts,processed_extracted_texts)

In [275]:
len(matched_entries)

1172

In [279]:
unmatched_entries[900:]

['smart logistic system for enhancing the farmer customer corridor in smart agriculture sector using artificial intelligence',
 'real time riped fruit detection using faster r cnn deep neural network models',
 'improvement of precision in agriculture using iot based machine learning techniques',
 'plant disease detection using ai based vgg 16 model',
 'iot based portable weather station for irrigation management using real time parameters',
 'machine learning and internet of things iot for real time image classification in smart agriculture',
 'computer vision based smart agriculture storage with quality and quantity analysis and recipe suggestion',
 'ultrasonic sensor based canopy height measurement and root depth estimation',
 'a review on leaf based plant disease detection systems using machine learning',
 'a real time iot and image processing based weeds classification system for selective herbicide',
 'deep learning based detection of plant nutrient deficiency symptom and design o