# Ranked Retrieval Using PyLucene
## Abhisek Sarkar
## Learnt from the CS4201 Information Retrieval and Web Search coursework, IISER Kolkata

### Indexer

In [None]:
# Import necessary packages
import os
import lucene
from bs4 import BeautifulSoup
from java.nio.file import Paths

# Lucene-specific imports
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import NIOFSDirectory

# Progress bar library
from tqdm import tqdm


In [None]:
class PaperIndexer:
    """
    This class indexes academic papers stored as XML files using PyLucene.

    Each paper becomes one Lucene document with the stored, analyzed fields
    'title', 'abstract', 'authors' and 'content'.
    """

    def __init__(self, index_path):
        """
        Initializes the Lucene virtual machine and creates an IndexWriter object.

        Args:
            index_path (str): Path to the directory where the Lucene index will be stored.
        """
        lucene.initVM()
        self.writer = IndexWriter(
            NIOFSDirectory(Paths.get(index_path)),
            IndexWriterConfig(EnglishAnalyzer())
        )

    @staticmethod
    def _text_of(node):
        """Returns the stripped text of a parsed element, or '' if the element is missing."""
        return node.text.strip() if node is not None else ''

    def _build_document(self, soup):
        """Builds the Lucene document (title, abstract, authors, content) for one parsed XML file."""
        title = self._text_of(soup.find('title'))
        abstract = self._text_of(soup.find('abstract'))

        authors = []
        for author in soup.find_all('author'):
            # <author> elements without a <persName> carry no usable name.
            if author.find('persName') is None:
                continue
            first_name = self._text_of(author.find('forename'))
            last_name = self._text_of(author.find('surname'))
            authors.append(f"{first_name} {last_name}".strip())
        author_list = ', '.join(authors)

        # A file without a <body> element is indexed with empty content
        # instead of aborting the whole indexing run.
        body = soup.body
        content = body.get_text(separator=' ').strip() if body is not None else ''

        document = Document()
        document.add(TextField('title', title, Field.Store.YES))
        document.add(TextField('abstract', abstract, Field.Store.YES))
        document.add(TextField('authors', author_list, Field.Store.YES))
        document.add(TextField('content', content, Field.Store.YES))
        return document

    def index_papers(self, xml_directory, overwrite=False):
        """
        Indexes all academic papers (in XML format) within a directory.

        Only regular files whose name ends in '.xml' (case-insensitive) are
        indexed; sub-directories and other files are skipped. Elements that
        are missing from a file are indexed as empty strings. All additions
        are committed once at the end.

        Args:
            xml_directory (str): Path to the directory containing the XML files.
            overwrite (bool, optional): If True, deletes existing index before indexing. Defaults to False.
        """
        if overwrite:
            self.writer.deleteAll()

        # Collect the files up front so the progress bar total matches the
        # number of files that are actually indexed.
        xml_paths = [
            os.path.join(xml_directory, filename)
            for filename in os.listdir(xml_directory)
            if filename.lower().endswith('.xml')
            and os.path.isfile(os.path.join(xml_directory, filename))
        ]

        for file_path in tqdm(xml_paths):
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'xml')
            self.writer.addDocument(self._build_document(soup))

        self.writer.commit()


### PDF Parser

In [None]:
from grobid_client.grobid_client import GrobidClient

class AcademicPaperParser:
    """
    This class utilizes GrobidClient to parse academic papers in PDF format and generate XML files.
    """

    def __init__(self, config_file_path="./config.json"):
        """
        Initializes a GrobidClient object using the provided configuration file path.

        Args:
            config_file_path (str, optional): Path to the Grobid client configuration file. Defaults to "./config.json".
        """
        self.config_path = config_file_path
        self.client = GrobidClient(config_path=self.config_path)

    def parse_papers(
        self,
        input_directory,
        output_directory,
        service="processFulltextDocument",
        number_of_files=10,
        verbosity=True,
        overwrite=True,
        generate_ids=False,
        consolidate_header=True,
        consolidate_citations=False,
        include_raw_citations=False,
        include_raw_affiliations=False,
        include_tei_coordinates=False,
        segment_sentences=False,
    ):
        """
        Forwards a PDF directory to GrobidClient.process, which writes the resulting XML files to the output directory.

        Args:
            input_directory (str): Path to the directory containing the PDF files.
            output_directory (str): Path to the directory where the generated XML files will be stored.
            service (str, optional): Grobid service to be used for parsing. Defaults to "processFulltextDocument".
            number_of_files (int, optional): Passed to GrobidClient.process as ``n``. Defaults to 10.
            verbosity (bool, optional): Passed as ``verbose``; controls the logging output from GrobidClient. Defaults to True.
            overwrite (bool, optional): Passed as ``force``; if True, existing XML files in the output directory will be overwritten. Defaults to True.
            generate_ids (bool, optional): Passed as ``generateIDs``. Defaults to False.
            consolidate_header (bool, optional): If True, Grobid will consolidate header information. Defaults to True.
            consolidate_citations (bool, optional): If True, Grobid will consolidate citation information. Defaults to False.
            include_raw_citations (bool, optional): If True, Grobid will include raw citation data in the output. Defaults to False.
            include_raw_affiliations (bool, optional): If True, Grobid will include raw affiliation data in the output. Defaults to False.
            include_tei_coordinates (bool, optional): Passed as ``tei_coordinates``; if True, Grobid will include TEI coordinates in the output. Defaults to False.
            segment_sentences (bool, optional): If True, Grobid will segment the text into sentences. Defaults to False.
        """
        self.client.process(
            input_path=input_directory,
            output=output_directory,
            service=service,
            n=number_of_files,
            verbose=verbosity,
            force=overwrite,
            generateIDs=generate_ids,
            consolidate_header=consolidate_header,
            consolidate_citations=consolidate_citations,
            include_raw_citations=include_raw_citations,
            include_raw_affiliations=include_raw_affiliations,
            # The client's keyword is snake_case ("teiCoordinates" is only the
            # CLI flag / REST field name); the camelCase spelling raises TypeError.
            tei_coordinates=include_tei_coordinates,
            segment_sentences=segment_sentences
        )

if __name__ == "__main__":
    # Convert the PDFs in "pdfs" into XML files under "parsed_xmls".
    parser = AcademicPaperParser()
    # parse_papers names this flag `verbosity`; passing `verbose=` raises TypeError.
    parser.parse_papers("pdfs", "parsed_xmls", verbosity=True, overwrite=True, number_of_files=10)


### Searcher

In [None]:
import lucene

from java.nio.file import Paths
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import NIOFSDirectory


class AcademicPaperSearcher:
    """
    This class searches an indexed collection of academic papers using Lucene,
    ranking hits with BM25 similarity.
    """

    def __init__(self, index_path):
        """
        Initializes a Lucene IndexSearcher object for the specified index path.

        The reader is opened here, so the index must already exist at
        ``index_path`` when the searcher is constructed.

        Args:
            index_path (str): Path to the Lucene index directory.
        """
        lucene.initVM()
        self.searcher = IndexSearcher(
            DirectoryReader.open(NIOFSDirectory(Paths.get(index_path)))
        )
        self.searcher.setSimilarity(BM25Similarity())

    def search(self, search_query, search_field, number_of_results=10):
        """
        Searches the Lucene index for documents matching the query within a specific field.

        Args:
            search_query (str): The query string to be used for searching (Lucene classic query syntax).
            search_field (str): The default field within the documents to search in.
            number_of_results (int, optional): The maximum number of search results to return. Defaults to 10.

        Returns:
            list: A list of dictionaries, one per hit in the order returned by
            the searcher, with the keys 'title', 'abstract', 'authors', 'body'
            and 'score'.
        """
        parsed_query = QueryParser(search_field, EnglishAnalyzer()).parse(search_query)
        top_hits = self.searcher.search(parsed_query, number_of_results).scoreDocs

        results = []
        for hit in top_hits:
            document = self.searcher.doc(hit.doc)
            results.append({
                'title': document.get('title'),
                'abstract': document.get('abstract'),
                'authors': document.get('authors'),
                # PaperIndexer stores the full text in the 'content' field;
                # there is no stored 'body' field, so reading it returned None.
                'body': document.get('content'),
                'score': hit.score
            })

        return results


### main

In [None]:
# Parse PDFs and generate XMLs
# (parse_papers names the logging flag `verbosity`, not `verbose`).
AcademicPaperParser().parse_papers("pdfs", "xmls", verbosity=False, overwrite=False, number_of_files=10)

# Index the generated XML files.
# PaperIndexer requires the index directory; use the one the searcher reads.
indexer = PaperIndexer("index")
indexer.index_papers("xmls", overwrite=True)

# Open the searcher only after the index has been committed: the reader sees
# the index as it was when opened, and cannot open a directory with no commit.
searcher = AcademicPaperSearcher("index")


# Main search loop: type "exit" as the query to quit.
if __name__ == "__main__":
    while True:
        query = input("Query ⇨ ")
        if query == "exit":
            break
        field = input("Field ⇨ ")
        results = searcher.search(query, field)
        for i, result in enumerate(results):
            print(f"{i + 1}. {result['title']} | [{result['score']:.2f}]")
            print()