In [1]:
import os
from whoosh.index import create_in
from whoosh.writing import BufferedWriter, AsyncWriter
from whoosh.analysis import StandardAnalyzer, SimpleAnalyzer
from whoosh.fields import *
import xml.etree.ElementTree as ET
import lxml.etree
import numpy as np
import time
import concurrent

In [2]:
# Tags of the XML elements that are treated as indexable records. The visible
# code only uses this dict for membership tests; every value is the placeholder 1.
doc_type = {k: 1 for k in ['inproceedings', 'incollection', 'book', 'proceedings', 'article', '']}
# Index field -> tags of the child elements whose text feeds that field.
# Every value must be a real tuple: ('year') without the trailing comma is just
# the string 'year', which turns `tag in features['year']` into a substring
# test (e.g. 'ea' would match) instead of a tag comparison.
features = {'title': ('title', 'booktitle'),
            'year': ('year',),
            'author': ('author', 'editor'),
            'publication_venue': ('publisher', 'series', 'school', 'journal')}

In [3]:
# Analyzer instances shared by the schema fields defined in the next cell.
standard_analyzer, simple_analyzer = StandardAnalyzer(), SimpleAnalyzer()

In [4]:
# Index schema: four stored fields. The three text fields are analyzed
# (author with the simple analyzer, the others with the standard one);
# year is numeric.
schema = Schema(
    title=TEXT(stored=True, analyzer=standard_analyzer),
    author=TEXT(stored=True, analyzer=simple_analyzer),
    year=NUMERIC(stored=True),
    publication_venue=TEXT(stored=True, analyzer=standard_analyzer),
)

In [5]:
# Make sure the index directory exists. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of exists() followed by mkdir().
os.makedirs("indexdir", exist_ok=True)

In [6]:
# Create the index named "dblp" inside ./indexdir using the schema defined above.
# NOTE(review): per the Whoosh documentation, create_in() on a directory that
# already holds an index clears its contents, so re-running this cell would
# discard previously indexed documents — confirm this is intended.
ix = create_in("indexdir", schema=schema, indexname="dblp")

In [7]:
class Parser():
    """Stream an XML dump and add its records to the Whoosh index ``ix``.

    The class relies on notebook-level names defined in other cells:
    ``doc_type`` (tags treated as records), ``features`` (index field ->
    source child tags) and ``ix`` (the index receiving the documents).
    """

    def __init__(self):
        # Number of record elements seen since the last commit.
        self.idx = 0
        # Field values collected for the record currently being processed.
        self.instance = {k: list() for k in features.keys()}
        # Reference point for every elapsed time stored in `timing`.
        self.time = time.time()
        # Elapsed seconds, appended at every batch commit and once at the end.
        self.timing = list([0])
        # Number of committed batches (printed as "percent" in the progress log).
        self.percent = 0
        # True while `instance` holds at least one value waiting to be indexed.
        self.flag = False

    @staticmethod
    def _collect_fields(element, tag_to_field):
        """Return ``{field: [texts]}`` built from the children of ``element``.

        The full text content of each child is used (``itertext``), not just
        ``child.text``, because ``.text`` stops at the first nested element.
        NOTE(review): this matters if children carry inline markup (e.g.
        ``<i>``/``<sub>`` inside a title) — confirm against the corpus.
        """
        record = {k: list() for k in features.keys()}
        for child in element:
            field = tag_to_field.get(child.tag)
            if field is None:
                continue
            text = ''.join(child.itertext())
            if text:
                record[field].append(text)
        return record

    @staticmethod
    def _as_document(record):
        """Turn the collected value lists into the keyword arguments of add_document."""
        years = record['year']
        return {'title': ', '.join(record['title']),
                'author': ', '.join(record['author']),
                # Only the first year is kept: joining several years would
                # give a string such as "2001, 2002", which is not a number.
                # '0' marks a record without a year.
                'year': years[0] if years else '0',
                'publication_venue': ', '.join(record['publication_venue'])}

    @staticmethod
    def _release(element):
        """Free a processed record so the parsed tree does not grow without bound.

        Only records that are direct children of the root are released; for
        any other layout the tree is left untouched.
        """
        parent = element.getparent()
        if parent is None or parent.getparent() is not None:
            return
        element.clear()
        # Drop the already-finished siblings that precede this record.
        while element.getprevious() is not None:
            del parent[0]

    def _finish_batch(self, writer):
        """Log the progress of the current batch, reset the counter and commit."""
        self.percent += 1
        self.timing.append(time.time() - self.time)
        print(f"-- Finish indexing {self.percent}% of the corpus in {self.timing[-1]} seconds")
        self.idx = 0
        # NOTE(review): optimize=True is requested on every commit, as before;
        # if commits get slower as the index grows, consider optimizing only once.
        writer.commit(optimize=True)

    def parse(self, file = './dblp.xml', dtd = './dblp.dtd', batch_size = 67867):
        """Index every record of ``file`` into ``ix``, committing in batches.

        Args:
            file: path of the XML file to stream.
            dtd: any truthy value enables DTD loading. NOTE(review): lxml's
                ``load_dtd`` is a flag, so the path itself is presumably not
                used and the DTD is located through the document — confirm.
            batch_size: number of records per commit (default 67867, the
                value that was previously hard-coded).

        Raises:
            Whatever the XML parser or the index writer raises; the pending
            writer is cancelled first so it does not keep holding the index.
        """
        # Flat tag -> field lookup. A bare string value is treated as a single
        # tag so that it is never used for substring matching.
        tag_to_field = {}
        for field, tags in features.items():
            if isinstance(tags, str):
                tags = (tags,)
            for tag in tags:
                tag_to_field.setdefault(tag, field)

        writer = None
        try:
            for (_event, element) in lxml.etree.iterparse(file, dtd_validation=True, events=['end'], load_dtd=bool(dtd)):
                if element.tag not in doc_type:
                    continue
                self.idx += 1
                self.instance = self._collect_fields(element, tag_to_field)
                self.flag = any(self.instance.values())
                self._release(element)
                if not self.flag:
                    continue
                # Exactly one writer per batch, created only when a document
                # is actually about to be added.
                if writer is None:
                    writer = AsyncWriter(ix, writerargs={'procs':6})
                writer.add_document(**self._as_document(self.instance))
                self.instance = {k: list() for k in features.keys()}
                self.flag = False
                # '>=' rather than '==': the boundary may be crossed by a
                # record that produced no document.
                if self.idx >= batch_size:
                    full, writer = writer, None
                    self._finish_batch(full)
            # Commit the last, partially filled batch.
            if writer is not None:
                last, writer = writer, None
                self._finish_batch(last)
        except BaseException:
            # Includes KeyboardInterrupt: give up the uncommitted batch.
            if writer is not None:
                writer.cancel()
            raise
        self.timing.append(time.time() - self.time)
        print(f"-- Finish indexing the corpus in {self.timing[-1]} seconds")
    


In [8]:
# NOTE(review): `timing` is not read by any of the visible cells; Parser keeps
# its own `timing` list — confirm whether this variable is still needed.
timing = []
# Build the parser and run it with its default arguments
# (file='./dblp.xml', dtd='./dblp.dtd').
parser = Parser()
parser.parse()

-- Finish indexing 1% of the corpus in 6.990043878555298 seconds
-- Finish indexing 2% of the corpus in 22.50542950630188 seconds
-- Finish indexing 3% of the corpus in 39.62352395057678 seconds
-- Finish indexing 4% of the corpus in 61.91430354118347 seconds
-- Finish indexing 5% of the corpus in 99.30147504806519 seconds
-- Finish indexing 6% of the corpus in 138.27858352661133 seconds
-- Finish indexing 7% of the corpus in 155.332261800766 seconds
-- Finish indexing 8% of the corpus in 171.87531924247742 seconds
-- Finish indexing 9% of the corpus in 190.29377794265747 seconds
-- Finish indexing 10% of the corpus in 210.10067009925842 seconds
-- Finish indexing 11% of the corpus in 229.2938461303711 seconds
-- Finish indexing 12% of the corpus in 247.00334358215332 seconds
-- Finish indexing 13% of the corpus in 279.2983179092407 seconds
-- Finish indexing 14% of the corpus in 313.4430034160614 seconds
-- Finish indexing 15% of the corpus in 382.60352873802185 seconds
-- Finish inde

AttributeError: 'AsyncWriter' object has no attribute 'close'