# Installs

In [1]:
import sys

# For conda, use the following syntax:
# !conda install --yes --prefix {sys.prefix} library_name

# For pip, use the following syntax:
# !{sys.executable} -m pip install library_name

In [3]:
import xml.sax

class Article:
    """A single ``article`` record read from the DBLP XML dump.

    Attributes are plain public fields; the ``set_*`` / ``add_author``
    methods are kept for callers that use the setter-style API.
    """

    def __init__(self, key, title="", journal="", year="", authors=None):
        """Create an article record.

        Args:
            key: Value of the element's ``key`` attribute.
            title: Article title text.
            journal: Journal name text.
            year: Publication year, stored as given (text).
            authors: List of author names. When omitted (``None``) a new
                empty list is created for this instance. A list that is
                passed in is stored as-is (not copied).
        """
        self.key = key
        self.title = title
        self.journal = journal
        self.year = year
        # A literal ``[]`` default would be created once and shared by every
        # instance built without ``authors``, so add_author() on one record
        # would show up on all the others. Build a fresh list per instance.
        self.authors = [] if authors is None else authors

    def set_title(self, title):
        """Replace the stored title."""
        self.title = title

    def set_journal(self, journal):
        """Replace the stored journal name."""
        self.journal = journal

    def set_year(self, year):
        """Replace the stored publication year."""
        self.year = year

    def set_authors(self, authors):
        """Replace the whole author list with ``authors``."""
        self.authors = authors

    def add_author(self, author):
        """Append one author name to the author list."""
        self.authors.append(author)
        
class Inproceedings:
    """A single ``inproceedings`` record read from the DBLP XML dump.

    Attributes are plain public fields; the ``set_*`` / ``add_author``
    methods are kept for callers that use the setter-style API.
    """

    def __init__(self, key, title="", booktitle="", year="", authors=None):
        """Create an inproceedings record.

        Args:
            key: Value of the element's ``key`` attribute.
            title: Paper title text.
            booktitle: Text of the ``booktitle`` element.
            year: Publication year, stored as given (text).
            authors: List of author names. When omitted (``None``) a new
                empty list is created for this instance. A list that is
                passed in is stored as-is (not copied).
        """
        self.key = key
        self.title = title
        self.booktitle = booktitle
        self.year = year
        # A literal ``[]`` default would be created once and shared by every
        # instance built without ``authors``, so add_author() on one record
        # would show up on all the others. Build a fresh list per instance.
        self.authors = [] if authors is None else authors

    def set_title(self, title):
        """Replace the stored title."""
        self.title = title

    def set_booktitle(self, booktitle):
        """Replace the stored booktitle."""
        self.booktitle = booktitle

    def set_year(self, year):
        """Replace the stored publication year."""
        self.year = year

    def set_authors(self, authors):
        """Replace the whole author list with ``authors``."""
        self.authors = authors

    def add_author(self, author):
        """Append one author name to the author list."""
        self.authors.append(author)

class TagHandler(xml.sax.ContentHandler):
    """SAX handler that builds and counts ``article`` / ``inproceedings`` records.

    While a record element is open, the text of its ``author``, ``title``,
    ``year``, ``journal`` and ``booktitle`` children is copied onto an
    ``Article`` or ``Inproceedings`` object. When the record's end tag is
    seen the matching counter is incremented and the object is dropped
    (persisting it is still a TODO). The totals are printed when the
    ``dblp`` end tag is seen.

    Attributes:
        article: The ``Article`` currently being filled, or ``None``.
        inproceedings: The ``Inproceedings`` currently being filled, or ``None``.
        current_tag: Name of the field element whose text is currently being
            collected, or ``None`` when no field is open.
        article_count: Number of ``article`` end tags seen so far.
        inproceedings_count: Number of ``inproceedings`` end tags seen so far.
    """

    # Child elements whose text is stored on the open record.
    _FIELD_TAGS = frozenset(("author", "title", "year", "journal", "booktitle"))

    def __init__(self):
        super().__init__()
        self.article = None
        self.inproceedings = None
        self.current_tag = None
        self.article_count = 0
        self.inproceedings_count = 0
        # Text chunks of the field being collected. The parser may deliver
        # one text node through several characters() calls.
        self._text_parts = []
        # Depth of markup elements opened inside the current field
        # (e.g. <i> inside <title>), so their end tags do not close the field.
        self._nested_depth = 0

    def _reset_field(self):
        """Forget any partially collected field text."""
        self.current_tag = None
        self._text_parts = []
        self._nested_depth = 0

    def _store_field(self):
        """Write the collected text of the finished field onto the open record."""
        tag = self.current_tag
        text = "".join(self._text_parts)
        self._reset_field()

        if self.article is not None:
            record = self.article
        else:
            record = self.inproceedings
        if record is None:
            return

        if tag == "author":
            # An element with no text never produced an author before either.
            if text:
                record.add_author(text)
        elif tag == "title":
            record.set_title(text)
        elif tag == "year":
            record.set_year(text)
        elif tag == "journal":
            # Only articles carry a journal; ignored on inproceedings.
            if record is self.article:
                record.set_journal(text)
        elif tag == "booktitle":
            # Only inproceedings carry a booktitle; ignored on articles.
            if record is self.inproceedings:
                record.set_booktitle(text)

    # Called when an element starts
    def startElement(self, tag, attributes):
        """Open a new record or start collecting a field of the open record.

        Raises:
            RuntimeError: If an ``article`` or ``inproceedings`` start tag is
                seen while a previous record is still open.
            KeyError: If a record element has no ``key`` attribute
                (raised by ``attributes.getValue``).
        """
        if tag == "article":
            if self.inproceedings is not None:
                raise RuntimeError("Parse Error: start of article found before previous inproceedings end tag.")
            if self.article is not None:
                raise RuntimeError("Parse Error: start of article found before previous article end tag.")
            self._reset_field()
            self.article = Article(attributes.getValue("key"), authors=[])
        elif tag == "inproceedings":
            if self.inproceedings is not None:
                raise RuntimeError("Parse Error: start of inproceedings found before previous inproceedings end tag.")
            if self.article is not None:
                raise RuntimeError("Parse Error: start of inproceedings found before previous article end tag.")
            self._reset_field()
            self.inproceedings = Inproceedings(attributes.getValue("key"), authors=[])
        elif self.current_tag is not None:
            # Inline markup inside a field: keep collecting into the same
            # field instead of switching tags and losing the remaining text.
            self._nested_depth += 1
        elif tag in self._FIELD_TAGS and (
            self.article is not None or self.inproceedings is not None
        ):
            self.current_tag = tag
            self._text_parts = []

    # Called when an element ends
    def endElement(self, tag):
        """Close a record (counting it), print totals, or finish a field."""
        if tag == "article":
            self.article_count += 1
            # TODO: save article object to database
            self.article = None
            self._reset_field()
        elif tag == "inproceedings":
            self.inproceedings_count += 1
            # TODO: save inproceedings object to database
            self.inproceedings = None
            self._reset_field()
        elif tag == "dblp":
            print(f"article count: {self.article_count}\ninproceedings count: {self.inproceedings_count}")
            self._reset_field()
        elif self.current_tag is not None:
            if self._nested_depth:
                # End of inline markup inside the field; the field stays open.
                self._nested_depth -= 1
            else:
                self._store_field()

    # Called for each chunk of character data
    def characters(self, content):
        """Buffer text belonging to the field currently being collected.

        The text is only assembled and stored when the field's end tag
        arrives, because a single text node may arrive in several chunks.
        """
        if self.current_tag is not None:
            self._text_parts.append(content)
            
if __name__ == "__main__":
    # Handler that receives the SAX events and tallies the records.
    Handler = TagHandler()

    # Build an XMLReader with namespace processing switched off, and
    # install our handler in place of the default ContentHandler.
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    parser.setContentHandler(Handler)

    # Stream the DBLP dump through the handler.
    parser.parse("dblp-2022-01-01.xml")

article count: 2738932
inproceedings count: 2956396
