# Installs

In [1]:
import sys

# For conda, use the following syntax
# !conda install --yes --prefix {sys.prefix} library_name

# For pip, use the following syntax:
# !{sys.executable} -m pip install library_name
# !{sys.executable} -m pip install psycopg2-binary

In [19]:
import xml.sax
import psycopg2
from psycopg2 import Error

class Article:
    """In-memory record for one ``article`` element.

    Holds the publication key plus the optional title, journal, year and
    the list of author names collected while parsing.
    """

    def __init__(self, key, title=None, journal=None, year=None, authors=None):
        """Create an article record.

        Args:
            key: Publication key identifying the article.
            title: Article title, or None if not known yet.
            journal: Journal name, or None if not known yet.
            year: Publication year, or None if not known yet.
            authors: List of author names. When omitted, a fresh empty list
                is created for this instance (a ``[]`` default would be
                shared between every instance created without the argument).
        """
        self.key = key
        self.title = title
        self.journal = journal
        self.year = year
        # A caller-supplied list is stored as-is (not copied).
        self.authors = [] if authors is None else authors

    # Setters
    def set_title(self, title):
        """Replace the title."""
        self.title = title

    def set_journal(self, journal):
        """Replace the journal name."""
        self.journal = journal

    def set_year(self, year):
        """Replace the publication year."""
        self.year = year

    def set_authors(self, authors):
        """Replace the whole author list."""
        self.authors = authors

    def add_author(self, author):
        """Append one author to the author list."""
        self.authors.append(author)

    # Getters
    def get_key(self):
        """Return the publication key."""
        return self.key

    def get_title(self):
        """Return the title (None if unset)."""
        return self.title

    def get_journal(self):
        """Return the journal name (None if unset)."""
        return self.journal

    def get_year(self):
        """Return the publication year (None if unset)."""
        return self.year

    def get_authors(self):
        """Return the list of authors."""
        return self.authors
        
class Inproceedings:
    """In-memory record for one ``inproceedings`` element.

    Holds the publication key plus the optional title, booktitle, year and
    the list of author names collected while parsing.
    """

    def __init__(self, key, title=None, booktitle=None, year=None, authors=None):
        """Create an inproceedings record.

        Args:
            key: Publication key identifying the paper.
            title: Paper title, or None if not known yet.
            booktitle: Value of the ``booktitle`` field, or None if not
                known yet.
            year: Publication year, or None if not known yet.
            authors: List of author names. When omitted, a fresh empty list
                is created for this instance (a ``[]`` default would be
                shared between every instance created without the argument).
        """
        self.key = key
        self.title = title
        self.booktitle = booktitle
        self.year = year
        # A caller-supplied list is stored as-is (not copied).
        self.authors = [] if authors is None else authors

    # Setters
    def set_title(self, title):
        """Replace the title."""
        self.title = title

    def set_booktitle(self, booktitle):
        """Replace the booktitle."""
        self.booktitle = booktitle

    def set_year(self, year):
        """Replace the publication year."""
        self.year = year

    def set_authors(self, authors):
        """Replace the whole author list."""
        self.authors = authors

    def add_author(self, author):
        """Append one author to the author list."""
        self.authors.append(author)

    # Getters
    def get_key(self):
        """Return the publication key."""
        return self.key

    def get_title(self):
        """Return the title (None if unset)."""
        return self.title

    def get_booktitle(self):
        """Return the booktitle (None if unset)."""
        return self.booktitle

    def get_year(self):
        """Return the publication year (None if unset)."""
        return self.year

    def get_authors(self):
        """Return the list of authors."""
        return self.authors
        

class TagHandler(xml.sax.ContentHandler):
    """SAX content handler that collects publication records from XML.

    ``article`` elements are turned into ``Article`` objects and inserted
    into the ``public."Article"`` PostgreSQL table when their end tag is
    seen. ``inproceedings`` elements are turned into ``Inproceedings``
    objects and counted; saving them is still a TODO.

    Attributes:
        article: The ``Article`` currently being parsed, or None.
        inproceedings: The ``Inproceedings`` currently being parsed, or None.
        current_tag: Name of the most recently opened element, reset to None
            whenever any element ends.
        article_count: Number of ``article`` end tags seen.
        inproceedings_count: Number of ``inproceedings`` end tags seen.
        connection: psycopg2 connection, or None if connecting failed.
        cursor: psycopg2 cursor, or None if it could not be created.
    """

    # Child elements of a record whose text content is captured.
    _FIELD_TAGS = frozenset({"author", "title", "year", "journal", "booktitle"})

    # Parameterized so that quoting/escaping is done by the driver; a None
    # parameter is sent as SQL NULL.
    _INSERT_ARTICLE = (
        'INSERT INTO public."Article" (pubkey, title, journal, year) '
        "VALUES (%s, %s, %s, %s)"
    )

    def __init__(self):
        """Initialise parser state and open the database connection.

        A connection failure is reported on stdout and leaves
        ``connection``/``cursor`` as None (or closed); it is not re-raised.
        """
        super().__init__()
        self.article = None
        self.inproceedings = None
        self.current_tag = None
        self.article_count = 0
        self.inproceedings_count = 0
        # Field currently being captured and the text chunks received for it.
        self._field = None
        self._chunks = []
        # Defined before the try block so the error path and the close
        # methods can always test them.
        self.connection = None
        self.cursor = None
        try:
            # Open Database Connection
            # NOTE(review): credentials are hard-coded; consider moving them
            # to configuration or environment variables.
            self.connection = psycopg2.connect(user="dblpuser",
                                               password="dblpus3r",
                                               host="localhost",
                                               port="5432",
                                               database="dblp")
            # Open a cursor to perform database operations
            self.cursor = self.connection.cursor()
            # Print PostgreSQL details
            print("PostgreSQL server information")
            print(self.connection.get_dsn_parameters(), "\n")
            # Executing a SQL query
            self.cursor.execute("SELECT version();")
            # Fetch result
            record = self.cursor.fetchone()
            print("You are connected to - ", record, "\n")
        except Exception as error:
            print("Error while connecting to PostgreSQL", error)
            self._close("__init__")

    def _close(self, where):
        """Close the cursor and connection if present, reporting *where*."""
        if self.connection is None:
            return
        if self.cursor is not None:
            self.cursor.close()
        self.connection.close()
        print(f"PostgreSQL connection is closed in {where}")

    def _require_no_open_record(self, tag):
        """Raise RuntimeError if a record is still open when *tag* starts."""
        if self.inproceedings is not None:
            raise RuntimeError(f"Parse Error: start of {tag} found before previous inproceedings end tag.")
        if self.article is not None:
            raise RuntimeError(f"Parse Error: start of {tag} found before previous article end tag.")

    # Call when an element starts
    def startElement(self, tag, attributes):
        """Open a new record, or start capturing text for a record field.

        Raises:
            RuntimeError: If an ``article`` or ``inproceedings`` element
                starts while a previous record is still open.
        """
        self.current_tag = tag
        if tag == "article":
            self._require_no_open_record("article")
            self.article = Article(attributes.getValue("key"), authors=[])
        elif tag == "inproceedings":
            self._require_no_open_record("inproceedings")
            self.inproceedings = Inproceedings(attributes.getValue("key"), authors=[])
        elif (self._field is None
              and tag in self._FIELD_TAGS
              and (self.article is not None or self.inproceedings is not None)):
            # Elements nested inside a field (e.g. inline markup in a title)
            # do not restart capture: _field stays set until its end tag.
            self._field = tag
            self._chunks = []

    # Call when an element ends
    def endElement(self, tag):
        """Commit a finished field, or save and close a finished record."""
        if tag == self._field:
            self._commit_field()
        elif tag == "article":
            self._save_article()

            # TODO: save authors to database

            self.article_count += 1
            self.article = None
            self._field = None
        elif tag == "inproceedings":

            # TODO: Save inproceedings to database

            # TODO: Save authors to database

            self.inproceedings_count += 1
            self.inproceedings = None
            self._field = None
        self.current_tag = None

    def _commit_field(self):
        """Store the buffered text of the finished field on the open record."""
        field = self._field
        text = "".join(self._chunks)
        self._field = None
        self._chunks = []
        if not text:
            # An empty element leaves the attribute untouched.
            return
        record = self.article if self.article is not None else self.inproceedings
        if record is None:
            return
        if field == "author":
            record.add_author(text)
        elif field == "title":
            record.set_title(text)
        elif field == "year":
            record.set_year(text)
        elif field == "journal":
            # Only articles carry a journal.
            if self.article is not None:
                self.article.set_journal(text)
        elif field == "booktitle":
            # Only inproceedings carry a booktitle.
            if self.article is None:
                self.inproceedings.set_booktitle(text)

    def _save_article(self):
        """Insert the current article into the database and commit.

        Errors are printed rather than raised so that one bad record does
        not stop the parse.
        """
        if self.cursor is None:
            print("No open PostgreSQL connection; article not saved")
            return
        try:
            params = (
                self.article.get_key(),
                self.article.get_title(),
                self.article.get_journal(),
                self.article.get_year(),
            )
            self.cursor.execute(self._INSERT_ARTICLE, params)
            self.connection.commit()
            print("1 Record inserted successfully")
        except Exception as error:
            print(error)
            # A failed statement leaves the transaction aborted; roll back
            # so that the following inserts can run.
            try:
                self.connection.rollback()
            except Exception as rollback_error:
                print(rollback_error)

    # Call when document ends
    def endDocument(self):
        """Print the record counts and close the database connection."""
        print(f"(end document) article count: {self.article_count}\ninproceedings count: {self.inproceedings_count}")
        self._close("endDocument")

    # Call when character data is read
    def characters(self, content):
        """Buffer character data belonging to the field being captured.

        The parser may deliver the text of one element in several calls, so
        chunks are accumulated here and joined when the field's end tag is
        seen.
        """
        if self._field is not None:
            self._chunks.append(content)

    def closeConnection(self):
        """Close the database connection, if there is one."""
        self._close("closeConnection")
        
            
if __name__ == "__main__":
    # Create an XMLReader.
    parser = xml.sax.make_parser()
    # Turn off namespace processing.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    # Override the default ContentHandler with ours.
    Handler = TagHandler()
    parser.setContentHandler(Handler)
    try:
        parser.parse("dblp-small.xml")
    except BaseException:
        # The handler closes its database connection in endDocument(); when
        # parsing is aborted that may not have run, so close it here before
        # propagating the error.
        Handler.closeConnection()
        raise

PostgreSQL server information
{'user': 'dblpuser', 'channel_binding': 'prefer', 'dbname': 'dblp', 'host': 'localhost', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 

You are connected to -  ('PostgreSQL 14.1 on x86_64-apple-darwin20.6.0, compiled by Apple clang version 12.0.0 (clang-1200.0.32.29), 64-bit',) 

duplicate key value violates unique constraint "Article_pkey"
DETAIL:  Key (pubkey)=(dblpnote/error) already exists.

(end document) article count: 1
inproceedings count: 0
PostgreSQL connection is closed in endDocument
