# This is a practice notebook following the course below:
https://kbnlresearch.github.io/xml-workshop/index.html

In [1]:
import os
import json
# Directory that holds the workshop's sample files (example.xml is read from here).
data = os.path.join(
    os.getcwd(),          # directory the notebook is run from
    'xml-workshop-main',  # presumably the unpacked course repository -- verify locally
    'data',
)

## Working with Beautiful Soup

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Open the file in binary mode: the parser then works out the encoding from the
# document itself instead of relying on the platform's default text encoding
# (which is not UTF-8 everywhere). The path is built with os.path.join, the same
# way the `data` directory itself is built.
with open(os.path.join(data, "example.xml"), "rb") as f:
    root = BeautifulSoup(f, 'xml')

In [4]:
# Print the whole parsed document to inspect the structure of the file
print(root)

<?xml version="1.0" encoding="utf-8"?>
<catalog>
<book id="bk101">
<author>
<name>Matthew</name>
<surname>Gambardella</surname>
</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications 
      with XML.</description>
</book>
<book id="bk102">
<author>
<name>Kim</name>
<surname>Ralls</surname>
</author>
<title>Midnight Rain</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.</description>
</book>
<book id="bk103">
<author>
<name>Eva</name>
<surname>Corets</surname>
</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology 
      society in England, the young surviv

In [5]:
# Walk over every <book> tag and print the text of its <title> child
for book in root.find_all('book'):
    print(book.find('title').text)

XML Developer's Guide
Midnight Rain
Maeve Ascendant
Oberon's Legacy
The Sundered Grail
Lover Birds
Splish Splash
Creepy Crawlies
Paradox Lost
Microsoft .NET: The Programming Bible
MSXML3: A Comprehensive Guide
Visual Studio 7: A Comprehensive Guide


In [6]:
# Walk over every <book> tag and print the text of its <description> child
for book in root.find_all('book'):
    description = book.find('description').text
    print(description)

An in-depth look at creating applications 
      with XML.
A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.
After the collapse of a nanotechnology 
      society in England, the young survivors lay the 
      foundation for a new society.
In post-apocalypse England, the mysterious 
      agent known only as Oberon helps to create a new life 
      for the inhabitants of London. Sequel to Maeve 
      Ascendant.
The two daughters of Maeve, half-sisters, 
      battle one another for control of England. Sequel to 
      Oberon's Legacy.
When Carla meets Paul at an ornithology 
      conference, tempers fly as feathers get ruffled.
A deep sea diver finds true love twenty 
      thousand leagues beneath the sea.
An anthology of horror stories about roaches,
      centipedes, scorpions  and other insects.
After an inadvertant trip through a Heisenberg
      Uncertainty Device, James Salway discovers the proble

In [7]:
# Print first name and surname of every <author> tag in the document
for author in root.find_all('author'):
    first_name = author.find('name').text
    last_name = author.find('surname').text
    print(first_name, last_name)

Matthew Gambardella
Kim Ralls
Eva Corets
Eva Corets
Eva Corets
Cynthia Randall
Paula Thurman
Stefan Knorr
Peter Kress
Tim O'Brien
Tim O'Brien
Mike Galos


In [8]:
# Attributes are read the same way as in HTML, e.g. <book id="bk101"> ... </book>
for book in root.find_all('book'):
    print(book.get('id'))

bk101
bk102
bk103
bk104
bk105
bk106
bk107
bk108
bk109
bk110
bk111
bk112


In [9]:
# Print one line per book: id, title, description and author name(s).
for book in root.find_all('book'):
    identifier = book.get('id')
    title = book.find('title').text
    description = book.find('description').text
    # Build the author list per book so that a book without an <author> cannot
    # silently reuse the previous book's name, and a book with several authors
    # shows all of them instead of only the last one.
    authors = [
        f"{author.find('name').text} {author.find('surname').text}"
        for author in book.find_all('author')
    ]
    print(identifier, title, description, ", ".join(authors))

bk101 XML Developer's Guide An in-depth look at creating applications 
      with XML. Matthew Gambardella
bk102 Midnight Rain A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world. Kim Ralls
bk103 Maeve Ascendant After the collapse of a nanotechnology 
      society in England, the young survivors lay the 
      foundation for a new society. Eva Corets
bk104 Oberon's Legacy In post-apocalypse England, the mysterious 
      agent known only as Oberon helps to create a new life 
      for the inhabitants of London. Sequel to Maeve 
      Ascendant. Eva Corets
bk105 The Sundered Grail The two daughters of Maeve, half-sisters, 
      battle one another for control of England. Sequel to 
      Oberon's Legacy. Eva Corets
bk106 Lover Birds When Carla meets Paul at an ornithology 
      conference, tempers fly as feathers get ruffled. Cynthia Randall
bk107 Splish Splash A deep sea diver finds true love twenty 
      

In [10]:
# Make books into a table: one row per <book> element
import pandas as pd

booklist = []
for book in root.find_all('book'):
    # Collect the authors of *this* book only; previously the name variables
    # leaked from one iteration to the next, so a book without an <author>
    # would have been given the previous book's author.
    authors = [
        f"{author.find('name').text} {author.find('surname').text}"
        for author in book.find_all('author')
    ]
    booklist.append([
        book.get('id'),
        book.find('title').text,
        book.find('description').text,
        ", ".join(authors),  # several authors end up comma-separated in one cell
    ])

books = pd.DataFrame(booklist, columns=["identifier", "title", "description", "name"])
books.head()

Unnamed: 0,identifier,title,description,name
0,bk101,XML Developer's Guide,An in-depth look at creating applications \n ...,Matthew Gambardella
1,bk102,Midnight Rain,"A former architect battles corporate zombies, ...",Kim Ralls
2,bk103,Maeve Ascendant,After the collapse of a nanotechnology \n ...,Eva Corets
3,bk104,Oberon's Legacy,"In post-apocalypse England, the mysterious \n ...",Eva Corets
4,bk105,The Sundered Grail,"The two daughters of Maeve, half-sisters, \n ...",Eva Corets


## Writing a SAX parser
- startElement encounters the item and initialises a dict of the item's keys.
- characters only gathers information.
- In endElement, further processing can be done: keys that do not occur in the XML itself (such as title_length and name_abrev) are given values derived from the keys that were collected.
- At the end, the list of dictionaries is exported as a JSON file.

In [11]:
import xml.sax
import re

class BookHandler(xml.sax.ContentHandler):
    """SAX content handler that builds one summary dict per ``<book>`` element.

    Each finished record is appended to ``self.books`` and has the keys
    ``id_no`` (the ``id`` attribute without its first two characters),
    ``title``, ``title_length`` (number of whitespace-separated words in the
    title), ``name``, ``surname`` and ``name_abrev`` (name and surname with the
    lowercase ASCII letters removed and spaces replaced by dots).

    NOTE(review): text of repeated ``<name>``/``<surname>`` elements inside one
    book is concatenated into the same field, so the handler effectively
    assumes a single author per book.
    """

    # Elements whose text content is copied into the current book record.
    _TEXT_FIELDS = ('title', 'name', 'surname')

    def __init__(self):
        super().__init__()
        self.currentElement = ''  # name of the element currently open, '' if none
        self.currentBook = {}     # record being filled; empty outside <book>
        self.books = []           # finished records, in document order

    def startElement(self, name, attrs):
        """Remember the open element and start a fresh record on ``<book>``."""
        self.currentElement = name
        if name == 'book':
            # A missing id attribute yields an empty id_no instead of a TypeError.
            book_id = attrs.get('id') or ''
            self.currentBook = {'id_no': book_id[2:], 'title': '', 'title_length': 0, 'name': '', 'surname': '', 'name_abrev': ''}

    def characters(self, content):
        """Accumulate raw text for the tracked fields of the current book.

        The parser may deliver the text of one element in several chunks, so
        the chunks are stored unmodified here and only cleaned up once the
        element is closed; stripping each chunk would glue words together.
        """
        field = self.currentElement
        # `field in self.currentBook` is False outside a <book>, which avoids a KeyError.
        if field in self._TEXT_FIELDS and field in self.currentBook:
            self.currentBook[field] += content

    def endElement(self, name):
        """Finalise a text field, or store the record when ``</book>`` is reached."""
        if name in self._TEXT_FIELDS and name in self.currentBook:
            # Trim the ends and collapse internal runs of whitespace (line
            # breaks, indentation) into single spaces.
            self.currentBook[name] = ' '.join(self.currentBook[name].split())
            if name == 'title':
                self.currentBook['title_length'] = len(self.currentBook['title'].split())
            elif name == 'surname':
                self.currentBook['name_abrev'] = re.sub(r'[a-z]', '', self.currentBook['name'] + ' ' + self.currentBook['surname']).replace(' ', '.') + '.'
        elif name == 'book':
            print(f"Recorded book: id number is {self.currentBook['id_no']}, length of title {self.currentBook['title_length']}, and the author is {self.currentBook['name_abrev']}")
            self.books.append(self.currentBook)
            self.currentBook = {}
        # No element is "current" any more; this keeps the whitespace between
        # sibling elements from being appended to the field just closed.
        self.currentElement = ''


In [12]:
# Wire a fresh handler into a SAX parser, then stream the example file through it.
handler = BookHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

xmldata = f"{data}/example.xml"
parser.parse(xmldata)

Recorded book: id number is 101, length of title 3, and the author is M.G.
Recorded book: id number is 102, length of title 2, and the author is K.R.
Recorded book: id number is 103, length of title 2, and the author is E.C.
Recorded book: id number is 104, length of title 2, and the author is E.C.
Recorded book: id number is 105, length of title 3, and the author is E.C.
Recorded book: id number is 106, length of title 2, and the author is C.R.
Recorded book: id number is 107, length of title 2, and the author is P.T.
Recorded book: id number is 108, length of title 2, and the author is S.K.
Recorded book: id number is 109, length of title 2, and the author is P.K.
Recorded book: id number is 110, length of title 5, and the author is T.O'B.
Recorded book: id number is 111, length of title 4, and the author is T.O'B.
Recorded book: id number is 112, length of title 6, and the author is M.G.


In [13]:
handler.books  # A list of plain dicts (one per book); this maps directly onto a JSON array, so it can be converted as-is.

[{'id_no': '101',
  'title': "XML Developer's Guide",
  'title_length': 3,
  'name': 'Matthew',
  'surname': 'Gambardella',
  'name_abrev': 'M.G.'},
 {'id_no': '102',
  'title': 'Midnight Rain',
  'title_length': 2,
  'name': 'Kim',
  'surname': 'Ralls',
  'name_abrev': 'K.R.'},
 {'id_no': '103',
  'title': 'Maeve Ascendant',
  'title_length': 2,
  'name': 'Eva',
  'surname': 'Corets',
  'name_abrev': 'E.C.'},
 {'id_no': '104',
  'title': "Oberon's Legacy",
  'title_length': 2,
  'name': 'Eva',
  'surname': 'Corets',
  'name_abrev': 'E.C.'},
 {'id_no': '105',
  'title': 'The Sundered Grail',
  'title_length': 3,
  'name': 'Eva',
  'surname': 'Corets',
  'name_abrev': 'E.C.'},
 {'id_no': '106',
  'title': 'Lover Birds',
  'title_length': 2,
  'name': 'Cynthia',
  'surname': 'Randall',
  'name_abrev': 'C.R.'},
 {'id_no': '107',
  'title': 'Splish Splash',
  'title_length': 2,
  'name': 'Paula',
  'surname': 'Thurman',
  'name_abrev': 'P.T.'},
 {'id_no': '108',
  'title': 'Creepy Crawlies

In [15]:
# Write the collected book records to disk as indented JSON.
with open('books_json.json', mode='w') as json_file:
    json.dump(handler.books, json_file, indent=4)