### XML handling with the ElementTree package
#####  https://www.datacamp.com/community/tutorials/python-xml-elementtree

In [1]:
import xml.etree.ElementTree as ET
import json
import os

In [2]:
file = "c:\\Users\\BalazsBalogh\\Downloads\\movies.xml" 

In [3]:
# XML documents have sections, called elements, defined by a beginning and an ending tag.
# Elements can contain markup, including other elements, which are called "child elements". 
# The largest, top-level element is called the root, which contains all other elements.

# Parse the XML file into an ElementTree and get its root element.

tree = ET.parse(file)
root = tree.getroot()

print(root.tag) # This is the highest level.
print(root.attrib) # It doesn't have any attributes, as you can see one cell below.

collection
{}


In [4]:
# Get back the whole element, to inspect.

# "collection" is the root element; it contains other elements, like "genre" or "decade".
# Each "movie" element has attributes like "favorite" and "title".

print(ET.tostring(root, encoding='utf8').decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<collection>
    <genre category="Action">
        <decade years="1980s">
            <movie favorite="True" title="Indiana Jones: The raiders of the lost Ark">
                <format multiple="No">DVD</format>
                <year>1981</year>
                <rating>PG</rating>
                <description>
                'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
                </description>
            </movie>
               <movie favorite="True" title="THE KARATE KID">
               <format multiple="Yes">DVD,Online</format>
               <year>1984</year>
               <rating>PG</rating>
               <description>None provided.</description>
            </movie>
            <movie favorite="False" title="Back 2 the Future">
               <format multiple="False">Blu-ray</format>
               <year>1985</year>
  

In [5]:
# It's easy to iterate over subelements with a for loop.

for child in root:
    print("tag name:", child.tag, "||| attribute:", child.attrib)

tag name: genre ||| attribute: {'category': 'Action'}
tag name: genre ||| attribute: {'category': 'Thriller'}
tag name: genre ||| attribute: {'category': 'Comedy'}


In [6]:
# Get all the elements in the entire tree.

[elem.tag for elem in root.iter()]

['collection',
 'genre',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'genre',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'genre',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description',
 'decade',
 'movie',
 'format',
 'year',
 'rating',
 'description']

In [7]:
# We can iterate over any element, just specify it in iter().

for movie in root.iter('movie'):
    print(movie.attrib)

{'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
{'favorite': 'True', 'title': 'THE KARATE KID'}
{'favorite': 'False', 'title': 'Back 2 the Future'}
{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'True', 'title': 'Batman Returns'}
{'favorite': 'False', 'title': 'Reservoir Dogs'}
{'favorite': 'False', 'title': 'ALIEN'}
{'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
{'favorite': 'FALSE', 'title': 'American Psycho'}
{'favorite': 'False', 'title': 'Batman: The Movie'}
{'favorite': 'True', 'title': 'Easy A'}
{'favorite': 'True', 'title': 'Dinner for SCHMUCKS'}
{'favorite': 'False', 'title': 'Ghostbusters'}
{'favorite': 'True', 'title': 'Robin Hood: Prince of Thieves'}


In [8]:
# An element's attributes are stored in a dict (attrib) that maps attribute names to values.

for decade in root.iter('decade'):
    print(decade.attrib)
    print(decade.attrib.keys())
    print(decade.attrib.values())
    print(type(decade.attrib))
    print('\n')

{'years': '1980s'}
dict_keys(['years'])
dict_values(['1980s'])
<class 'dict'>


{'years': '1990s'}
dict_keys(['years'])
dict_values(['1990s'])
<class 'dict'>


{'years': '1970s'}
dict_keys(['years'])
dict_values(['1970s'])
<class 'dict'>


{'years': '1980s'}
dict_keys(['years'])
dict_values(['1980s'])
<class 'dict'>


{'years': '1960s'}
dict_keys(['years'])
dict_values(['1960s'])
<class 'dict'>


{'years': '2010s'}
dict_keys(['years'])
dict_values(['2010s'])
<class 'dict'>


{'years': '1980s'}
dict_keys(['years'])
dict_values(['1980s'])
<class 'dict'>


{'years': '1990s'}
dict_keys(['years'])
dict_values(['1990s'])
<class 'dict'>




In [9]:
for formats in root.iter('format'):
    print(formats.attrib)

{'multiple': 'No'}
{'multiple': 'Yes'}
{'multiple': 'False'}
{'multiple': 'Yes'}
{'multiple': 'No'}
{'multiple': 'No'}
{'multiple': 'Yes'}
{'multiple': 'No'}
{'multiple': 'No'}
{'multiple': 'Yes'}
{'multiple': 'No'}
{'multiple': 'Yes'}
{'multiple': 'No'}
{'multiple': 'No'}


In [10]:
# If we need the content of an element, use the .text attribute.

for formats in root.iter('format'):
    print(formats.text)

DVD
DVD,Online
Blu-ray
dvd, digital
VHS
Online
DVD
DVD
blue-ray
DVD,VHS
DVD
DVD,digital,Netflix
Online,VHS
Blu_Ray


In [11]:
# Another example of the .text attribute.

for description in root.iter('description'):
    print(description.text)


                'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
                
None provided.
Marty McFly
Two mutants come to a private academy for their kind whose resident superhero team must 
               oppose a terrorist organization with similar powers.
NA.
WhAtEvER I Want!!!?!
"""""""""
Funny movie about a funny guy
psychopathic Bateman
What a joke!
Emma Stone = Hester Prynne
Tim (Rudd) is a rising executive
                 who succeeds in finding the perfect guest.
				 
Who ya gonna call?
Robin Hood slaying


In [12]:
# Example of the findall method. A bare tag name matches only direct children of the referenced element; a path like the ones below walks down one level per step.

for genre in root.findall("./genre"):
    print(genre.attrib)
    
print('\n')

for decade in root.findall("./genre/decade"):
    print(decade.attrib)

print('\n')

for movie in root.findall("./genre/decade/movie"):
    print(movie.attrib)

{'category': 'Action'}
{'category': 'Thriller'}
{'category': 'Comedy'}


{'years': '1980s'}
{'years': '1990s'}
{'years': '1970s'}
{'years': '1980s'}
{'years': '1960s'}
{'years': '2010s'}
{'years': '1980s'}
{'years': '1990s'}


{'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
{'favorite': 'True', 'title': 'THE KARATE KID'}
{'favorite': 'False', 'title': 'Back 2 the Future'}
{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'True', 'title': 'Batman Returns'}
{'favorite': 'False', 'title': 'Reservoir Dogs'}
{'favorite': 'False', 'title': 'ALIEN'}
{'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
{'favorite': 'FALSE', 'title': 'American Psycho'}
{'favorite': 'False', 'title': 'Batman: The Movie'}
{'favorite': 'True', 'title': 'Easy A'}
{'favorite': 'True', 'title': 'Dinner for SCHMUCKS'}
{'favorite': 'False', 'title': 'Ghostbusters'}
{'favorite': 'True', 'title': 'Robin Hood: Prince of Thieves'}


In [13]:
# Here's another example: look for the movies that came out in 1992.
# The predicate is attached directly to the "movie" step (movie[year='1992']), which is the
# standard XPath form; it selects every movie that has a <year> child whose text is '1992'.

for movie in root.findall("./genre/decade/movie[year='1992']"):
    print(movie.attrib['title'])

Batman Returns
Reservoir Dogs


In [28]:
# Select the "format" elements whose "multiple" attribute is 'Yes'.
# The matches are <format> elements, not movies, so the loop variable is named accordingly.

for fmt in root.findall("./genre/decade/movie/format[@multiple='Yes']"):
    print(fmt.attrib)

{'multiple': 'Yes'}
{'multiple': 'Yes'}
{'multiple': 'Yes'}
{'multiple': 'Yes'}
{'multiple': 'Yes'}


In [15]:
# Use '..' as a path step to return the parent element of the current element.
# Here it yields each movie that has a format with multiple='Yes'.

for movie in root.findall("./genre/decade/movie/format[@multiple='Yes']/.."):
    print(movie.attrib)
    print(movie.attrib['title'])
    print('\n')

{'favorite': 'True', 'title': 'THE KARATE KID'}
THE KARATE KID


{'favorite': 'False', 'title': 'X-Men'}
X-Men


{'favorite': 'False', 'title': 'ALIEN'}
ALIEN


{'favorite': 'False', 'title': 'Batman: The Movie'}
Batman: The Movie


{'favorite': 'True', 'title': 'Dinner for SCHMUCKS'}
Dinner for SCHMUCKS




In [16]:
# Modifying an XML. Let's look at the titles again. We could fix some of them.

for movie in root.iter('movie'):
    print(movie.attrib)

{'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
{'favorite': 'True', 'title': 'THE KARATE KID'}
{'favorite': 'False', 'title': 'Back 2 the Future'}
{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'True', 'title': 'Batman Returns'}
{'favorite': 'False', 'title': 'Reservoir Dogs'}
{'favorite': 'False', 'title': 'ALIEN'}
{'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
{'favorite': 'FALSE', 'title': 'American Psycho'}
{'favorite': 'False', 'title': 'Batman: The Movie'}
{'favorite': 'True', 'title': 'Easy A'}
{'favorite': 'True', 'title': 'Dinner for SCHMUCKS'}
{'favorite': 'False', 'title': 'Ghostbusters'}
{'favorite': 'True', 'title': 'Robin Hood: Prince of Thieves'}


In [17]:
# Let's find "Back to the future", which is spelled wrong here. Store it in b2tf.
# find() returns the first matching element, or None if nothing matches, so the
# attribute access below assumes the misspelled title is present in the file.

b2tf = root.find("./genre/decade/movie[@title='Back 2 the Future']")
print(b2tf)
print(b2tf.attrib['title'])

<Element 'movie' at 0x000002DBD9F75408>
Back 2 the Future


In [18]:
# Restore the original title by setting the "title" attribute on the element.

b2tf.set("title", "Back to the Future")
print(b2tf.attrib)

{'favorite': 'False', 'title': 'Back to the Future'}


In [19]:
# Write the modified tree to a separate file, then re-parse that file and list the
# movie attributes again to confirm that the corrected title was saved.
corrected_file = "c:\\Users\\BalazsBalogh\\Downloads\\movies_corrected.xml"

tree.write(corrected_file)

# Rebind tree and root to the freshly written file; later cells work on this copy.
tree = ET.parse(corrected_file)
root = tree.getroot()

for movie in root.iter('movie'):
    print(movie.attrib)

{'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
{'favorite': 'True', 'title': 'THE KARATE KID'}
{'favorite': 'False', 'title': 'Back to the Future'}
{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'True', 'title': 'Batman Returns'}
{'favorite': 'False', 'title': 'Reservoir Dogs'}
{'favorite': 'False', 'title': 'ALIEN'}
{'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
{'favorite': 'FALSE', 'title': 'American Psycho'}
{'favorite': 'False', 'title': 'Batman: The Movie'}
{'favorite': 'True', 'title': 'Easy A'}
{'favorite': 'True', 'title': 'Dinner for SCHMUCKS'}
{'favorite': 'False', 'title': 'Ghostbusters'}
{'favorite': 'True', 'title': 'Robin Hood: Prince of Thieves'}


In [20]:
# The "format" element's "multiple" attribute has some values to correct.

for form in root.findall("./genre/decade/movie/format"):
    print(form.attrib, form.text)

{'multiple': 'No'} DVD
{'multiple': 'Yes'} DVD,Online
{'multiple': 'False'} Blu-ray
{'multiple': 'Yes'} dvd, digital
{'multiple': 'No'} VHS
{'multiple': 'No'} Online
{'multiple': 'Yes'} DVD
{'multiple': 'No'} DVD
{'multiple': 'No'} blue-ray
{'multiple': 'Yes'} DVD,VHS
{'multiple': 'No'} DVD
{'multiple': 'Yes'} DVD,digital,Netflix
{'multiple': 'No'} Online,VHS
{'multiple': 'No'} Blu_Ray


In [21]:
# To make it simple and consistent, set 'multiple' to 'Yes' where the text lists several formats (e.g.: DVD,digital,Netflix) and to 'No' otherwise.

import re

for form in root.findall("./genre/decade/movie/format"):
    # A comma in the text means several formats are listed. An element without
    # any text has form.text == None, which is treated here as "no comma".
    has_several = ',' in (form.text or '')
    form.set('multiple', 'Yes' if has_several else 'No')
        
# Write out the newly improved file.    
tree.write(corrected_file)

tree = ET.parse(corrected_file)
root = tree.getroot()

for form in root.findall("./genre/decade/movie/format"):
    print(form.attrib, form.text)

{'multiple': 'No'} DVD
{'multiple': 'Yes'} DVD,Online
{'multiple': 'No'} Blu-ray
{'multiple': 'Yes'} dvd, digital
{'multiple': 'No'} VHS
{'multiple': 'No'} Online
{'multiple': 'No'} DVD
{'multiple': 'No'} DVD
{'multiple': 'No'} blue-ray
{'multiple': 'Yes'} DVD,VHS
{'multiple': 'No'} DVD
{'multiple': 'Yes'} DVD,digital,Netflix
{'multiple': 'Yes'} Online,VHS
{'multiple': 'No'} Blu_Ray


In [22]:
# Moving elements: Some of the data has been placed in the wrong decade.
# We can see that in the '1990s' and '1980s' decades there are two movies that don't belong there. Both of them
# are from the year 2000.

for decade in root.findall('./genre/decade'):
    print(decade.attrib)
    
    for year in decade.findall('./movie/year'):
        print(year.text)
        
    print('\n')

{'years': '1980s'}
1981
1984
1985


{'years': '1990s'}
2000
1992
1992


{'years': '1970s'}
1979


{'years': '1980s'}
1986
2000


{'years': '1960s'}
1966


{'years': '2010s'}
2010
2011


{'years': '1980s'}
1984


{'years': '1990s'}
1991




In [23]:
# These are the movies. We only correct X-Men in this tutorial.
# The predicate is written directly on the "movie" step, which is the standard XPath form.

for movie in root.findall("./genre/decade/movie[year='2000']"):
    print(movie.attrib)

{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'FALSE', 'title': 'American Psycho'}


In [24]:
# Add a new '2000s' decade to the 'Action' genre, so there is a place to move the X-Men movie to.

action = root.find("./genre[@category='Action']")
new_dec = ET.SubElement(action, 'decade', years='2000s')

# Serialize only the 'Action' genre to inspect the result.
action_xml = ET.tostring(action, encoding='utf8')
print(action_xml.decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<genre category="Action">
        <decade years="1980s">
            <movie favorite="True" title="Indiana Jones: The raiders of the lost Ark">
                <format multiple="No">DVD</format>
                <year>1981</year>
                <rating>PG</rating>
                <description>
                'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
                </description>
            </movie>
               <movie favorite="True" title="THE KARATE KID">
               <format multiple="Yes">DVD,Online</format>
               <year>1984</year>
               <rating>PG</rating>
               <description>None provided.</description>
            </movie>
            <movie favorite="False" title="Back to the Future">
               <format multiple="No">Blu-ray</format>
               <year>1985</year>
               <ratin

In [25]:
# Move the X-Men movie from the 1990s decade to the 2000s decade of the 'Action' genre.
# All three elements are looked up and checked before the tree is touched, and the movie
# is detached from the old decade before it is attached to the new one, so a failed lookup
# or removal cannot leave the movie listed under two decades (or under none).

xmen = root.find("./genre/decade/movie[@title='X-Men']")
dec1990s = root.find("./genre[@category='Action']/decade[@years='1990s']")
dec2000s = root.find("./genre[@category='Action']/decade[@years='2000s']")

if xmen is None or dec1990s is None or dec2000s is None:
    raise LookupError("X-Men movie or one of the 'Action' decades was not found")

dec1990s.remove(xmen)
dec2000s.append(xmen)

print(ET.tostring(action, encoding='utf8').decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<genre category="Action">
        <decade years="1980s">
            <movie favorite="True" title="Indiana Jones: The raiders of the lost Ark">
                <format multiple="No">DVD</format>
                <year>1981</year>
                <rating>PG</rating>
                <description>
                'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
                </description>
            </movie>
               <movie favorite="True" title="THE KARATE KID">
               <format multiple="Yes">DVD,Online</format>
               <year>1984</year>
               <rating>PG</rating>
               <description>None provided.</description>
            </movie>
            <movie favorite="False" title="Back to the Future">
               <format multiple="No">Blu-ray</format>
               <year>1985</year>
               <ratin

In [26]:
# Save the tree back to the corrected XML file, then re-parse that file and print
# the whole document to check the result.

tree.write(corrected_file)

# Rebind tree and root to the content that was just written.
tree = ET.parse(corrected_file)
root = tree.getroot()

print(ET.tostring(root, encoding='utf8').decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<collection>
    <genre category="Action">
        <decade years="1980s">
            <movie favorite="True" title="Indiana Jones: The raiders of the lost Ark">
                <format multiple="No">DVD</format>
                <year>1981</year>
                <rating>PG</rating>
                <description>
                'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
                </description>
            </movie>
               <movie favorite="True" title="THE KARATE KID">
               <format multiple="Yes">DVD,Online</format>
               <year>1984</year>
               <rating>PG</rating>
               <description>None provided.</description>
            </movie>
            <movie favorite="False" title="Back to the Future">
               <format multiple="No">Blu-ray</format>
               <year>1985</year>
    