# JSON parsing

In [4]:
import json
import urllib.error
import urllib.request # instead of urllib2 like in Python 2.7

In [5]:
def _formatMagnitude(mag):
  """Return the magnitude formatted as "%2.1f", or "n/a" when it is None."""
  return "n/a" if mag is None else "%2.1f" % mag


def printResults(data):
  """Print a text summary of a GeoJSON earthquake feed.

  Args:
    data: JSON text (str or bytes) holding a top-level "metadata" object
      (with "count" and optionally "title") and a "features" list whose
      items carry a "properties" object with "place", "mag" and "felt".

  Prints, in order: the feed title (if present), the event count, the place
  of every event, the events with magnitude 4.0 or greater, and the events
  that at least one person reported feeling.

  Raises:
    json.JSONDecodeError: if data is not valid JSON.
    KeyError: if "metadata", "count", "features", "properties" or "place"
      is missing.
  """
  # Use the json module to load the string data into a dictionary
  theJSON = json.loads(data)
  metadata = theJSON["metadata"]
  features = theJSON["features"]

  # now we can access the contents of the JSON like any other Python object
  if "title" in metadata:
    print(metadata["title"])

  # output the number of events
  print(str(metadata["count"]) + " events recorded")

  # for each event, print the place where it occurred
  for event in features:
    print(event["properties"]["place"])
  print("--------------\n")

  # print only the events with a magnitude of 4.0 or greater
  for event in features:
    props = event["properties"]
    mag = props.get("mag")
    # mag may be null in the feed; comparing None with a float raises
    # TypeError, so such events are skipped here.
    if mag is not None and mag >= 4.0:
      print("%2.1f" % mag, props["place"])
  print("--------------\n")

  # print only the events where at least 1 person reported feeling something
  print("\n\nEvents that were felt:")
  for event in features:
    props = event["properties"]
    feltReports = props.get("felt")
    if feltReports is not None and feltReports > 0:
      # A felt event is still listed when its magnitude is unknown.
      print(_formatMagnitude(props.get("mag")), props["place"], " reported " + str(feltReports) + " times")


In [6]:
# define a variable to hold the source URL
# In this case we'll use the free data feed from the USGS
# This feed lists all earthquakes for the last day larger than Mag 2.5
urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"

# Open the URL and read the data.
# The response is used as a context manager so the connection is always
# closed, even if reading or printing fails part-way through.
# urlopen() raises HTTPError for error statuses (4xx/5xx) instead of handing
# back a response object, so those are reported from the except branch.
try:
    with urllib.request.urlopen(urlData) as webUrl:
        status = webUrl.getcode()
        print ("result code: " + str(status))
        if status == 200:
            data = webUrl.read()
            # print out our customized results
            printResults(data)
        else:
            print ("Received an error from server, cannot retrieve results " + str(status))
except urllib.error.HTTPError as err:
    print ("Received an error from server, cannot retrieve results " + str(err.code))


result code: 200
USGS Magnitude 2.5+ Earthquakes, Past Day
30 events recorded
74 km N of Brenas, Puerto Rico
10 km NW of Stanley, Idaho
65 km SW of Akhiok, Alaska
6 km ESE of La Parguera, Puerto Rico
3 km SE of La Parguera, Puerto Rico
77 km E of Pangai, Tonga
0 km SW of Guánica, Puerto Rico
13 km SSW of Guánica, Puerto Rico
8 km N of Lambayong, Philippines
63 km E of Sungai Penuh, Indonesia
61 km ENE of Y, Alaska
14 km WSW of Point Possession, Alaska
3 km SW of Guánica, Puerto Rico
Southwest Indian Ridge
291 km NE of Lospalos, Timor Leste
26 km E of Honaunau-Napoopoo, Hawaii
173 km NNW of Sola, Vanuatu
6km SSE of Pedley, CA
11 km SE of Larsen Bay, Alaska
28 km WNW of Kingston, Nevada
123 km S of McCarthy, Alaska
26 km ESE of North Vanlaiphai, India
63 km N of Hihifo, Tonga
26 km SW of Goldfield, Nevada
29 km SSE of Mina, Nevada
10 km SSE of Nov, Tajikistan
5 km SW of Guánica, Puerto Rico
138 km ENE of Kuril’sk, Russia
18 km NNE of Hualien City, Taiwan
37 km SSE of Champerico, Guatemal

# Reading HTML data from a website

In [7]:
# open a connection to a URL using urllib.request (the Python 3 successor
# to urllib2); the with-block closes the connection once we are done
with urllib.request.urlopen("http://www.google.com") as webUrl:
    # get the result code and print it
    print ("result code: " + str(webUrl.getcode()))

    # read the data from the URL (read() returns bytes, hence the b'...' output)
    data = webUrl.read()

# print the raw page contents
print (data)

result code: 200
b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en-IN"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="hFp6bfl/5/g7d6Lp+vAlqA==">(function(){window.google={kEI:\'eOEHX8u1Odif4-EP_5qQsAk\',kEXPI:\'0,202123,3,4,32,1151584,5663,730,224,5104,207,3204,10,1226,364,926,573,817,383,246,5,1354,406,242,195,485,5,361,2184,234,80,105,97,20,3,324,46,210,470,43,48,193,251,72,1121692,1197709,440,329118,1294,12383,4855,32691,15248,867,28684,9188,8384,4858,1362,9291,3026,3891,850,11033,1808,4020,978,4784,1,3146,5297,2054,920,873,1217,8989,1,415,1141,7512,2653,3221,236,4282,2777,919,2277,8,4389,1279,2212,241,291,147,1103,840,518,1521,4258,312,1132,4,3,2063,606,1839,184,1920,377,1947,2229,93,328,1285,16,2926,2247,473,1339,1787,3227,1989,856,7,4773,1295,6286,4454,642,2449,2459,1226,1406,336,3655,1274,108,3

In [11]:
# import the HTMLParser module
# in Python 3 you need to import from html.parser
from html.parser import HTMLParser

# Running total of <meta> start tags seen by MyHTMLParser. It lives at module
# level because the cell that runs the parser prints it afterwards.
metacount = 0


class MyHTMLParser(HTMLParser):
  """HTMLParser subclass that prints every parsing event it receives.

  Each handler prints what was encountered followed by the parser position
  reported by getpos(). Start tags named "meta" also increment the
  module-level metacount counter.
  """

  def _report_position(self):
    """Print the current (line, offset) pair reported by getpos()."""
    line, offset = self.getpos()
    print ("\tAt line: ", line, " position ", offset)

  def handle_starttag(self, tag, attrs):
    """Report an opening tag and its attributes; count <meta> tags.

    attrs is iterated as (name, value) pairs.
    """
    global metacount
    if tag == "meta":
      metacount += 1

    print ("Encountered a start tag:", tag)
    self._report_position()

    # an empty attribute list prints no "Attributes:" header
    if attrs:
      print ("\tAttributes:")
      for name, value in attrs:
        print ("\t", name, "=", value)

  def handle_endtag(self, tag):
    """Report a closing tag."""
    print ("Encountered an end tag:", tag)
    self._report_position()

  def handle_data(self, data):
    """Report text content, ignoring runs made only of whitespace."""
    if not data.isspace():
      print ("Encountered some text data:", data)
      self._report_position()

  def handle_comment(self, data):
    """Report an HTML comment."""
    print ("Encountered comment:", data)
    self._report_position()


In [12]:
# instantiate the parser and feed it some HTML
parser = MyHTMLParser()

# start the count from zero so that re-running this cell reports the number
# of meta tags in this file rather than a running total across runs
metacount = 0

# open the sample HTML file and read it; the with-block guarantees the file
# is closed, and the encoding is stated explicitly (the sample declares
# charset utf-8) instead of depending on the platform default
with open("samplehtml.html", encoding="utf-8") as f:
    contents = f.read() # read the entire file

parser.feed(contents)

print ("%d meta tags encountered" % metacount)


Encountered a start tag: html
	At line:  2  position  0
	Attributes:
	 lang = en
Encountered a start tag: head
	At line:  3  position  2
Encountered a start tag: meta
	At line:  4  position  4
	Attributes:
	 charset = utf-8
Encountered an end tag: meta
	At line:  4  position  4
Encountered a start tag: title
	At line:  5  position  4
Encountered some text data: Sample HTML Document
	At line:  5  position  11
Encountered an end tag: title
	At line:  5  position  31
Encountered a start tag: meta
	At line:  6  position  4
	Attributes:
	 name = description
	 content = This is a sample HTML file
Encountered an end tag: meta
	At line:  6  position  4
Encountered a start tag: meta
	At line:  7  position  4
	Attributes:
	 name = author
	 content = Administrator
Encountered an end tag: meta
	At line:  7  position  4
Encountered a start tag: meta
	At line:  8  position  4
	Attributes:
	 name = viewport
	 content = width=device-width; initial-scale=1.0
Encountered an end tag: meta
	At line:  8  p

# XML Parsing

In [15]:
import xml.dom.minidom

In [16]:
# use the parse() function to load and parse an XML file
doc = xml.dom.minidom.parse("samplexml.xml")

# documentElement is always the root element; firstChild could instead be a
# comment, doctype or processing instruction placed before the root, which
# has no tagName and is the wrong place to append new elements
root = doc.documentElement

# print out the document node and the name of the root tag
print (doc.nodeName)
print (root.tagName)


def printSkills(document):
    """Print the count and the "name" attribute of every <skill> element.

    Returns the list of <skill> elements that was printed.
    """
    found = document.getElementsByTagName("skill")
    print ("%d skills:" % found.length)
    for skill in found:
        print (skill.getAttribute("name"))
    return found


# get a list of XML tags from the document and print each one
skills = printSkills(doc)

# create a new XML tag and add it into the document
newSkill = doc.createElement("skill")
newSkill.setAttribute("name", "jQuery")
root.appendChild(newSkill)

# list the skills again to show the newly added element
skills = printSkills(doc)


#document
person
4 skills:
JavaScript
Python
C#
HTML
5 skills:
JavaScript
Python
C#
HTML
jQuery
