In [1]:
import requests
import json
import html.parser

In [2]:
class Tag:
    def __init__(self, _type, content, start_pos, end_pos, attrs):
        self.type = _type
        self.content = content
        self.content_pos = start_pos
        self.attrs = attrs if attrs else []
        
    def __str__(self):
        return "Tag:\t{}\nContent:\t{}\nAttrs:\t{}\nPos:\t{}".format(
            self.type, self.content, ", ".join(self.attrs), self.content_pos)

In [3]:
req = requests.get("https://poodle.computing.dcu.ie/")


In [4]:
req.text

'\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n  <!--\n    Modified from the Debian original for Ubuntu\n    Last updated: 2014-03-19\n    See: https://launchpad.net/bugs/1288690\n  -->\n  <head>\n    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n    <title>Poodle ... a moodle for programming</title>\n\n  <body>\n<h1>Hi, welcome!</h1>\n<p>This web page is so nice that it welcomes you!</p>\n<p>Here you will find:</p>\n<ol>\n<li><a href="/moodle/course/view.php?id=4">ca268: Computer Programming 3</a></li>\n<li><a href="/moodle/course/view.php?id=5">ca318: Advanced Algorithms and AI search</a></li>\n\n</ol>\n  </body>\n</html>\n\n'

In [31]:
class MyHTMLParser(html.parser.HTMLParser):
    def __init__(self):
        super().__init__()
        self.stack = []
        self.tags = []
        
    def handle_starttag(self, tag, attrs):
        print("START:\t", tag)
        xy = self.getpos()
        start = (xy[0] - 1, xy[1] + len(tag) + 2) # Fix index (breaks tags like <meta ... />)
        
        self.stack.append((tag, start))
#         print("Start tag:", tag)
#         for attr in attrs:
#             print("     attr:", attr)
            
    def handle_endtag(self, tag):
        print("END:\t", tag)
        
        xy = self.getpos()
        end = (xy[0]-1, xy[1]) # Fix index
        
        while self.stack[-1][0] == "data":
            t_cont = self.stack.pop()
            
        self.tags.append(("/".join(t[0] for t in self.stack), self.stack[-1][1], end))
        self.stack.pop()
        
    def handle_startendtag(self, tag, attrs):
        pass
    
    def handle_data(self, data):
        xy = self.getpos()
        start = (xy[0] - 1, xy[1]) # Fix index
        d_split = data.split("\n")
        end = (start[0]+len(d_split)-1, len(d_split[-1]))
        
        self.stack.append(("data", start, end))
        
        

In [32]:
ps = MyHTMLParser()

In [33]:
ps.reset()
ps.feed(req.text)
ps.tags

START:	 html
START:	 head
START:	 title
END:	 title
START:	 body
START:	 h1
END:	 h1
START:	 p
END:	 p
START:	 p
END:	 p
START:	 ol
START:	 li
START:	 a
END:	 a
END:	 li
START:	 li
START:	 a
END:	 a
END:	 li
END:	 ol
END:	 body
END:	 html


[('data/data/html/data/data/head/data/data/title', (10, 11), (10, 46)),
 ('data/data/html/data/data/head/data/data/data/body/data/h1',
  (13, 4),
  (13, 16)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/p',
  (14, 3),
  (14, 49)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/data/p',
  (15, 3),
  (15, 22)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/data/data/ol/data/li/a',
  (17, 7),
  (17, 72)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/data/data/ol/data/li',
  (17, 4),
  (17, 76)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/data/data/ol/data/data/li/a',
  (18, 7),
  (18, 83)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/data/data/ol/data/data/li',
  (18, 4),
  (18, 87)),
 ('data/data/html/data/data/head/data/data/data/body/data/data/data/data/ol',
  (16, 4),
  (20, 0)),
 ('data/data/html/data/data/head/data/data/data/body', (12, 8), (21, 2)),
 ('data/data/html/data/da

In [8]:
def get_content(text, start_cr, end_cr):
    lines = text.split("\n")
    content = []
    
    if start_cr[0] == end_cr[0]:
        return lines[start_cr[0]][start_cr[1]:end_cr[1]]
    
    content.append(lines[start_cr[0]][start_cr[1]:])
    content += lines[start_cr[0] + 1:end_cr[0]]
    content.append(lines[end_cr[0]][:end_cr[1]])
    return "\n".join(content)

In [17]:
tags = []
for tag in ps.tags:
    _type = tag[0].split("/")[-1]
    
    t_cont = get_content(req.text, tag[1], tag[2])
    
    if tag[1][0] == tag[2][0] and tag[1][1] > tag[2][1]:
        start_pos = tag[2]
    else:
        start_pos = tag[1]
    
    end_pos = tag[2]
    
    
    tags.append(Tag(_type, t_cont, start_pos, end_pos, []))



In [18]:
print(*tags, sep="\n")

Tag:	data
Content:	Poodle ... a moodle for programming
Attrs:	
Pos:	(10, 11)
Tag:	data
Content:	Hi, welcome!
Attrs:	
Pos:	(13, 4)
Tag:	data
Content:	This web page is so nice that it welcomes you!
Attrs:	
Pos:	(14, 3)
Tag:	data
Content:	Here you will find:
Attrs:	
Pos:	(15, 3)
Tag:	data
Content:	ca268: Computer Programming 3
Attrs:	
Pos:	(17, 43)
Tag:	a
Content:	href="/moodle/course/view.php?id=4">ca268: Computer Programming 3</a>
Attrs:	
Pos:	(17, 7)
Tag:	data
Content:	ca318: Advanced Algorithms and AI search
Attrs:	
Pos:	(18, 43)
Tag:	a
Content:	href="/moodle/course/view.php?id=5">ca318: Advanced Algorithms and AI search</a>
Attrs:	
Pos:	(18, 7)
Tag:	data
Content:	


Attrs:	
Pos:	(18, 92)
Tag:	data
Content:	
  
Attrs:	
Pos:	(20, 5)
Tag:	data
Content:	

Attrs:	
Pos:	(21, 9)
