In [3]:
from xml.etree import ElementTree as et
import requests

In [4]:
# Download the RSS feed as text.
path = "http://feeds.bbci.co.uk/news/video_and_audio/news_front_page/rss.xml"
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the feed.
response = requests.get(path, timeout=30)
response.raise_for_status()
xml_string = response.text
# Show only the beginning of the document; the full feed is too long to display.
xml_string[:500]



In [5]:
# Parse the downloaded feed text; `root` is the top-level element of the document.
# NOTE(review): xml.etree is not hardened against maliciously constructed XML —
# confirm the feed source is trusted.
root = et.fromstring(xml_string)

In [8]:
# Collect every <item> element found under <channel> into a new list.
items = [node for node in root.findall("channel/item")]

In [10]:
# Keep the first element of `items` as a sample to inspect.
item = items[0]

In [14]:
# Text content of the sample item's <title> child element.
item.find("title").text

"Firm in data scandal 'made election boasts'"

In [15]:
# Text content of the sample item's <description> child element.
item.find("description").text

'The company that became Cambridge Analytica boasted about interfering in foreign elections, according to documents seen by the Sunday Politics.'

In [17]:
# Text content of the sample item's <link> child element.
item.find("link").text

'https://www.bbc.co.uk/news/uk-politics-43532916'

In [18]:
# Text content of the sample item's <guid> child element.
item.find("guid").text

'https://www.bbc.co.uk/news/uk-politics-43532916'

In [31]:
# Keep the <guid> element itself (not just its text) so its attributes can be read.
guid = item.find("guid")

In [33]:
# Look up the element's isPermaLink attribute; attribute values are strings.
guid.attrib["isPermaLink"]

'true'

In [19]:
# Text content of the sample item's <pubDate> child element.
item.find("pubDate").text

'Sun, 25 Mar 2018 11:52:46 GMT'

In [27]:
class Article:
    """One feed entry, holding the text of its main child elements.

    Every field defaults to ``None``.  The fields are assigned on the instance
    in ``__init__`` so that ``__dict__`` (and therefore ``repr``) always
    contains all five keys, in a fixed order, even for fields that were never
    given a value.
    """

    # Class-level defaults, kept so that ``Article.title`` etc. stay readable.
    title = None
    description = None
    link = None
    guid = None
    pubDate = None

    def __init__(self, title=None, description=None, link=None, guid=None, pubDate=None):
        """Create an article; every field is optional and defaults to ``None``."""
        self.title = title
        self.description = description
        self.link = link
        self.guid = guid
        self.pubDate = pubDate

    def __repr__(self):
        """Return the instance attributes serialised as a JSON object string."""
        # Local import keeps this cell usable on its own.
        import json
        return json.dumps(self.__dict__)

In [28]:
# Build one Article per <item> element of the feed.
articles = []
for item in items:
    article = Article()
    for tag in ("title", "description", "link", "guid", "pubDate"):
        # item.find() returns None when the child element is absent, so store
        # None for that field instead of failing on `.text`.
        node = item.find(tag)
        setattr(article, tag, node.text if node is not None else None)
    articles.append(article)

In [29]:
# Display the list; each entry is rendered by Article.__repr__ as a JSON string.
articles

[{"title": "Firm in data scandal 'made election boasts'", "description": "The company that became Cambridge Analytica boasted about interfering in foreign elections, according to documents seen by the Sunday Politics.", "link": "https://www.bbc.co.uk/news/uk-politics-43532916", "guid": "https://www.bbc.co.uk/news/uk-politics-43532916", "pubDate": "Sun, 25 Mar 2018 11:52:46 GMT"},
 {"title": "Dealing with debt: The mini-bankers learning how to save", "description": "Schoolchildren from Kirton Primary in Boston run their own bank with its own currency to learn about saving money.", "link": "https://www.bbc.co.uk/news/uk-england-42664110", "guid": "https://www.bbc.co.uk/news/uk-england-42664110", "pubDate": "Sun, 14 Jan 2018 00:46:09 GMT"},
 {"title": "EgyptAir hijack: Man jumps from plane cockpit window", "description": "A man was filmed jumping from a hijacked EgyptAir plane cockpit window shortly before the hijacker surrendered.", "link": "https://www.bbc.co.uk/news/world-middle-east-3

In [35]:
# Check the type of the first list entry.
type(articles[0])

__main__.Article

In [36]:
# Check the type of the object that stores the instance's attributes.
type(articles[0].__dict__)

dict

In [37]:
# Show the attribute name -> value mapping of the first article.
articles[0].__dict__

{'title': "Firm in data scandal 'made election boasts'",
 'description': 'The company that became Cambridge Analytica boasted about interfering in foreign elections, according to documents seen by the Sunday Politics.',
 'link': 'https://www.bbc.co.uk/news/uk-politics-43532916',
 'guid': 'https://www.bbc.co.uk/news/uk-politics-43532916',
 'pubDate': 'Sun, 25 Mar 2018 11:52:46 GMT'}

In [39]:
import json

In [40]:
# Serialise the first article's attribute mapping to a JSON string.
json.dumps(articles[0].__dict__)

'{"title": "Firm in data scandal \'made election boasts\'", "description": "The company that became Cambridge Analytica boasted about interfering in foreign elections, according to documents seen by the Sunday Politics.", "link": "https://www.bbc.co.uk/news/uk-politics-43532916", "guid": "https://www.bbc.co.uk/news/uk-politics-43532916", "pubDate": "Sun, 25 Mar 2018 11:52:46 GMT"}'

In [41]:
# Dump the articles to disk, one serialised JSON object per line.
with open("/tmp/bbc.json", "w") as f:
    f.writelines(json.dumps(vars(entry)) + "\n" for entry in articles)

In [42]:
import csv

In [47]:
# Read the whole CSV into a list of rows, each row being a list of strings.
# newline="" is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly; the explicit encoding avoids depending on the
# platform default.
with open("/data/movielens/movies.csv", newline="", encoding="utf-8") as f:
    csv_reader = csv.reader(f)
    records = list(csv_reader)
# Preview the first five rows (the header row comes first).
records[:5]

[['movieId', 'title', 'genres'],
 ['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'],
 ['2', 'Jumanji (1995)', 'Adventure|Children|Fantasy'],
 ['3', 'Grumpier Old Men (1995)', 'Comedy|Romance'],
 ['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance']]

In [48]:
# Read the CSV again with DictReader, which consumes the header row and yields
# one mapping per data row, keyed by column name.
# newline="" is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly; the explicit encoding avoids depending on the
# platform default.
with open("/data/movielens/movies.csv", newline="", encoding="utf-8") as f:
    csv_reader = csv.DictReader(f)
    records = list(csv_reader)
# Preview the first five data rows.
records[:5]

[OrderedDict([('movieId', '1'),
              ('title', 'Toy Story (1995)'),
              ('genres', 'Adventure|Animation|Children|Comedy|Fantasy')]),
 OrderedDict([('movieId', '2'),
              ('title', 'Jumanji (1995)'),
              ('genres', 'Adventure|Children|Fantasy')]),
 OrderedDict([('movieId', '3'),
              ('title', 'Grumpier Old Men (1995)'),
              ('genres', 'Comedy|Romance')]),
 OrderedDict([('movieId', '4'),
              ('title', 'Waiting to Exhale (1995)'),
              ('genres', 'Comedy|Drama|Romance')]),
 OrderedDict([('movieId', '5'),
              ('title', 'Father of the Bride Part II (1995)'),
              ('genres', 'Comedy')])]

In [49]:
# Reload the file as plain rows (lists of strings), header row included.
# newline="" is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly; the explicit encoding avoids depending on the
# platform default.
with open("/data/movielens/movies.csv", newline="", encoding="utf-8") as f:
    csv_reader = csv.reader(f)
    records = list(csv_reader)

In [50]:
# The first row read from the file holds the column names.
headers = records[0]

In [60]:
def escape_html(text, quote=False):
    """Escape the characters of ``text`` that are special in HTML.

    Args:
        text: String to escape.
        quote: When true, also escape ``"`` and ``'`` so the result can be
            placed inside a quoted attribute value.  Defaults to ``False``,
            which replaces only ``&``, ``<`` and ``>``.

    Returns:
        The escaped string.
    """
    # Local import keeps this cell usable on its own.
    import html
    return html.escape(text, quote=quote)

In [67]:
def create_table_row(fields, bgcolor = "white"):
    """Render ``fields`` as a single HTML ``<tr>`` element.

    Each field is passed through ``escape_html`` and wrapped in a tab-indented
    ``<td>`` on its own line; ``bgcolor`` is written into the row's ``bgcolor``
    attribute.  The returned string does not end with a newline.
    """
    opening = '<tr bgcolor="{}">\n'.format(bgcolor)
    cells = ["\t<td>{}</td>\n".format(escape_html(value)) for value in fields]
    return opening + "".join(cells) + "</tr>"

In [68]:
# Render the header row with the default background colour to inspect the markup.
print(create_table_row(headers))

<tr bgcolor="white">
	<td>movieId</td>
	<td>title</td>
	<td>genres</td>
</tr>


In [73]:
# Build the full HTML table: row 0 (the header) is green and the remaining rows
# alternate steelblue (odd index) and white (even index).
# The rows are collected in a list and joined once, rather than growing a
# string with += on every row.
rows = []
for i, record in enumerate(records):
    if i == 0:
        bgcolor = "green"
    elif i % 2 == 0:
        bgcolor = "white"
    else:
        bgcolor = "steelblue"
    rows.append(create_table_row(record, bgcolor) + "\n")
table = '<table border="1">' + "".join(rows) + '</table>'

In [74]:
# Write the table to disk.  The with-block closes (and flushes) the file right
# away instead of leaving that to garbage collection, and the explicit encoding
# keeps the output independent of the platform default.
with open("/tmp/output.html", "w", encoding="utf-8") as f:
    chars_written = f.write(table)
# Display the number of characters written, as the bare write() call did.
chars_written

983553

In [2]:
import csv
filename = "/data/movielens/movies.csv"

# Read the whole CSV into a list of rows (lists of strings), header row included.
# newline="" is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly.
with open(filename, newline="", encoding="utf-8") as f:
    csv_reader = csv.reader(f)
    records = list(csv_reader)

def escape_html(text, quote=False):
    """Escape the characters of ``text`` that are special in HTML.

    Args:
        text: String to escape.
        quote: When true, also escape ``"`` and ``'`` so the result can be
            placed inside a quoted attribute value.  Defaults to ``False``,
            which replaces only ``&``, ``<`` and ``>``.

    Returns:
        The escaped string.
    """
    # Local import keeps this cell usable on its own.
    import html
    return html.escape(text, quote=quote)
    
    
def create_table_row(fields, bgcolor = "white"):
    """Build the markup for one table row.

    The row opens with a ``<tr>`` carrying ``bgcolor`` as its ``bgcolor``
    attribute, then holds one tab-indented ``<td>`` line per entry of
    ``fields`` (each passed through ``escape_html``), and closes with
    ``</tr>`` without a trailing newline.
    """
    parts = ['<tr bgcolor="{}">\n'.format(bgcolor)]
    parts.extend("\t<td>{}</td>\n".format(escape_html(cell)) for cell in fields)
    parts.append("</tr>")
    return "".join(parts)
    
# Build the full HTML table: row 0 (the header) is green and the remaining rows
# alternate steelblue (odd index) and white (even index).
# The rows are collected in a list and joined once, rather than growing a
# string with += on every row.
rows = []
for i, record in enumerate(records):
    if i == 0:
        bgcolor = "green"
    elif i % 2 == 0:
        bgcolor = "white"
    else:
        bgcolor = "steelblue"
    rows.append(create_table_row(record, bgcolor) + "\n")
table = '<table border="1">' + "".join(rows) + '</table>'


# Write the table to disk.  The with-block closes (and flushes) the file right
# away instead of leaving that to garbage collection, and the explicit encoding
# keeps the output independent of the platform default.
with open("/tmp/output.html", "w", encoding="utf-8") as f:
    chars_written = f.write(table)
# Display the number of characters written, as the bare write() call did.
chars_written

983553