#!/usr/bin/python3
# -*- coding: utf-8 -*-
# This script takes a list of HTML files of my old newspaper columns
# and scrapes them using Beautiful Soup. The result is a list of
# dictionaries, one per story, holding the structured data: headline,
# date, & body.
# TODO: improve BS usage for performance; also pull out pull quotes
# and "write to her at" end lines.
import re
from os import path

from bs4 import BeautifulSoup, UnicodeDammit
from dateutil.parser import parse

ARCHIVEFILES = ["/home/sumanah/Documents/mcm-archive/leftovers/2005-05-26.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2005-07-28.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2005-08-04.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2005-08-18.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2005-10-09.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2005-11-06.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2005-11-20.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2006-09-10.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2007-05-20.html",
                "/home/sumanah/Documents/mcm-archive/leftovers/2007-06-10.html",
                "../../Documents/mcm-archive/all-articles.htm"]


def parse_archival_article(element):
    '''I concatenated a bunch of my columns into one big HTML file.
    This function uses Beautiful Soup to parse it for headline/body/date.'''
    article_data = {}
    article_data["headline"] = element.find("div", id="hd").text.strip("\n").rstrip()
    # The date immediately precedes "The Oakland Tribune" credit line.
    trib = element.find("div", text=re.compile("The Oakland Tribune"))
    article_data["date"] = trib.previous_element
    article_text = ""
    rights_tag = element.find("div", text=re.compile("All rights reserved"))
    if rights_tag:
        for paragraph in rights_tag.next_element.next_element.next_siblings:
            article_text = article_text + repr(paragraph)
        # text of article is after "all rights reserved" and before the Document ID
        article_text = article_text.replace("</p>\'\\n\'", "</p>")
        article_text = article_text.rpartition("<p>Document OKLD")[0]
        article_data["body"] = article_text
    else:
        print(article_data["headline"] + " has no rights tag")
    return article_data
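
# For reference, a sketch of the archival markup the selectors above
# assume (structure inferred from this file; the contents are made up):
#
#   <div class="article">
#     <div id="hd">Headline text</div>
#     May 26, 2005
#     <div>The Oakland Tribune</div>
#     <div>Copyright 2005 ... All rights reserved.</div>
#     <p>Body paragraphs...</p>
#     <p>Document OKLD...</p>
#   </div>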


def parse_leftover_article(soup):
    '''I also grabbed some additional articles from another site.'''
    article_data = {}
    postdate = soup.find("meta", attrs={"property": "pubDate"})["content"]
    article_data["date"] = postdate[:10]  # keep the date, drop the time
    article_data["headline"] = soup.find("meta", attrs={"property": "twitter:title"})["content"]
    body = soup.find("div", id="articleViewerGroup").previous_element.contents
    prose = "<p>"
    important_body_elements = body[3:]  # bunch of useless scripts in the first few elements
    for para in important_body_elements:
        prose += repr(para).strip()
    article_data["body"] = prose.replace("\\n", "").rpartition('<span fd-id="default" fd-type="end"></span>')[0]
    return article_data
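
# For reference, the leftover pages are assumed to carry metadata like
# this (attribute names taken from the selectors above; values made up):
#
#   <meta property="pubDate" content="2005-05-26T00:00:00">
#   <meta property="twitter:title" content="Headline text">
#   ...
#   <div id="articleViewerGroup">...</div>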


def parse_files(filenames, article_list):
    '''Returns a date-sorted list of dictionaries, and a date-sorted list
    of dates-and-headlines dicts.'''
    for filename in filenames:
        file_path = path.relpath(filename)
        # Read as bytes: UnicodeDammit.detwingle() only accepts bytestrings,
        # and BeautifulSoup can sniff the encoding of raw bytes itself.
        with open(file_path, "rb") as f:
            newsfile = f.read()
        if file_path.endswith(".html"):
            soup = BeautifulSoup(newsfile, "html5lib")
        else:
            cols = UnicodeDammit.detwingle(newsfile)
            soup = BeautifulSoup(cols, "html5lib")
        archival_articles = soup.find_all("div", attrs={"class": "article"})
        for article in archival_articles:
            headline = article.find("div", id="hd").text.strip("\n").rstrip()
            if is_unique(headline, article_list):
                article_list.append(parse_archival_article(article))
        if not archival_articles:  # no "article" divs means this is a leftover page
            headline = soup.find("meta", attrs={"property": "twitter:title"})["content"]
            if is_unique(headline, article_list):
                article_list.append(parse_leftover_article(soup))
    article_list.sort(key=lambda k: parse(k["date"]))
    index = [{k: v for (k, v) in story.items() if ("date" in k) or ("headline" in k)} for story in article_list]
    return (article_list, index)
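
# A minimal usage sketch (assumes the archive files listed above exist
# locally; any HTML files with the expected structure would work):
#
#   articles, index = parse_files(ARCHIVEFILES, [])
#   for entry in index:
#       print(entry["date"], entry["headline"])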


def is_unique(headline, article_list):
    '''Checks whether I've already grabbed this article; the archive HTML files overlap.'''
    for item in article_list:
        if item["headline"] == headline:
            return False
    return True


if __name__ == "__main__":
    parse_files(ARCHIVEFILES, [])