Skip to content

Commit

Permalink
updated article-PP.py
Browse files Browse the repository at this point in the history
  • Loading branch information
aroubickova committed Feb 13, 2019
1 parent 966a3c8 commit 43748cc
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions newsrods/article-PP.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from logging import getLogger
from datetime import datetime
from lxml import etree

from pyspark import SparkContext, SparkConf

class Article(object):
"""
Expand All @@ -37,15 +37,21 @@ def __init__(self, source):
self.quality = None

#======= The stuff that matters: =====
# heading of the article:
self.title = self.tree.xpath('result/title/text()') #heading of the article:
# text of the article:
# HEADING OF THE ARTICLE:
# (note that the tag <title> is present twice in each article record as
# a duplicate of a kind; findtext() however returns only the first occurrence,
# and that sorts it out)
self.title = self.tree.findtext('title')
# TEXT OF THE ARTICLE (as one string):
self.content = self.tree.findtext('fulltext')
# date of publication:
# DATE of publication:
raw_date = self.tree.findtext('display-date')
self.date = datetime.strptime(raw_date, '%d-%m-%Y')
# name of the newspaper:
self.papername = self.tree.xpath('result/publisher/publisher/text()')
# NAME OF THE NEWSPAPER:
self.papername = self.tree.findtext('publisher/publisher')
# TYPE of the article:
self.type = self.tree.findtext('dnz-type')


#old stuff that doesn't match anything here:
#self.preamble = self.tree.xpath('text/text.preamble/p/wd/text()')
Expand All @@ -60,8 +66,9 @@ def words(self):
@property
def words_string(self):
"""
Return the full text of the article as a string (in NLA archive this is default). Remove all hyphens.
Return the full text of the article as a string (in PP archive this is
default). Remove every ' - ' sequence (a hyphen surrounded by spaces).
This merges hyphenated words but may cause problems with subordinate
clauses (The sheep - the really loud one - had just entered my office).
"""
return ' '.join(self.content).replace(' - ', '')
return self.content.replace(' - ', '')

0 comments on commit 43748cc

Please sign in to comment.