# Copyright (c) 2023 The Brave Authors. All rights reserved.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at https://mozilla.org/MPL/2.0/.
import hashlib
import html
from datetime import datetime, timedelta
from urllib.parse import quote, urljoin, urlparse, urlunparse

import bleach
import dateparser
import pytz
import requests
import structlog
import unshortenit
from better_profanity import profanity
from bs4 import BeautifulSoup as BS
from fake_useragent import UserAgent
from requests.exceptions import (
ConnectTimeout,
InvalidURL,
ReadTimeout,
SSLError,
TooManyRedirects,
)

from aggregator.image_fetcher import get_article_img
from config import get_config
from db_crud import get_article

logger = structlog.getLogger(__name__)
config = get_config()
ua = UserAgent(browsers=["edge", "chrome", "firefox", "safari", "opera"])

def process_articles(article, _publisher): # noqa: C901
"""
Process the given article and return a dictionary containing the processed data.
Args:
article (dict): The article to be processed.
_publisher (dict): The publisher information.
Returns:
dict: A dictionary containing the processed data of the article.
Returns None if the article is not valid or should be skipped.
"""
out_article = {}
    # Process the article title
if not article.get("title"):
# No title. Skip.
return None
out_article["title"] = BS(article["title"], features="html.parser").get_text()
out_article["title"] = html.unescape(out_article["title"])
# Filter the offensive articles
if profanity.contains_profanity(out_article.get("title").lower()):
return None
# Process article URL
if article.get("link"):
out_article["link"] = article["link"]
elif article.get("url"):
out_article["link"] = article["url"]
else:
return None # skip (can't find link)
parsed_article_url = urlparse(out_article["link"])
if len(parsed_article_url.path) < 4:
        return None  # skip (path too short to be an article URL)
# Process published time
if article.get("updated"):
out_article["publish_time"] = dateparser.parse(article.get("updated"))
elif article.get("published"):
out_article["publish_time"] = dateparser.parse(article.get("published"))
else:
        return None  # skip (no published/updated timestamp)
if out_article.get("publish_time") is None:
return None
if out_article["publish_time"].tzinfo is None:
config.tz.localize(out_article["publish_time"])
out_article["publish_time"] = out_article["publish_time"].astimezone(pytz.utc)
    now_utc = datetime.now(pytz.utc)
if _publisher["content_type"] != "product":
if out_article["publish_time"] > now_utc or out_article["publish_time"] < (
now_utc - timedelta(days=60)
):
            return None  # skip (future-dated or older than 60 days)
out_article["publish_time"] = out_article["publish_time"].strftime(
"%Y-%m-%d %H:%M:%S"
)
try:
image_url = get_article_img(article)
parsed_url = urlparse(image_url)
        if not parsed_url.netloc and image_url:
            # Relative image URL: resolve it against the publisher's site URL.
            image_url = urljoin(_publisher["site_url"], image_url)
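        # Heuristic: a very short path is unlikely to point at a real image.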
if len(parsed_url.path) < 4 and image_url:
image_url = ""
out_article["img"] = image_url
except Exception as e:
logger.error(f"Error retrieving image from URL {article.get('url')}: {e}")
out_article["img"] = ""
    # Attach publisher metadata and remaining fields
out_article["category"] = _publisher.get("category")
if article.get("description"):
out_article["description"] = BS(
article["description"], features="html.parser"
).get_text()
else:
out_article["description"] = ""
out_article["content_type"] = _publisher["content_type"]
if out_article["content_type"] == "audio":
out_article["enclosures"] = article["enclosures"]
if out_article["content_type"] == "product":
out_article["offers_category"] = article["category"]
out_article["publisher_id"] = _publisher["publisher_id"]
out_article["publisher_name"] = _publisher["publisher_name"]
out_article["channels"] = list(_publisher["channels"])
out_article["creative_instance_id"] = _publisher["creative_instance_id"]
return out_article
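

# Illustrative call for process_articles (a sketch; every field value below is
# hypothetical, not taken from a real feed):
#
#     article = {
#         "title": "Example headline",
#         "link": "https://example.com/news/some-story",
#         "published": "Mon, 01 May 2023 12:00:00 GMT",
#         "description": "<p>Summary text</p>",
#     }
#     publisher = {
#         "site_url": "https://example.com",
#         "content_type": "article",
#         "category": "Tech",
#         "publisher_id": "pub-123",
#         "publisher_name": "Example",
#         "channels": ["Tech"],
#         "creative_instance_id": "",
#     }
#     out = process_articles(article, publisher)
#     # `out` is a dict with a cleaned "title", an absolute "img" URL and a
#     # UTC "publish_time" string, or None if the article should be skipped.

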
def unshorten_url(out_article):
"""
Unshortens a URL in the given output article.
Args:
out_article (dict): The output article containing the URL to unshorten.
Returns:
dict or None: The modified output article with the unshortened URL, or None if unshortening failed.
"""
unshortener = unshortenit.UnshortenIt(
default_timeout=config.request_timeout,
default_headers={"User-Agent": ua.random},
)
try:
out_article["url"] = unshortener.unshorten(out_article["link"])
out_article.pop("link", None)
except (
requests.exceptions.ConnectionError,
ConnectTimeout,
InvalidURL,
ReadTimeout,
SSLError,
TooManyRedirects,
):
return None, None # skip (unshortener failed)
except Exception as e:
logger.error(f"unshortener failed [{out_article.get('link')}]: {e}")
return None, None # skip (unshortener failed)
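    # Hash the raw unshortened URL; the percent-encoding below changes only
    # the stored form, not the hash used for deduplication.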
url_hash = hashlib.sha256(out_article["url"].encode("utf-8")).hexdigest()
parts = urlparse(out_article["url"])
parts = parts._replace(path=quote(parts.path))
encoded_url = urlunparse(parts)
out_article["url"] = encoded_url
out_article["url_hash"] = url_hash
processed_article = get_article(
url_hash, str(config.sources_file).replace("sources.", "")
)
if processed_article:
return None, processed_article
return out_article, None
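

# Sketch of the return contract (hypothetical caller code):
#
#     out_article, cached = unshorten_url(out_article)
#     if cached is not None:
#         ...  # URL already processed; reuse the stored article
#     elif out_article is not None:
#         ...  # fresh article, now carrying "url" and "url_hash"
#     else:
#         ...  # unshortening failed; skip this article

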
def scrub_html(feed: dict):
"""
Scrubs the HTML content in the given feed dictionary.
Parameters:
feed (dict): The dictionary containing the HTML content to be scrubbed.
Returns:
dict: The modified feed dictionary with the HTML content scrubbed.
"""
    for key in feed:
        try:
            feed[key] = bleach.clean(feed[key], strip=True)
            # bleach escapes ampersands to "&amp;"; restore the literal "&".
            feed[key] = feed[key].replace("&amp;", "&")
        except Exception:
            # Non-string values are left unchanged.
            pass
    return feed
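

if __name__ == "__main__":
    # Minimal self-check for scrub_html (illustrative only; running this
    # module standalone assumes the aggregator's config is importable, and
    # the exact output depends on bleach's default tag allowlist). Disallowed
    # tags are stripped, bleach's "&amp;" is restored to "&", and the
    # non-string value passes through the fallback branch unchanged.
    sample = {"title": "<div>Tom & Jerry</div>", "count": 3}
    print(scrub_html(sample))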