-
Notifications
You must be signed in to change notification settings - Fork 0
/
populate_db.py
96 lines (78 loc) · 3.62 KB
/
populate_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import sys
import json
import utils
import datetime
import constants
import feedparser
# Return a list of articles published today or yesterday for an RSS feed.
# url - string - URL of RSS feed.
# Returns: list of {'title': ..., 'url': ...} dicts, possibly empty.
def get_urls(url):
    urls_list = []
    feeds = feedparser.parse(url)
    for article in feeds['entries']:
        # Guard each entry individually: real-world feeds often have missing
        # or malformed 'published_parsed' fields, and one bad entry must not
        # abort the whole feed.
        try:
            year, month, day = article['published_parsed'][:3]
            # Build the zero-padded ISO date directly; this is equivalent to
            # the previous strptime/strftime round trip without re-parsing.
            article_date = '{0:04d}-{1:02d}-{2:02d}'.format(year, month, day)
            # constants.TODAY / YESTERDAY are expected to be '%Y-%m-%d'
            # strings (they were compared against strftime('%Y-%m-%d') output).
            if article_date in (constants.TODAY, constants.YESTERDAY):
                urls_list.append({'title': article['title'], 'url': article['link']})
        except Exception as e:
            print(e)
            print('Error to get article')
            continue
    return urls_list
# Controls the tasks for handling RSS URLs: for each *.json feed config,
# ensure the newspaper's SQLite table exists and insert any article from
# today/yesterday whose URL is not already stored.
def store_rss():
    # Get a list of *.json files.
    for file_feed in utils.get_json(constants.JSON_PATH):
        # Open *.json file; a malformed config aborts the whole run,
        # matching the original behavior.
        try:
            with open(file_feed, encoding='utf-8') as data_file:
                news_data = json.load(data_file)
        except Exception as e:
            print(e)
            print('Error to open json file.')
            sys.exit()
        # Hoist the table name: it is the same for every feed in this file,
        # so do the nested dict lookup once instead of per feed/article.
        table_name = news_data['metadata']['title']
        # Loop for each RSS feed on *.json file.
        for feed in news_data['feeds']:
            # URLs already stored in the SQLite newspaper table.
            urls_table = []
            if utils.check_table(table_name) is False:
                # Table missing: create it. A freshly created table is
                # empty, so urls_table correctly stays [].
                if utils.create_table(table_name) is False:
                    # Creation failed; skip this feed.
                    continue
            else:
                # Table exists: collect the URLs already stored
                # (column index 3 of each row).
                for article_db in utils.rows_table(table_name):
                    urls_table.append(article_db[3])
            # Note: the original re-queried check_table() here; at this
            # point the table is known to exist, so the extra DB call
            # was redundant and is dropped.
            # Gets newspaper articles list for RSS feed.
            article_list = get_urls(feed['url'])
            for article in article_list:
                # Insert only articles whose URL is not already stored.
                if article['url'] not in urls_table:
                    inserted = utils.insert_row(table_name, data=[feed['section'], article['title'], article['url']])
                    if inserted is False:
                        continue
                    # Print date, time and inserted row information.
                    print('{0} {1} - {2} {3}'.format(constants.TIME, feed['section'], article['title'], article['url']))
            # Print article information for logs purposes.
            print('{0} {1} OK\n'.format(table_name, feed['section']).upper())
def main():
    """Script entry point: prepare the storage folder, then populate the DB."""
    # Make sure the SQLite database folder exists before any table work.
    utils.make_folder(constants.SQLITE_PATH)
    # Pull every configured RSS feed and store any new articles.
    store_rss()
    print('\nFINISHED')


if __name__ == "__main__":
    main()