#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import datetime
import urllib.request
import pymongo
import elasticsearch
import PyPDF2
class Collection:
    """Scrape pages of the DOU (Diario Oficial da Uniao) for a date period
    and store them locally and/or in a NoSQL backend.

    Args (via __init__):
        nosql: backend name, 'MongoDB' or 'Elasticsearch'.
        url: backend host.
        port: backend port (string or int; converted where needed).
        dt_init / dt_final: period boundaries as 'dd/mm/yyyy' strings.
    """

    def __init__(self, nosql, url, port, dt_init, dt_final):
        """Store backend configuration and the scrape period."""
        self.__nosql_name = nosql
        self.__nosql_url = url
        self.__nosql_port = port
        self.__date_initial = dt_init
        self.__date_final = dt_final
        # Base URL of the official DOU search portal.
        self.__dou_url = 'http://pesquisa.in.gov.br/imprensa'
        # The portal serves Latin-1 encoded pages.
        self.__coding = 'ISO-8859-1'
        self.__mask_type = ('dot_folder', 'slash', 'dot_file')
        # Destination folder; set by to_local(), read by to_nosql().
        # BUGFIX: this used to be a module-level `global folder`.
        self.__folder = None

    # ---- private helpers -------------------------------------------------

    def __reachability(self):
        """Return True when the DOU portal answers with a 2xx status."""
        with urllib.request.urlopen(self.__dou_url) as response:
            # BUGFIX: was `getcode() in list(range(200, 300))` and leaked
            # the response object.
            return 200 <= response.getcode() < 300

    def __page_number(self, journal, date):
        """Return the page count of *journal* on *date* ('dd/mm/yyyy'),
        or 0 when the portal is unreachable or the count is not found."""
        pages = 0
        if self.__reachability():
            url = (self.__dou_url + '/jsp/visualiza/index.jsp?jornal=' + journal
                   + '&pagina=1&data=' + date + '&captchafield=firistAccess')
            with urllib.request.urlopen(url) as response:
                html = response.read().decode(self.__coding)
            # The viewer page embeds the page count in a JS variable.
            # BUGFIX: raw string so `\d` is a regex class, not an escape.
            match = re.search(r'totalArquivos=(\d{1,4})', html)
            if match:
                pages = int(match.group(1))
        return pages

    def __date_to_str(self, date):
        """Format a datetime.date as 'dd/mm/yyyy'."""
        return '{:02}/{:02}/{:04}'.format(date.day, date.month, date.year)

    def __str_to_date(self, str_date):
        """Parse a 'dd/mm/yyyy' string into a datetime.date."""
        return datetime.date(int(str_date[6:10]), int(str_date[3:5]),
                             int(str_date[0:2]))

    def __date_range(self, begin, end):
        """Return every weekday between *begin* and *end* (inclusive,
        'dd/mm/yyyy' strings) as 'dd/mm/yyyy' strings; empty when
        begin > end. Weekends are skipped (the DOU is not published then)."""
        begin_date = self.__str_to_date(begin)
        end_date = self.__str_to_date(end)
        period = []
        if begin_date <= end_date:
            for day in range((end_date - begin_date).days + 1):
                date = begin_date + datetime.timedelta(day)
                if date.weekday() <= 4:  # 0..4 == Monday..Friday
                    period.append(self.__date_time_mask(date, self.__mask_type[1]))
        return period

    def __date_time_mask(self, date_time, mask_type):
        """Format *date_time* per *mask_type*: 'dot_folder' ->
        'YYYY.MM.DD.HHhMMmSSs', 'dot_file' -> 'YYYY.MM.DD', anything
        else ('slash') -> 'DD/MM/YYYY'."""
        if mask_type == 'dot_folder':
            return date_time.strftime('%Y.%m.%d.%Hh%Mm%Ss')
        elif mask_type == 'dot_file':
            return date_time.strftime('%Y.%m.%d')
        return date_time.strftime('%d/%m/%Y')

    def __mount_url(self, begin, end):
        """Build the download URL of every page of journals 1-3 for every
        weekday in [begin, end]. Returns (date, journal, page, url) tuples."""
        urls = []
        for d in self.__date_range(begin, end):
            for j in range(1, 4):  # DOU sections 1, 2 and 3
                pages = self.__page_number(str(j), d)
                for p in range(1, pages + 1):
                    urls.append((d, str(j), str(p),
                                 self.__dou_url + '/servlet/INPDFViewer?jornal='
                                 + str(j) + '&pagina=' + str(p) + '&data=' + d
                                 + '&captchafield=firistAccess'))
        return urls

    def __first_page_text(self, filename):
        """Extract the text of the first page of a downloaded PDF.
        NOTE(review): PdfFileReader/getPage/extractText are the legacy
        PyPDF2 1.x API (renamed in PyPDF2 3.x) — confirm installed version."""
        pdf = PyPDF2.PdfFileReader(self.__folder + '/' + filename)
        return pdf.getPage(0).extractText()

    # ---- public methods --------------------------------------------------

    def to_local(self, path):
        """Download every page PDF of the configured period into a new
        timestamped folder and return True.

        *path* is a datetime used to name the folder (despite its name it
        is not a filesystem path — kept for interface compatibility).
        """
        self.__folder = self.__date_time_mask(path, self.__mask_type[0])
        os.mkdir(self.__folder)
        for url in self.__mount_url(self.__date_initial, self.__date_final):
            filepath = (self.__folder + '/'
                        + self.__date_time_mask(self.__str_to_date(url[0]),
                                                self.__mask_type[2])
                        + 'cad' + url[1] + 'pg' + url[2] + '.pdf')
            urllib.request.urlretrieve(url[3], filepath)
        return True

    def to_nosql(self):
        """Index the first-page text of every downloaded PDF into the
        configured backend.

        Returns True on success, False for an unknown backend name.
        Raises RuntimeError if to_local() has not been called first
        (previously this died with a NameError on the shared global).
        """
        if self.__folder is None:
            raise RuntimeError('to_local() must be called before to_nosql()')
        files = os.listdir(self.__folder)
        if self.__nosql_name == 'MongoDB':
            client = pymongo.MongoClient(self.__nosql_url, int(self.__nosql_port))
            try:
                collection = client['dou'][self.__folder]
                for f in files:
                    collection.insert_one({'file': f,
                                           'folder': self.__folder,
                                           'content': self.__first_page_text(f)})
            finally:
                # BUGFIX: close the client even if an insert fails.
                client.close()
            return True
        elif self.__nosql_name == 'Elasticsearch':
            es = elasticsearch.Elasticsearch(self.__nosql_url + ':'
                                             + self.__nosql_port)
            idx = 'dou_' + self.__folder
            for f in files:
                doc = {'file': f,
                       'folder': self.__folder,
                       'content': self.__first_page_text(f)}
                # NOTE(review): doc_type is removed in Elasticsearch 8.x
                # clients — kept for compatibility with the client version
                # this script targets; confirm before upgrading.
                es.index(index=idx, doc_type='dou_page', body=doc)
            return True
        return False