Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
aniri committed Jul 24, 2017
1 parent fb9009a commit 1feefbc
Showing 1 changed file with 15 additions and 8 deletions.
23 changes: 15 additions & 8 deletions scraper.py
@@ -1,13 +1,17 @@
# encoding: utf8
import re
import datetime
import json
import scraperwiki
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor

INDEX_URL = 'http://mts.ro/proiecte-legislative-in-dezbatere-publica/'

CONTACT_TEL_FAX_PATTERN = re.compile(r'((fax|telefon|tel)[^\d]{1,10}(\d(\d| |\.){8,11}\d))')
CONTACT_EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-]{2,5})")

DOC_EXTENSIONS = [".docs", ".doc", ".txt", ".crt", ".xls", ".xml", ".pdf", ".docx", ".xlsx", ]

TYPE_RULES = [
Expand Down Expand Up @@ -113,11 +117,12 @@ def parse_article(self, response):
title = self.clean_title(title)

# clean up most of the title before checking publication type
publicationText = title.lower().strip()
stop_pos = re.search(r'(pentru|privind)', publicationText)
publication_text = title.lower().strip()
publication_type = "OTHER"
stop_pos = re.search(r'(pentru|privind)', publication_text)
if stop_pos:
publicationText = publicationText[0:stop_pos.start()]
publication_type = guess_initiative_type(publicationText, TYPE_RULES)
publication_text_short = publication_text[0:stop_pos.start()]
publication_type = guess_initiative_type(publication_text_short, TYPE_RULES)

text_date = text_from(article_node.css('span.date'))
date, date_obj = self.parse_date(text_date)
Expand All @@ -134,6 +139,7 @@ def parse_article(self, response):
} for doc in
extract_documents(content_node.css('a'))
]
json_documents = json.dumps(documents)

feedback_days = None
feedback_date = self.get_feedback_date(description_without_diacritics)
Expand All @@ -142,6 +148,7 @@ def parse_article(self, response):
feedback_days = days_diff.days

contact = self.get_contacts(description_without_diacritics)
json_contact = json.dumps(contact)

publication = Publication(
institution = 'tineret',
Expand All @@ -150,8 +157,8 @@ def parse_article(self, response):
date = date,
title = title,
description = description,
#documents = documents,
#contact = contact,
documents = json_documents,
contact = json_contact,
#feedback_days = feedback_days,
#max_feedback_date = feedback_date
)
Expand Down Expand Up @@ -186,10 +193,10 @@ def get_contacts(self, text):

contact = {}

emails = re.findall(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-]{2,5})", text)
emails = re.findall(CONTACT_EMAIL_PATTERN, text)
contact['email'] = list(set(emails))

numbers = re.findall(r'((fax|telefon|tel)[^\d]{1,10}(\d(\d| |\.){8,11}\d))', text)
numbers = re.findall(CONTACT_TEL_FAX_PATTERN, text)
for number in numbers:
key = number[1]
value = number[2].replace(' ','').replace('.', '')
Expand Down

0 comments on commit 1feefbc

Please sign in to comment.