In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

from loguru import logger
import requests, datetime
from utils import *
from newspaper import Article

import spacy
ner = spacy.load("pt_core_news_md", disable=['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer'])

In [2]:
@logger.catch
def search_arquivo(query: str, _from: datetime, _to: datetime, websites: [str], max_items=2000, _type="html", fields="title,tstamp,originalURL,linkToNoFrame"):
    """Search Arquivo.pt for an exact phrase inside a time window.

    Args:
        query: Phrase to search for; it is wrapped in double quotes before
            being sent as ``q``.
        _from: Start of the window, converted with ``arquivo_date``.
        _to: End of the window, converted with ``arquivo_date``.
        websites: Domains joined with commas into ``siteSearch``. When empty,
            ``siteSearch`` is left out of the request and ``itemsPerSite``
            falls back to ``max_items``.
        max_items: Value sent as ``maxItems``.
        _type: Value sent as ``type``.
        fields: Comma-separated field names sent as ``fields``.

    Returns:
        Whatever ``query_news`` returns for the built parameters. Because of
        ``@logger.catch``, an exception raised in here is logged and ``None``
        is returned instead of propagating.
    """
    # An empty `websites` used to raise ZeroDivisionError on the per-site
    # quota below (swallowed by @logger.catch, so the caller silently got None).
    items_per_site = max_items // len(websites) if websites else max_items

    params = {
        "q": '"%s"' % query,
        "from": arquivo_date(_from),
        "to": arquivo_date(_to),
    }
    if websites:
        # Inserted here so the key order (and the printed output) is unchanged
        # for the usual non-empty case.
        params["siteSearch"] = ",".join(websites)
    params["fields"] = fields
    params["type"] = _type
    params["maxItems"] = max_items
    params["itemsPerSite"] = items_per_site

    print(list(params.items()))
    # itemsPerSite does not work
    return query_news(params, attempts=10)


@logger.catch
def query_news(params, endpoint="https://arquivo.pt/textsearch", timeout=30, attempts=1):
    """Send `params` to the search endpoint and return the list of result items.

    Args:
        params: Query-string parameters forwarded to ``try_request``.
        endpoint: URL to query.
        timeout: Forwarded to ``try_request``.
        attempts: Forwarded to ``try_request``.

    Returns:
        The ``"response_items"`` entry of the JSON payload, or ``[]`` when the
        request yields a falsy result or the payload carries no items.
    """
    response = try_request(endpoint, params, timeout, attempts)
    if not response:
        return []
    # A payload without "response_items" used to raise KeyError, which
    # @logger.catch turned into a None return; keep the return type a list.
    return response.json().get("response_items") or []

@logger.catch
def process_news_piece(arquivo_item):
    """Fetch one Arquivo.pt search hit and enrich it with article data.

    Args:
        arquivo_item: One search-result dict. ``"linkToNoFrame"`` is read for
            logging, and a shallow copy is handed to ``clean_response_item``.

    Returns:
        * ``{"valid": False}`` when ``clean_response_item`` returns ``None``.
        * ``None`` when ``try_request`` returns a falsy value for the article URL.
        * Otherwise the cleaned item dict with ``"valid"`` set. On success it
          also carries ``"text"``, ``"image"``, ``"summary"`` and
          ``"summary_entities"``; if any processing step raises, the error is
          logged and ``"valid"`` is ``False``.
    """
    logger.info("fetching %s" % arquivo_item["linkToNoFrame"])

    # Clean news piece (returns None if some scraping error occurred)
    processing_article = clean_response_item(arquivo_item.copy())
    if processing_article is None:
        return {"valid": False}

    # Check if the arquivo link is up
    html = try_request(processing_article["url"])
    if not html:
        if html == False:  # resource will never be available
            processing_article["valid"] = False
            logger.error("%s will never be available" % (processing_article["url"]))
        # NOTE(review): both failure flavours return None, so the "valid" flag
        # set just above never reaches the caller. Confirm whether the
        # permanent case was meant to return `processing_article` instead.
        return

    try:
        # Process using Newspaper3k Package
        # NOTE(review): newspaper documents this keyword as `language`; the
        # `_language` spelling is kept untouched here - confirm it has the
        # intended effect.
        article_obj = Article(processing_article["url"], _language="pt")
        article_obj.download(input_html=html.text)
        article_obj.parse()
        article_obj.nlp()

        # Group entity character spans as {label: {entity text: [(start, end), ...]}}
        entities = {}
        for ent in ner(article_obj.summary).ents:
            spans = entities.setdefault(ent.label_, {}).setdefault(ent.text, [])
            spans.append((ent.start_char, ent.end_char))

        processing_article["text"] = assert_valid_article(article_obj)
        processing_article["image"] = article_obj.top_image
        processing_article["summary"] = article_obj.summary
        processing_article["summary_entities"] = handle_duplicate_entities(entities)
        processing_article["valid"] = True

    except Exception:
        # The previous `print(e.with_traceback())` raised TypeError itself
        # (`with_traceback` requires a traceback argument), so the article was
        # never flagged invalid and the function fell through to @logger.catch.
        logger.exception("failed to process %s" % processing_article["url"])
        processing_article["valid"] = False

    return processing_article

# process_news_piece(news[223])

In [3]:
# Wildcard search over the listed news sites between 1996-01-01 and 2020-01-01.
# NOTE(review): the first cell does `import datetime` (the module), so calling
# `datetime(...)` here presumably relies on `from utils import *` rebinding the
# name to the class - confirm.
news = search_arquivo(
    '*',
    datetime(1996, 1, 1),
    datetime(2020, 1, 1),
    [
        'acervo.publico.pt',
        'publico.pt',
        'expresso.pt',
        'expresso.sapo.pt',
        'www.dn.pt',
        'dn.sapo.pt',
        'www.cmjornal.pt',
    ],
)
print(len(news))
# Show a slice of the results as the cell output.
news[225:520]

[('q', '"*"'), ('from', '19960101000000'), ('to', '20200101000000'), ('siteSearch', 'acervo.publico.pt,publico.pt,expresso.pt,expresso.sapo.pt,www.dn.pt,dn.sapo.pt,www.cmjornal.pt'), ('fields', 'title,tstamp,originalURL,linkToNoFrame'), ('type', 'html'), ('maxItems', 2000), ('itemsPerSite', 285)]
https://arquivo.pt/textsearch
2000


[{'title': 'Portugueses entre os mais pessimistas em relação a rendimentos na terceira idade - Sociedade - PUBLI',
  'originalURL': 'http://publico.pt/Sociedade/portugueses-entre-os-mais-pessimistas-em-relacao-a-rendimentos-na-terceira-idade_1429161',
  'tstamp': '20100602014656',
  'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20100602014656/http://publico.pt/Sociedade/portugueses-entre-os-mais-pessimistas-em-relacao-a-rendimentos-na-terceira-idade_1429161'},
 {'title': 'Object moved',
  'originalURL': 'http://publico.pt/notFound.aspx?aspxerrorpath=/Validation/EHX0uvAqn/c=',
  'tstamp': '20100602015124',
  'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20100602015124/http://publico.pt/notFound.aspx?aspxerrorpath=/Validation/EHX0uvAqn/c='},
 {'title': '',
  'originalURL': 'http://publico.pt/Educa%C3%A7%C3%A3o/bolsas-de-estudo-atrasadas-criam-dificuldades-a-universitarios_1423781%5B',
  'tstamp': '20100601221922',
  'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20100601

In [49]:
# Load "search_websites.json" through `open_cache` (not defined in this
# notebook; presumably star-imported from utils) and echo it as the cell output.
cache = open_cache("search_websites.json")
cache

{'acervo.publico.pt': ['20180409131724', '20190618200133'],
 'publico.pt': ['19961013180344', '20191106094752'],
 'expresso.pt': ['19980110124958', '20191106220715'],
 'expresso.sapo.pt': ['20120312160230', '20191106041515'],
 'www.dn.pt': ['19961013220426', '20200115171614'],
 'dn.sapo.pt': ['20011214065810', '20190522140701'],
 'www.cmjornal.pt': ['20100327004535', '20190624004542'],
 'jn.pt': ['19981212030154', '20200115164710'],
 'visao.sapo.pt': ['20120312160227', '20191106222001'],
 'www.jn.pt': ['19981212030154', '20200115164710'],
 'sol.sapo.pt': ['20061016125521', '20190606192858']}