In [1]:
processed%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import requests, datetime
from datetime import date, timedelta
from utils import *
from newspaper import Article
from IPython.display import clear_output
from urllib.parse import urlencode, quote_plus
from tqdm.notebook import tqdm
import random
from IPython.display import Image


from loguru import logger
logger.add("process.log")

import pickle

In [222]:
@logger.catch
def search_arquivo_news(query: str, _from: datetime, _to: datetime, websites: [str], max_items=2000, _type="html", fields="title,tstamp,originalURL,linkToNoFrame"):
    params = {
        "q": '"%s"' % query,
        "from": arquivo_date(_from),
        "to": arquivo_date(_to),
        "siteSearch": ",".join(websites),
        "fields": fields,
        "type": _type,
        "maxItems": max_items,
        "itemsPerSite": max_items//len(websites),
    }
    
    # itemsPerSite does not work
    return query_arquivo(params, "https://arquivo.pt/textsearch", attempts=10)

@logger.catch
def get_random_arquivo_image(query: str, _from: datetime = datetime(2018,10,1), _to: datetime = date.today(), show=False):
    params = {
        "q": '"%s"' % query,
        "from": arquivo_date(_from),
        "to": arquivo_date(_to)
    }
    
    response = query_arquivo(params, "https://arquivo.pt/imagesearch", attempts=10)
    if not response: return None
    
    chosen_index = random.randint(0, len(response) - 1)
    random_image = response[chosen_index]['imgLinkToArchive']
    
    if show: display(Image(url=random_image, width=300))
    return random_image


@logger.catch
def query_arquivo(params, endpoint, timeout=30, attempts=1):
    logger.info("[FETCHING] %s" % endpoint + '?' + urlencode(params, quote_via=quote_plus))
    
    r = try_request(endpoint, params, timeout, attempts)
    if not r: return []
    json_response = r.json()
    return json_response["response_items"] if "response_items" in json_response else json_response["responseItems"]

@logger.catch
def process_news_piece(arquivo_item):
    
    # Clean news piece (returns None if some scrapping error occured)
    processing_article = clean_response_item(arquivo_item.copy())
    if processing_article is None:
        return {"valid": False}
    
    # Check if the arquivo link is up 
    html = try_request(processing_article["url"])
    if not html:
        if html == False:  # resource will never be available
            processing_article["valid"] = False
            logger.error("%s will never be available" % (processing_article["url"]))
        return
    
    try:
        # Process using Newspaper3k Package
        article_obj = Article(processing_article["url"], _language="pt")
        article_obj.download(input_html=html.text)
        article_obj.parse()
        article_obj.nlp()
        
        processing_article["text"] = assert_valid_article(article_obj)
        processing_article["image"] = article_obj.top_image
        processing_article["summary"] = article_obj.summary
        processing_article["summary_entities"], duplicates = organize_entities(processing_article["summary"])
        processing_article["title_entities"], _ = organize_entities(processing_article["title"], duplicates)
        processing_article["valid"] = True
        
    except Exception as e:
#         print(e.with_traceback())
        logger.error("[Exception] " + str(e))
        processing_article["valid"] = False
        
    return processing_article

# process_news_piece(arquivo_response[223])

In [224]:

arquivo_response = []

# In total 588 Publico news articles.
arquivo_response += search_arquivo_news('', datetime(2018,10,1), date.today(), ['publico.pt noticia -js ', 'acervo.publico.pt noticia -js ']) 

# In total 553 Expresso news articles.
arquivo_response += search_arquivo_news('', datetime(2018,10,1), date.today(), ['expresso.pt 2018', 'expresso.pt 2019', 'expresso.pt 2020', 'expresso.sapo.pt 2018', 'expresso.sapo.pt 2019', 'expresso.sapo.pt 2020'])

# In total 37 DN news articles.
arquivo_response += search_arquivo_news('', datetime(2018,10,1), date.today(), ['www.dn.pt politica', 'www.dn.pt sociedade', 'dn.sapo.pt politica', 'dn.sapo.pt sociedade'])

# In total 101 DN news articles.
arquivo_response += search_arquivo_news('', datetime(2018,10,1), date.today(), ['visao.sapo.pt 2018', 'visao.sapo.pt 2019', 'visao.sapo.pt 2020'])

print("In total", len(arquivo_response), "arquivo items. The first 5 items are:\n")
print(arquivo_response[:5])

2021-04-20 01:25:31.430 | INFO     | __main__:query_arquivo:37 - [FETCHING] https://arquivo.pt/textsearch?q=%22%22&from=20181001000000&to=20210420000000&siteSearch=publico.pt+noticia+-js+%2Cacervo.publico.pt+noticia+-js+&fields=title%2Ctstamp%2CoriginalURL%2ClinkToNoFrame&type=html&maxItems=2000&itemsPerSite=1000
2021-04-20 01:25:33.552 | INFO     | __main__:query_arquivo:37 - [FETCHING] https://arquivo.pt/textsearch?q=%22%22&from=20181001000000&to=20210420000000&siteSearch=expresso.pt+2018%2Cexpresso.pt+2019%2Cexpresso.pt+2020%2Cexpresso.sapo.pt+2018%2Cexpresso.sapo.pt+2019%2Cexpresso.sapo.pt+2020&fields=title%2Ctstamp%2CoriginalURL%2ClinkToNoFrame&type=html&maxItems=2000&itemsPerSite=333
2021-04-20 01:26:03.646 | ERROR    | utils:try_request:34 - [HTTPSConnectionPool(host='arquivo.pt', port=443): Read timed out. (read timeout=30)] for [{'q': '""', 'from': '20181001000000', 'to': '20210420000000', 'siteSearch': 'expresso.pt 2018,expresso.pt 2019,expresso.pt 2020,expresso.sapo.pt 2018,

In total 1279 arquivo items. The first 5 items are:

[{'title': 'Praga de baratas “grandes” aflige Parque das Nações | Lisboa | PÚBLICO', 'originalURL': 'http://publico.pt/2019/08/29/local/noticia/baratas-parque-nacoes-1884816', 'tstamp': '20190831150457', 'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20190831150457/http://publico.pt/2019/08/29/local/noticia/baratas-parque-nacoes-1884816'}, {'title': 'Mais de 50 crianças adoptadas foram devolvidas nos últimos três anos | Adopção | PÚBLICO', 'originalURL': 'http://publico.pt/2019/07/30/sociedade/noticia/ultimos-tres-anos-devolvidas-53-criancas-adopcao-1881728', 'tstamp': '20190831150900', 'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20190831150900/http://publico.pt/2019/07/30/sociedade/noticia/ultimos-tres-anos-devolvidas-53-criancas-adopcao-1881728'}, {'title': 'Empresas de Portugal e Galiza suspeitas de se apropriarem de 20 milhões de fundos europeus | Justiça', 'originalURL': 'http://publico.pt/2019/08/03/sociedade/notic

In [225]:
# This script takes about 7 mins to run 
processed = []

logger.info("[START PROCESSING]")
for arquivo_item in tqdm(arquivo_response):
    logger.info("[PROCESS] Article %s" % arquivo_item['linkToNoFrame'])
    processed.append(process_news_piece(arquivo_item))
    
# Pickle processed news
with open('processed.pkl', 'wb') as f:
    pickle.dump(processed, f)
    
def default(o):
    if isinstance(o, (datetime)):
        return o.isoformat()
    
# Also output it to a JSON file
with open('processed.json', 'w') as outfile:
    json.dump(processed, outfile, default=default)

2021-04-20 01:27:27.331 | INFO     | __main__:<module>:3 - [START PROCESSING]


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1279.0), HTML(value='')))

2021-04-20 01:27:27.411 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831150457/http://publico.pt/2019/08/29/local/noticia/baratas-parque-nacoes-1884816
2021-04-20 01:27:29.265 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831150900/http://publico.pt/2019/07/30/sociedade/noticia/ultimos-tres-anos-devolvidas-53-criancas-adopcao-1881728
2021-04-20 01:27:29.848 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831150704/http://publico.pt/2019/08/03/sociedade/noticia/espanha-descobre-fraude-fundos-europeus-galiza-portugal-1882260
2021-04-20 01:27:30.425 | INFO     | utils:is_duplicate:198 - DUP [Unidade de Polícia Judiciária Orgânica da Guarda Civil de Pontevedra, Polícia Judiciária, s1=0.20, s2=1.00, ji=0.20] : Polícia Judiciária
2021-04-20 01:27:30.456 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905204112/htt

2021-04-20 01:27:43.768 | INFO     | utils:is_duplicate:198 - DUP [Lisboa Stargate, Lisboa, s1=0.50, s2=1.00, ji=0.50] : Lisboa
2021-04-20 01:27:43.774 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190325230526/https://acervo.publico.pt/fugas/noticia/moro-numa-ilha-subtropical-1733129
2021-04-20 01:27:44.066 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 19977
2021-04-20 01:27:44.068 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190314164023/https://acervo.publico.pt/culturaipsilon/noticia/comissao-portuguesa-das-sepulturas-de-guerra-1668608
2021-04-20 01:27:44.376 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190314164036/https://acervo.publico.pt/culturaipsilon/noticia/estatua-do-soldado-desconhecido-de-lamego-1667769
2021-04-20 01:27:44.627 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/rep

2021-04-20 01:27:56.392 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 33481
2021-04-20 01:27:56.395 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190925120908/https://acervo.publico.pt/politica/noticia/um-ano-visto-por-antonio-barreto-1680515
2021-04-20 01:27:56.655 | INFO     | utils:is_duplicate:198 - DUP [António Barreto, Barreto, s1=0.50, s2=1.00, ji=0.50] : Barreto
2021-04-20 01:27:56.660 | INFO     | utils:organize_entities:259 - FDUP [Barreto, António Barreto] : Barreto
2021-04-20 01:27:56.662 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831142601/http://publico.pt/2019/06/01/fugas/noticia/resende-filao-cerejas-1874564
2021-04-20 01:27:57.196 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190829090949/https://acervo.publico.pt/mundo/noticia/--e-o-ritmo-e-de-boi---1687742
2021-04-20 01:27:57.633 | INFO     | __main_

2021-04-20 01:28:09.507 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831120017/http://publico.pt/2019/08/30/culturaipsilon/noticia/casorio-legendary-tigerman-calcuta-1884843
2021-04-20 01:28:09.950 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905204955/http://publico.pt/2019/07/10/ciencia/noticia/sairam-africa-cedo-pensava-1879323
2021-04-20 01:28:10.371 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831135056/http://publico.pt/2019/08/23/culto/noticia/cao-companhia-saude-estudo-1884263
2021-04-20 01:28:10.798 | INFO     | utils:is_duplicate:198 - DUP [Mayo Clinic Proceedings, Mayo Clinic, s1=0.67, s2=1.00, ji=0.67] : Mayo Clinic
2021-04-20 01:28:10.803 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190902112541/https://acervo.publico.pt/fugas/noticia/ilhas-o-cafe-da-faja-dos-vimes-1733057
2021-04-20 01:28:1

2021-04-20 01:28:20.673 | INFO     | utils:is_duplicate:198 - DUP [PSV Eindhoven, PSV, s1=0.50, s2=1.00, ji=0.50] : PSV Eindhoven
2021-04-20 01:28:20.677 | INFO     | utils:organize_entities:259 - FDUP [PSV Eindhoven, PSV] : PSV Eindhoven
2021-04-20 01:28:20.679 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905192753/http://publico.pt/2019/08/30/economia/noticia/rendas-vao-aumentar-05-2020-1884921
2021-04-20 01:28:21.082 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905224516/http://publico.pt/2019/05/23/fugas/noticia/caracois-cerveja-beiratejo-siga-cacilhas-1873873
2021-04-20 01:28:21.535 | INFO     | utils:is_duplicate:198 - DUP [Cacilhas, Foto Cacilhas, s1=1.00, s2=0.50, ji=0.50] : Foto Cacilhas
2021-04-20 01:28:21.538 | INFO     | utils:organize_entities:259 - FDUP [Foto Cacilhas, Cacilhas] : Foto Cacilhas
2021-04-20 01:28:21.541 | INFO     | __main__:<module>:5 - [PROCESS] Article https://a

2021-04-20 01:28:33.366 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 33083
2021-04-20 01:28:33.368 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831115656/http://publico.pt/2019/08/30/fugas/noticia/sogevinus-quinta-luiz-enoturismo-douro-1884998
2021-04-20 01:28:33.828 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190925120922/https://acervo.publico.pt/sociedade/noticia/um-ano-a-aprender-a-viver-sem-catarina-1679867
2021-04-20 01:28:34.059 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 10237
2021-04-20 01:28:34.061 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905195051/http://publico.pt/2019/02/16/fugas/noticia/torna-pinhao-portugues-tao-valioso-1861483
2021-04-20 01:28:34.535 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 11193
2021-04-20 01:28:34.53

2021-04-20 01:28:45.094 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831141018/http://publico.pt/2019/08/27/fugas/noticia/festival-vila-bispo-algarve-gastronomia-perceves-1884613
2021-04-20 01:28:45.517 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831120108/http://publico.pt/2019/08/30/desporto/noticia/sporting-vence-benfica-conquista-supertaca-futsal-1885060
2021-04-20 01:28:45.986 | INFO     | utils:is_duplicate:198 - DUP [Supertaça de futsal, Supertaça, s1=0.33, s2=1.00, ji=0.33] : Supertaça
2021-04-20 01:28:45.991 | INFO     | utils:organize_entities:259 - FDUP [Supertaça, Supertaça de futsal] : Supertaça
2021-04-20 01:28:45.993 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905224649/http://publico.pt/2019/05/19/economia/noticia/tst-greve-hoje-segundafeira-aumentos-salariais-1873243
2021-04-20 01:28:46.415 | INFO     | utils:is_duplicate:198 - 

2021-04-20 01:28:57.668 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831141713/http://publico.pt/2019/08/23/fugas/noticia/avenida-voltou-espaco-cultural-referencia-aveiro-1884230
2021-04-20 01:28:58.147 | INFO     | utils:is_duplicate:198 - DUP [Avenida Café-Concerto, Avenida, s1=0.50, s2=1.00, ji=0.50] : Avenida Café-Concerto
2021-04-20 01:28:58.152 | INFO     | utils:organize_entities:259 - FDUP [Avenida Café-Concerto, Avenida] : Avenida Café-Concerto
2021-04-20 01:28:58.154 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831140111/http://publico.pt/2019/08/26/culto/noticia/miguel-quenia-veio-portugal-marca-roupa-1884252
2021-04-20 01:28:58.588 | INFO     | utils:is_duplicate:198 - DUP [Miguel Marques da Costa, Miguel, s1=0.25, s2=1.00, ji=0.25] : Miguel
2021-04-20 01:28:58.595 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831120957/http://publico.p

2021-04-20 01:29:07.041 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 37606
2021-04-20 01:29:07.043 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190615011323/https://acervo.publico.pt/opiniao/noticia/quem-acredita-no-impossivel-1725295
2021-04-20 01:29:07.395 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190620014805/https://acervo.publico.pt/mundo/noticia/religiao-em-alta-velocidade-1629398
2021-04-20 01:29:07.814 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190616010357/https://acervo.publico.pt/opiniao/noticia/quem-acredita-no-impossivel-1725295
2021-04-20 01:29:08.188 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190620014600/https://acervo.publico.pt/mundo/noticia/comboio-pobre-comboio-rico-1631270
2021-04-20 01:29:08.546 | INFO     | __main__:<module>:5 - [PROCESS] Article h

2021-04-20 01:29:15.569 | INFO     | utils:is_duplicate:198 - DUP [Reino Unido da UE, Reino Unido, s1=0.50, s2=1.00, ji=0.50] : Reino Unido
2021-04-20 01:29:15.576 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831141559/http://publico.pt/2019/08/23/fugas/noticia/sandinha-francesinha-come-mao-chegou-centro-cidade-1883389
2021-04-20 01:29:16.031 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905191324/http://publico.pt/2019/08/25/desporto/noticia/daria-bilodid-sporting-nova-bicampea-mundial-judo-1884408
2021-04-20 01:29:16.412 | INFO     | utils:is_duplicate:198 - DUP [Daria Bilodid, Bilodid, s1=0.50, s2=1.00, ji=0.50] : Bilodid
2021-04-20 01:29:16.413 | INFO     | utils:is_duplicate:198 - DUP [Daria Bilodid, Daria, s1=0.50, s2=1.00, ji=0.50] : Daria
2021-04-20 01:29:16.414 | ERROR    | __main__:process_news_piece:76 - [Exception] 'Daria Bilodid'
2021-04-20 01:29:16.417 | INFO     | __main__:<modul

2021-04-20 01:29:27.547 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905194612/http://publico.pt/2019/08/24/mundo/noticia/cimeira-g7-macron-trump-prometem-debater-tabus-1884245
2021-04-20 01:29:27.990 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905191221/http://publico.pt/2019/08/30/desporto/noticia/novo-campeao-mundo-quer-recebido-mundo-dancar-1884973
2021-04-20 01:29:28.397 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831135754/http://publico.pt/2019/08/14/culto/noticia/estudo-defende-expressoes-belicas-excluidas-tratamento-cancro-1883386
2021-04-20 01:29:28.810 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831132851/http://publico.pt/2019/07/16/fugas/noticia/aldeias-xisto-segundo-dark-sky-certificado-pais-1880140
2021-04-20 01:29:29.230 | INFO     | __main__:<module>:5 - [PROCESS] Article https://a

2021-04-20 01:29:38.406 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831145157/http://publico.pt/2019/08/29/mundo/noticia/bolsonaro-proibe-queimadas-brasil-durante-60-dias-1884784
2021-04-20 01:29:38.868 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905201112/http://publico.pt/2019/08/30/desporto/noticia/v-guimaraes-patrocina-regresso-bas-dost-portugal-1885043
2021-04-20 01:29:39.263 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831114625/http://publico.pt/2019/08/31/p3/noticia/artista-transforma-arte-lixo-recolhido-praias-ericeira-1885075
2021-04-20 01:29:39.696 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831141530/http://publico.pt/2019/08/24/fugas/noticia/meianau-ha-boa-grelha-sardinha-pinga-pao-1883997
2021-04-20 01:29:40.120 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/n

2021-04-20 01:29:50.569 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831121103/http://publico.pt/2019/08/29/tecnologia/noticia/vendas-publico-crescem-assinaturas-online-superam-compras-banca-1884869
2021-04-20 01:29:50.997 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831114730/http://publico.pt/2019/08/31/p3/noticia/finlandia-quer-presidencia-ue-menos-plastico-transportes-publicos-1885072
2021-04-20 01:29:51.448 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905221714/http://publico.pt/2018/03/02/sociedade/noticia/direccaogeral-regulariza-pagamentos-de-inspectores-veterinarios-do-oeste-1805106
2021-04-20 01:29:51.880 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905205431/http://publico.pt/2019/08/07/ciencia/noticia/origem-gado-crioulo-americas-nao-afinal-so-iberica-1882699
2021-04-20 01:29:52.310 | INFO

2021-04-20 01:30:01.834 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905201342/http://publico.pt/2019/08/27/culto/noticia/fotografias-consorte-rei-tailandes-fizeram-cair-site-palacio-1884619
2021-04-20 01:30:02.276 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831145338/http://publico.pt/2019/08/27/mundo/noticia/ministro-brasileiro-rejeita-ajuda-g7-amazonia-lembra-notredame-1884547
2021-04-20 01:30:02.674 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905204005/http://publico.pt/2019/08/29/sociedade/noticia/inem-avanca-compra-ambulancias-apos-libertacao-verbas-governo-1884885
2021-04-20 01:30:03.109 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831120912/http://publico.pt/2019/08/30/sociedade/noticia/infarmed-pede-intervencao-pj-uso-indevido-medicamentos-testosterona-1885037
2021-04-20 01:30:03.520 | INFO

2021-04-20 01:30:12.209 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190325224900/https://acervo.publico.pt/mundo/noticia/a-colonia-onde-todas-as-fatumata-tinham-de-se-chamar-maria-1716239
2021-04-20 01:30:12.624 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 52659
2021-04-20 01:30:12.627 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181015083002/https://acervo.publico.pt/culturaipsilon/noticia/dois-filhos-de-leotte-do-rego-que-combateram-em-la-lys-1668803
2021-04-20 01:30:12.963 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181015083553/https://acervo.publico.pt/local/noticia/a-camara-do-porto-quer-as-suas-fotografias-antigas-do-porto-1707314
2021-04-20 01:30:13.265 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181006200400/https://acervo.publico.pt/culturaipsilon/noticia/o-factor-

2021-04-20 01:30:24.354 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831121301/http://publico.pt/2019/08/30/ciencia/noticia/humanos-ja-alteravam-substancialmente-ambiente-ha-tres-mil-anos-1884913
2021-04-20 01:30:24.730 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831141705/http://publico.pt/2019/08/23/fugas/noticia/revista-time-elege-hostel-portugues-100-melhores-lugares-mundo-1884244
2021-04-20 01:30:25.199 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831134216/http://publico.pt/2019/08/25/mundo/noticia/ocidente-assustouse-sistema-credito-social-chines-afinal-nao-existe-1883277
2021-04-20 01:30:25.616 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905201104/http://publico.pt/2019/08/30/desporto/noticia/antonio-salvador-suspenso-132-dias-braga-decisao-injusta-ilegal-1885052
2021-04-20 01:30:26.032 | IN

2021-04-20 01:30:35.361 | INFO     | utils:is_duplicate:198 - DUP [Baixo Sabor, Planalto de Ordenamento da Albufeira do Baixo Sabor, s1=1.00, s2=0.25, ji=0.25] : Planalto de Ordenamento da Albufeira do Baixo Sabor
2021-04-20 01:30:35.365 | INFO     | utils:organize_entities:259 - FDUP [Planalto de Ordenamento da Albufeira do Baixo Sabor, Baixo Sabor] : Planalto de Ordenamento da Albufeira do Baixo Sabor
2021-04-20 01:30:35.368 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190831150317/http://publico.pt/2019/08/29/politica/noticia/assuncao-cristas-incomodame-quererem-colar-cds-partido-nao-nada-cds-1884793
2021-04-20 01:30:35.815 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905213319/http://publico.pt/2019/08/17/sociedade/noticia/picada-caravela-impede-alemao-atravessar-costa-sul-ilha-terceira-nado-1883675
2021-04-20 01:30:36.224 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.p

2021-04-20 01:30:45.808 | INFO     | utils:is_duplicate:198 - DUP [Comunidade Intermunicipal do Médio Tejo, Médio Tejo, s1=0.40, s2=1.00, ji=0.40] : Comunidade Intermunicipal do Médio Tejo
2021-04-20 01:30:45.809 | INFO     | utils:is_duplicate:198 - DUP [Base de Tancos, Tancos, s1=0.33, s2=1.00, ji=0.33] : Tancos
2021-04-20 01:30:45.815 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905223425/http://publico.pt/2017/07/17/sociedade/noticia/cerca-de-150-operacionais-vao-reforcar-combate-ao-fogo-de-alijo-1779352
2021-04-20 01:30:46.235 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905224015/http://publico.pt/2018/01/26/culturaipsilon/noticia/mafalda-veiga-em-solo-absoluto-por-causa-de-uma-guitarra-azul-1800712
2021-04-20 01:30:46.655 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905230254/http://publico.pt/2015/02/23/local/noticia/rodoviaria-do-tejo-inv

2021-04-20 01:30:54.761 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905194056/http://publico.pt/2019/08/19/sociedade/noticia/ha-menos-acucar-refrigerantes-portugueses-quebra-vendas-quase-nao-afecta-bebidas-acucaradas-1883807
2021-04-20 01:30:55.191 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190702012137/https://acervo.publico.pt/sociedade/noticia/vais-para-te-salvar-tentando-salvar-os-outros-1735031
2021-04-20 01:30:55.494 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190725033536/https://acervo.publico.pt/culturaipsilon/noticia/senhor-bellow-senhor-roth-a-grande-pastoral-americana-1745689
2021-04-20 01:30:55.938 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 33083
2021-04-20 01:30:55.941 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190726004326/https://acervo.publico.pt/soc

2021-04-20 01:31:05.344 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181113200403/https://acervo.publico.pt/culturaipsilon/noticia/franca-criou-o-cinema-e-nao-descansou-ao-setimo-dia-1705004
2021-04-20 01:31:06.120 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 23198
2021-04-20 01:31:06.123 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20200117015157/https://acervo.publico.pt/culturaipsilon/noticia/o-ano-vivido-por-marfox-da-quinta-do-mocho-para-o-moma-de-nova-iorque-1680546
2021-04-20 01:31:06.435 | INFO     | utils:is_duplicate:198 - DUP [DJ Marfox, Marfox, s1=0.50, s2=1.00, ji=0.50] : Marfox
2021-04-20 01:31:06.440 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190320000204/https://acervo.publico.pt/fugas/noticia/ilhas-ptst-um-museu-inteiro-construido-por-um-unico-homem-1733150
2021-04-20 01:31:06.772 | INFO     | __main__:<

2021-04-20 01:31:15.151 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190103210221/https://acervo.publico.pt/culturaipsilon/noticia/a-nova-galeria-do-museu-de-arte-antiga-foi-feita-para-seduzir-1738157
2021-04-20 01:31:15.571 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 11294
2021-04-20 01:31:15.573 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190202040821/https://acervo.publico.pt/mundo/noticia/hamburgo-vive-virada-para-o-porto-e-isso-faz-toda-a-diferenca-1707523
2021-04-20 01:31:15.980 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 21685
2021-04-20 01:31:15.982 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190620014543/https://acervo.publico.pt/mundo/noticia/a-primeira-accao-do-governo-orban-recomecar-polemico-monumento-da-ii-guerra-1631709
2021-04-20 01:31:16.349 | INFO     | __main__:<

2021-04-20 01:31:24.519 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190318184357/https://expresso.pt/politica/2019-03-18-Bouum-Quantas-bombas-eleitorais-lancou-Antonio-Costa--1
2021-04-20 01:31:24.793 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 11347
2021-04-20 01:31:24.795 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190319183039/https://expresso.pt/politica/2019-03-18-Bouum-Quantas-bombas-eleitorais-lancou-Antonio-Costa--1
2021-04-20 01:31:25.086 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 11347
2021-04-20 01:31:25.088 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181214213524/https://expresso.pt/sociedade/2018-12-14-Hospital-Sta.-Maria-recebe-equipamentos-de-imagiologia-ineditos-na-Peninsula-Iberica
2021-04-20 01:31:25.356 | INFO     | utils:is_duplicate:198 - DUP [Lisboa, Centro 

2021-04-20 01:31:31.898 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191215200432/https://expresso.pt/revista-de-imprensa/2019-12-14-OE-2020-traz-corte-nas-taxas-moderadoras
2021-04-20 01:31:32.066 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191009050216/https://expresso.pt/economia/2017-09-20-Economistas-independentes-querem-aperto-orcamental-mais-suave
2021-04-20 01:31:32.260 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181219085623/https://expresso.pt/economia/2018-12-17-Mercados-a-espera-do-que-vai-fazer-a-Fed-em-2019
2021-04-20 01:31:32.525 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181227103724/https://expresso.pt/economia/2018-12-17-Mercados-a-espera-do-que-vai-fazer-a-Fed-em-2019
2021-04-20 01:31:32.762 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/2019060619324

2021-04-20 01:31:38.914 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 10044
2021-04-20 01:31:38.916 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191123203407/https://expresso.pt/internacional/2019-11-19-Brasil-ira-aplicar-em-2020-tecnica-que-esteriliza-o-mosquito-do-zika-e-dengue
2021-04-20 01:31:39.101 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190101005604/https://expresso.pt/internacional/2018-12-31-OMS-espera-que-2019-seja-ano-de-reforco-de-verbas-para-combate-a-malaria
2021-04-20 01:31:39.359 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191121010619/https://expresso.pt/economia/2019-11-19-EasyJet-cresce-9-em-Portugal-onde-espera-ter-mais-um-aviao-a-voar-em-2020
2021-04-20 01:31:39.556 | INFO     | utils:is_duplicate:198 - DUP [Portugal, em Portugal, s1=1.00, s2=0.50, ji=0.50] : em Portugal
2021-04-20 01:31:39.560 |

2021-04-20 01:31:45.860 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190710143405/https://expresso.pt/revista-de-imprensa/2019-07-10-PPP-na-Saude-Estado-gastou-4691-milhoes-de-euros-em-2018-o-valor-mais-alto-de-sempre
2021-04-20 01:31:46.037 | INFO     | utils:is_duplicate:198 - DUP [Entidades Gestoras, Entidades Gestoras dos Edifícios, s1=1.00, s2=0.50, ji=0.50] : Entidades Gestoras dos Edifícios
2021-04-20 01:31:46.043 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190124020037/https://expresso.pt/economia/2019-01-15-Miguel-Frasquilho-A-pontualidade-da-TAP-foi-muito-fraca-em-2018-devemos-assumi-lo-e-pedir-desculpa
2021-04-20 01:31:46.274 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190102104140/https://expresso.pt/sociedade/2018-12-17-Acontecimentos-e-figuras-de-2018.-Na-votacao-online-em-curso-Tancos-e-Pedro-Santana-Lopes-lideram-escolha-dos-leitores
2021

2021-04-20 01:31:51.699 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 16425
2021-04-20 01:31:51.702 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191204074419/https://expresso.pt/internacional/2018-11-01-Esperar-sem-saber-pelo-que-se-espera
2021-04-20 01:31:51.957 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 16425
2021-04-20 01:31:51.960 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190210183206/https://expresso.sapo.pt/autores/2018-05-08-Vitor-Matos
2021-04-20 01:31:52.275 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:31:52.277 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190320060704/https://expresso.pt/revista-de-imprensa/2019-03-14-Grupo-de-ativos-toxicos-do-Novo-Banco-arranca-

2021-04-20 01:31:58.322 | ERROR    | __main__:process_news_piece:76 - [Exception] 'José Tolentino de Mendonça'
2021-04-20 01:31:58.324 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190522131434/https://expresso.pt/europeias-2019/2019-05-17-Novos-ventos-sopram-da-Eslovaquia-para-enfrentar-corrupcao-e-populismo
2021-04-20 01:31:58.521 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191204103347/https://expresso.pt/economia/2019-11-04-Efanor-propoe-Angelo-Pauperio-para-a-administracao-da-Sonae-Capital
2021-04-20 01:31:58.734 | INFO     | utils:is_duplicate:198 - DUP [Sonae, Sonae SGPS, s1=1.00, s2=0.50, ji=0.50] : Sonae
2021-04-20 01:31:58.739 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190320055910/https://expresso.pt/economia/2019-03-12-Lucros-de-mil-milhoes-na-EDP-deverao-voltar-em-2022
2021-04-20 01:31:58.922 | INFO     | __main__:<module>:5 - [PROCESS] Arti

2021-04-20 01:32:03.521 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190219051629/https://expresso.pt/economia/2019-02-15-Lidl-investe-70-milhoes-em-Santo-Tirso-1
2021-04-20 01:32:03.762 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190218044655/https://expresso.pt/economia/2019-02-15-Lidl-investe-70-milhoes-em-Santo-Tirso-1
2021-04-20 01:32:04.041 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190710221024/https://expresso.pt/economia/2019-04-15-Centeno-mantem-defice-de-02-apesar-de-rever-o-crescimento-para-19
2021-04-20 01:32:04.267 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190523021420/https://expresso.pt/economia/2019-04-15-Centeno-mantem-defice-de-02-apesar-de-rever-o-crescimento-para-19
2021-04-20 01:32:04.498 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190325053

2021-04-20 01:32:12.609 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190408042100/https://expresso.pt/economia/2018-12-20-Rendas-vao-ter-desconto-no-IRS.-Saiba-o-que-vai-mudar
2021-04-20 01:32:12.881 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181011104215/https://expresso.sapo.pt/economia/2018-09-30-Mercados-em-setembro-aguentaram-se.-Mas-riscos-engordaram
2021-04-20 01:32:13.149 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190930193410/https://expresso.pt/economia/2019-09-23-Nao-queremos-ajudas-do-Estado
2021-04-20 01:32:13.400 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191123092640/https://expresso.pt/economia/2019-11-12-PIB-so-tem-que-crescer-04-para-alcancar-meta-do-Governo-de-19
2021-04-20 01:32:13.648 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190313185631/h

2021-04-20 01:32:21.063 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190127111227/https://expresso.pt/internacional/2019-01-14-Theresa-pede-ajuda-a-TINA-para-salvar-o-Brexit
2021-04-20 01:32:21.425 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190115184158/https://expresso.pt/internacional/2019-01-14-Theresa-pede-ajuda-a-TINA-para-salvar-o-Brexit
2021-04-20 01:32:21.693 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190326055709/https://expresso.pt/economia/2019-03-23-Dois-sinais-de-panico-financeiro-que-vieram-da-Alemanha-e-Estados-Unidos
2021-04-20 01:32:21.997 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190323200609/https://expresso.pt/economia/2019-03-23-Dois-sinais-de-panico-financeiro-que-vieram-da-Alemanha-e-Estados-Unidos
2021-04-20 01:32:22.272 | INFO     | __main__:<module>:5 - [PROCESS] Article https://a

2021-04-20 01:32:28.222 | INFO     | utils:is_duplicate:198 - DUP [Fundo Monetário Internacional, Fundo, s1=0.33, s2=1.00, ji=0.33] : Fundo
2021-04-20 01:32:28.227 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190319055405/https://expresso.pt/economia/2019-03-14-Conselho-das-Financas-Publicas-fase-de-expansao-da-economia-portuguesa-aparenta-ter-terminado
2021-04-20 01:32:28.482 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191224011008/https://expresso.pt/sociedade/2019-12-02-Oito-premios-para-o-melhor--do-turismo-portugues
2021-04-20 01:32:28.692 | INFO     | utils:is_duplicate:198 - DUP [Hotelaria, Escola Superior de Hotelaria, s1=1.00, s2=0.25, ji=0.25] : Hotelaria
2021-04-20 01:32:28.697 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191203211616/https://expresso.pt/sociedade/2019-12-02-Oito-premios-para-o-melhor--do-turismo-portugues
2021-04-20 01:32:28.8

2021-04-20 01:32:35.722 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190308060321/https://expresso.pt/internacional/2019-02-22-Ministro-japones-teve-que-pedir-desculpa-apos-chegar-3-minutos-atrasado
2021-04-20 01:32:35.985 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190319045005/https://expresso.pt/internacional/2019-02-22-Ministro-japones-teve-que-pedir-desculpa-apos-chegar-3-minutos-atrasado
2021-04-20 01:32:36.259 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190802184021/https://expresso.pt/economia/2019-08-02-Textil.-Belgas-que-vestiram-O-Padrinho-investem-em-Mangualde
2021-04-20 01:32:36.493 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905020709/https://expresso.pt/politica/2018-07-15-Movimento-pelo-Interior-dividido-com-medidas-do-Governo.-Treta-ou-sinal-de-esperanca-
2021-04-20 01:32:36.670 | INFO     

2021-04-20 01:32:42.655 | INFO     | utils:is_duplicate:198 - DUP [em Portugal, Portugal, s1=0.50, s2=1.00, ji=0.50] : Portugal
2021-04-20 01:32:42.660 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190313061143/https://expresso.pt/internacional/2019-02-06-Estado-da-Uniao-Trump-insiste-num-muro-e-tenta-galgar-outro
2021-04-20 01:32:42.971 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 10250
2021-04-20 01:32:42.973 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190206182955/https://expresso.pt/internacional/2019-02-06-Estado-da-Uniao-Trump-insiste-num-muro-e-tenta-galgar-outro
2021-04-20 01:32:43.204 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 10250
2021-04-20 01:32:43.206 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190930205534/https://expresso.pt/politica/2019-09-18-Fact-check.-Quatro-que

2021-04-20 01:32:50.093 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190320051801/https://expresso.pt/economia/2019-03-04-Novo-Banco-ganha-novo-responsavel-pelas-financas-apos-pedido-ao-Fundo-de-Resolucao
2021-04-20 01:32:50.414 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190331045457/https://expresso.pt/economia/2019-03-04-Novo-Banco-ganha-novo-responsavel-pelas-financas-apos-pedido-ao-Fundo-de-Resolucao
2021-04-20 01:32:50.688 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181009192722/https://expresso.sapo.pt/economia/2018-09-30-Mercados-em-setembro-aguentaram-se.-Mas-riscos-engordaram
2021-04-20 01:32:50.991 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181217095205/https://expresso.pt/internacional/2018-12-09-Brexit.-Euroceticos-e-trabalhistas-manifestam-se-separadamente-em-Londres
2021-04-20 01:32:51.234 | IN

2021-04-20 01:32:56.898 | INFO     | utils:is_duplicate:198 - DUP [Orçamento do Estado, Orçamento, s1=0.33, s2=1.00, ji=0.33] : Orçamento
2021-04-20 01:32:56.904 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191113200204/https://expresso.pt/internacional/2019-11-13-Venezuela.-ONU-precisa-de-122-mil-ME-para-ajudar-migrantes-e-paises-anfitrioes
2021-04-20 01:32:57.089 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190313193522/https://expresso.pt/blogues/blogue_sem_cerimonia/2019-03-12-A-lista-socratica-do-PS-e-as-propostas-fake-de-Pedro-Marques
2021-04-20 01:32:57.337 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181202201101/https://expresso.sapo.pt/economia/2018-12-02-Trump-e-Xi-concordam-em-tregua-na-guerra-comercial
2021-04-20 01:32:57.616 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181001173804/https://expresso.

2021-04-20 01:33:03.893 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190523011507/https://expresso.pt/economia/2019-04-01-CEO-RESET.-Em-que-investiria-a-Isabel-Vaz-se-tivesse-de-comecar-de-novo-
2021-04-20 01:33:04.088 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190806183745/https://expresso.pt/sociedade/2019-08-06-Governo-reconhece-incendios-de-Vila-de-Rei-e-Macao-como-catastrofe-natural
2021-04-20 01:33:04.325 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190906195737/https://expresso.pt/economia/2019-09-05-Portugal-autorizado-a-avancar-com-reembolso-antecipado-do-emprestimo-europeu
2021-04-20 01:33:04.570 | INFO     | utils:is_duplicate:198 - DUP [FEEF, Conselho de Administração do FEEF, s1=1.00, s2=0.20, ji=0.20] : FEEF
2021-04-20 01:33:04.574 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190905183858/https://

2021-04-20 01:33:10.710 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181116041548/https://expresso.sapo.pt/internacional/2018-11-12-Brexit.-Negociacoes-pela-noite-dentro-para-conseguir-acordo-garantiu-Theresa-May
2021-04-20 01:33:10.991 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181119193047/https://expresso.sapo.pt/internacional/2018-11-16-Volkswagen-vai-investir-mais-de-4000-milhoes-em-fabrica-na-China
2021-04-20 01:33:11.288 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181118190218/https://expresso.sapo.pt/internacional/2018-11-16-Volkswagen-vai-investir-mais-de-4000-milhoes-em-fabrica-na-China
2021-04-20 01:33:11.518 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190610181142/https://expresso.pt/politica/2019-06-10-Marcelo-focado-em-travara-descrenca-nacionalSomos-muito-mais-do-que-fragilidades-e-erros
2021-0

2021-04-20 01:33:17.323 | INFO     | utils:is_duplicate:198 - DUP [Governo, Governo português, s1=1.00, s2=0.50, ji=0.50] : Governo
2021-04-20 01:33:17.328 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181212203249/https://expresso.sapo.pt/internacional/2018-12-06-Brexit.-May-recolhe-apoios-de-ultima-hora-para-unico-acordo-de-saida-possivel
2021-04-20 01:33:17.654 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181213200744/https://expresso.sapo.pt/internacional/2018-12-06-Brexit.-May-recolhe-apoios-de-ultima-hora-para-unico-acordo-de-saida-possivel
2021-04-20 01:33:17.902 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190318055310/https://expresso.pt/economia/2019-03-13-Sabia-que-era-mau-nao-tinha-ideia-que-fosse-tao-mau-diz-quem-acompanha-o-Novo-Banco
2021-04-20 01:33:18.305 | INFO     | utils:is_duplicate:198 - DUP [José Rodrigues de Jesus, Rodrigues de Jesus

2021-04-20 01:33:23.465 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191021184752/https://expresso.pt/economia/2019-10-21-Porsche-quer-SIVA-a-vender-30-mil-carros-por-ano-e-garante-postos-de-trabalho
2021-04-20 01:33:23.636 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181227105105/https://expresso.pt/economia/2018-11-09-Concursos-do-molhe-de-Leixoes-e-Terminal-do-Barreiro-lancados-ate-ao-fim-do-ano
2021-04-20 01:33:23.883 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191109200734/https://expresso.pt/web-summit/2019-11-08-Participantes-na-Web-Summit-gastaram-mais-de-64-milhoes-nos-quatro-dias-do-evento
2021-04-20 01:33:24.075 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191029210901/https://expresso.pt/economia/2019-10-28-Muita-prudencia-em-relacao-a-potencial-nova-droga-da-Biogen-para-Alzheimer-defende-associacao-

2021-04-20 01:33:29.241 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181216091722/https://expresso.pt/blogues/jose-soeiro/2018-10-19-Reformas-antecipadas-o-que-ja-mudou-o-que-se-acordou-e-o-que-devia-mudar
2021-04-20 01:33:29.496 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too long 10737
2021-04-20 01:33:29.498 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181108053102/https://expresso.sapo.pt/internacional/2018-10-18-Nacoes-Unidas.-Abrandamento-drastico-no-acesso-a-internet-a-nivel-mundial-atinge-sobretudo-pobres-e-mulheres
2021-04-20 01:33:29.827 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181019182910/https://expresso.sapo.pt/internacional/2018-10-18-Nacoes-Unidas.-Abrandamento-drastico-no-acesso-a-internet-a-nivel-mundial-atinge-sobretudo-pobres-e-mulheres
2021-04-20 01:33:30.034 | INFO     | __main__:<module>:5 - [PROCESS] 

2021-04-20 01:33:36.862 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:33:36.865 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190711190204/https://expresso.pt/autores/2015-05-04-Virgilio-Azevedo
2021-04-20 01:33:37.145 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:33:37.147 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181012113739/https://expresso.sapo.pt/autores/2015-05-03-Valdemar-Cruz
2021-04-20 01:33:37.561 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:33:37.564 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181107184358/https://expre

2021-04-20 01:33:43.709 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:33:43.711 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190406175214/https://expresso.pt/autores/2015-01-22-Martim-Silva-1
2021-04-20 01:33:43.917 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:33:43.919 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181010172006/https://expresso.sapo.pt/autores/2015-05-03-Valdemar-Cruz
2021-04-20 01:33:44.324 | ERROR    | __main__:process_news_piece:76 - [Exception] News piece is too short [Disponível na capa da Revista E

Se ainda não tem acesso]
2021-04-20 01:33:44.326 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190724011252/https://express

2021-04-20 01:33:50.236 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190824192550/https://www.dn.pt/portugal/interior/amp/almaraz-europa-precisa-de-350-mil-milhoes-de-euros-para-desmantelar-91-centrais---carlos-zorrinho-8587912.html
2021-04-20 01:33:50.455 | INFO     | utils:is_duplicate:198 - DUP [Idanha-a-Nova, Câmara de Idanha-a-Nova, s1=1.00, s2=0.33, ji=0.33] : Câmara de Idanha-a-Nova
2021-04-20 01:33:50.461 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181208191145/https://www.dn.pt/mundo/interior/xi-jinping-a-china-vai-aderir-sempre-ao-principio-do-respeito-mutuo-10279949.html
2021-04-20 01:33:50.839 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181208191831/https://www.dn.pt/cultura/interior/thom-yorke-atua-a-solo-no-nos-alive-10290218.html
2021-04-20 01:33:51.227 | INFO     | utils:is_duplicate:198 - DUP [Thom Yorke, Yorke, s1=0.50, s2=1.00, ji=0.50

2021-04-20 01:33:58.731 | INFO     | utils:is_duplicate:198 - DUP [Assembleia Geral das Nações Unidas, Assembleia Geral, s1=0.40, s2=1.00, ji=0.40] : Assembleia Geral
2021-04-20 01:33:58.736 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181209041953/https://www.dn.pt/default.aspx?referrer=www.globalimagens.pt
2021-04-20 01:33:59.296 | INFO     | utils:is_duplicate:198 - DUP [Assembleia Geral das Nações Unidas, Assembleia Geral, s1=0.40, s2=1.00, ji=0.40] : Assembleia Geral
2021-04-20 01:33:59.302 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191231204142/https://visao.sapo.pt/fotografia/2018-04-24-world-press-photo-2018/
2021-04-20 01:33:59.494 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20181025224437/https://visao.sapo.pt/exame/2018-03-28-Banco-de-Portugal-ve-desemprego-abaixo-dos-6-em-2020
2021-04-20 01:33:59.787 | INFO     | __main__:<module>:5 - [PROCESS

2021-04-20 01:34:06.895 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191218020320/https://visao.sapo.pt/opiniao/2019-04-23-o-circunstancial-vs-o-essencial/
2021-04-20 01:34:07.149 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20190329051651/https://visao.sapo.pt/exame/2019-03-28-Portugal-volta-a-comprar-mais-do-que-vende-ao-exterior
2021-04-20 01:34:07.413 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191204191220/https://visao.sapo.pt/visaojunior/2019-11-08-o-que-vamos-comer-no-futuro/
2021-04-20 01:34:07.632 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191205082247/https://visao.sapo.pt/cultura/2019-03-04-morreu-o-vocalista-dos-the-prodigy/
2021-04-20 01:34:07.841 | INFO     | utils:is_duplicate:198 - DUP [Liam Howlett, Howlett, s1=0.50, s2=1.00, ji=0.50] : Howlett
2021-04-20 01:34:07.846 | INFO     | __main__:<mod

2021-04-20 01:34:13.312 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191106231228/http://visao.sapo.pt/exame/2019-10-24-Belga-Xior-compra-residencias-de-estudantes-a-U.hub-por-130-milhoes-de-euros
2021-04-20 01:34:13.535 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191205073308/https://visao.sapo.pt/sociedade/2019-06-09-pusemos-um-walkman-nas-maos-de-miudos-e-aconteceu-isto/
2021-04-20 01:34:13.742 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191129202651/https://visao.sapo.pt/visaobs/2019-06-05-poupar-na-conta-da-luz-com-estilo-1/
2021-04-20 01:34:13.950 | INFO     | utils:is_duplicate:198 - DUP [LG Electronics, LG, s1=0.50, s2=1.00, ji=0.50] : LG
2021-04-20 01:34:13.955 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191204194427/https://visao.sapo.pt/mundo/2019-03-27-Adiamento-do-Brexit-aprovado-mas-impasse-conti

2021-04-20 01:34:19.378 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191228071800/https://visao.sapo.pt/visaose7e/ver/2019-10-27-woody-allen-fora-de-horas-no-filme-um-dia-de-chuva-em-nova-iorque/
2021-04-20 01:34:19.623 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191205045904/https://visao.sapo.pt/visaose7e/sair/2019-10-25-este-fim-de-semana-ha-jardins-abertos-em-lisboa-e-as-visitas-sao-gratis/
2021-04-20 01:34:19.839 | INFO     | utils:is_duplicate:198 - DUP [Lisboa, Câmara Municipal de Lisboa, s1=1.00, s2=0.25, ji=0.25] : Lisboa
2021-04-20 01:34:19.844 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191205051750/https://visao.sapo.pt/visaose7e/livros-e-discos/2019-09-23-pixies-estao-finalmente-de-volta-com-o-album-beneath-the-eyrie/
2021-04-20 01:34:20.073 | INFO     | __main__:<module>:5 - [PROCESS] Article https://arquivo.pt/noFrame/replay/20191218045348




In [211]:
change_entities = {
    'PER': ('Carlos Carvalhal', 'Fábio Oliveira'),
    'LOC': ('Famalicão', 'FC Mourinha') # ner ... pls 
}


with open('processed.json') as json_file:
    processed = json.load(json_file)
    
def replace_text(text, new_text, initial_char, final_char):
    return text[:initial_char] + new_text + text[final_char:]

def replace_appearances(text, entities, label, entity_before, entity_after):
    if label in entities.keys() and entity_before in entities[label].keys():
        appearances = entities[label][entity_before]
        for initial_char, final_char in appearances:
            text = replace_text(text, entity_after, initial_char, final_char)
    return text
    
def mix_entities(text, entities, change_entities):
    for label, (entity_before, entity_after) in change_entities.items():
        replace_appearances(text, entities, label, entity_before, entity_after)
    return text

article = processed[223]
article['summary'] = mix_entities(article['summary'], article['summary_entities'], change_entities)
article['title'] = mix_entities(article['title'], article['title_entities'], change_entities)
print(article)

def default(o):
    if isinstance(o, (datetime)):
        return o.isoformat()
with open('article.json', 'w') as outfile:
    json.dump(article, outfile, default=default)


{'title': 'Famalicão soma nova vitória e lidera à condição', 'original_title': 'Famalicão soma nova vitória e lidera à condição | Futebol | PÚBLICO', 'original_url': 'http://publico.pt/2019/08/16/desporto/noticia/famalicao-soma-nova-vitoria-lidera-condicao-1883636', 'url': 'https://arquivo.pt/noFrame/replay/20190831132137/http://publico.pt/2019/08/16/desporto/noticia/famalicao-soma-nova-vitoria-lidera-condicao-1883636', 'timestamp': '2019-08-31T13:21:37', 'website': 'publico.pt', 'text': 'Uma bomba do defesa Patrick William, na cobrança de um livre directo (66’), garantiu esta sexta-feira, ante o Rio Ave, a segunda vitória (1-0) do Famalicão e a liderança provisória do campeonato, com seis pontos em dois jogos e sem qualquer golo sofrido. No regresso a casa, na condição de primodivisionário, 25 anos depois do último encontro disputado entre a elite, os famalicenses superaram o Rio Ave de Carlos Carvalhal, que fez a estreia na prova.\n\nO equilíbrio imperou numa noite marcada por alguma

In [115]:
def get_most_frequent_entity_label(processed_list, label, topn=None):
    all_ents = itertools.chain(*[p['summary_entities'][label].keys() for p in processed_list if p['valid'] and label in p['summary_entities'].keys()])
    return Counter(all_ents).most_common(topn)

def get_most_frequent_enteties(processed_list, topn=None):
    most_frequent_entities = dict()
    for label in ['PER', 'LOC', 'ORG', 'MISC']:
        most_frequent_entities[label] = get_most_frequent_entity_label(processed_list, label, topn=10)
    return most_frequent_entities

def get_weighted_random_entity(label, most_frequent_entities):
    return random.choices(*zip(*most_frequent_entities[label]))[0]

most_frequent_enteties = get_most_frequent_enteties(processed, 10)

for label in ['PER', 'LOC', 'ORG', 'MISC']:
    print(label, ': ', get_weighted_random_entity(label, most_frequent_enteties), '\n')

PER :  António Costa 

LOC :  Europa 

ORG :  Lusa 

MISC :  Subscreva 



In [230]:
keys = ["title", "original_title", "url", "original_url", "image", "timestamp", "summary", "fake_details"]
fake = []

for p in processed[:5]:
    if not p['valid']: continue
    
    try:
        # Order by number of occurrences  
        entity_count_per_label = { key: Counter(value.keys()) for key, value in p['summary_entities'].items() }
        
        entities_replaced = []
        for label in ['PER']: #, 'LOC', 'ORG', 'MISC']:
            if label not in entity_count_per_label.keys():
                raise Exception('Zero entities with label ' + label)
            chosen_entity = list(entity_count_per_label[label].keys())[0]
            new_entity = get_weighted_random_entity(label, most_frequent_enteties)
            entities_replaced.append({
                "original_entity_name": chosen_entity,
                "replaced_entity_name": new_entity
            })
            img_url = get_random_arquivo_image(new_entity, show=True)
            print(label, '::', chosen_entity, '=>', new_entity)
    #         entity_count = list(dict(entitie_count[label].most_common()).keys())
    #         print(entity_count)


        p["fake_details"] = {
            "fake_title":  replace_appearances(p['title'], p['title_entities'], 'PER', chosen_entity, new_entity),
            "fake_summary":  replace_appearances(p['summary'], p['summary_entities'], 'PER', chosen_entity, new_entity), 
            "fake_image_url": img_url,
            "entities_replaced": entities_replaced
        }
        p = dict((k, p[k]) for k in keys if k in p.keys())
        
    except Exception as e:
        print(e)
        p['fake_details'] = None
    
    fake.append(p)

2021-04-20 01:40:22.784 | INFO     | __main__:query_arquivo:37 - [FETCHING] https://arquivo.pt/imagesearch?q=%22%E2%80%98Brexit%E2%80%99%22&from=20181001000000&to=20210420000000


2021-04-20 01:40:22.920 | INFO     | __main__:query_arquivo:37 - [FETCHING] https://arquivo.pt/imagesearch?q=%22May%22&from=20181001000000&to=20210420000000


PER :: Elisa Soares => ‘Brexit’


2021-04-20 01:40:23.078 | INFO     | __main__:query_arquivo:37 - [FETCHING] https://arquivo.pt/imagesearch?q=%22Brexit%22&from=20181001000000&to=20210420000000


PER :: Rute Agulhas => May


2021-04-20 01:40:23.188 | INFO     | __main__:query_arquivo:37 - [FETCHING] https://arquivo.pt/imagesearch?q=%22M%C3%A1rio+Centeno%22&from=20181001000000&to=20210420000000


PER :: Frangus => Brexit


PER :: João Gomes Cravinho => Mário Centeno
Zero entities with label PER
