In [57]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import json
import requests, datetime
# Rebinds `datetime` to the class: the notebook calls datetime(...) directly
# and must not depend on `from utils import *` to provide it.
from datetime import date, datetime, timedelta
# NOTE(review): star import kept in its original position so that name
# shadowing relative to the imports below is unchanged — prefer explicit names.
from utils import *
from newspaper import Article
from IPython.display import clear_output
from urllib.parse import urlencode, quote_plus
from tqdm.notebook import tqdm
import random
from IPython.display import Image


from loguru import logger
logger.add("process.log")

import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
@logger.catch
def search_arquivo_news(query: str, _from: datetime, _to: datetime, websites: [str], max_items=2000, _type="html", fields="title,tstamp,originalURL,linkToNoFrame"):
    """Run an exact-phrase text search against the Arquivo.pt text-search endpoint.

    Args:
        query: Phrase to search for; it is wrapped in double quotes before sending.
        _from: Start of the capture window, converted with ``arquivo_date``.
        _to: End of the capture window, converted with ``arquivo_date``.
        websites: Site filters, joined with commas into ``siteSearch``.
            A single string is treated as a one-element list.
        max_items: Value sent as ``maxItems``.
        _type: Value sent as ``type``.
        fields: Comma-separated field names sent as ``fields``.

    Returns:
        Whatever ``query_arquivo`` returns for the request, or ``[]`` when no
        site filter was supplied. Because of ``@logger.catch``, an unexpected
        exception is logged and ``None`` is returned instead.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(websites, str):
        websites = [websites]
    site_filters = list(websites)

    # Guard the integer division below: no sites means nothing to search.
    if not site_filters:
        logger.warning("[SEARCH] No websites given for query '%s'; skipping request" % query)
        return []

    params = {
        "q": '"%s"' % query,
        "from": arquivo_date(_from),
        "to": arquivo_date(_to),
        "siteSearch": ",".join(site_filters),
        "fields": fields,
        "type": _type,
        "maxItems": max_items,
        # Original author's note: itemsPerSite does not work (kept for when it does).
        "itemsPerSite": max_items // len(site_filters),
    }

    return query_arquivo(params, "https://arquivo.pt/textsearch", attempts=10)

@logger.catch
def get_random_arquivo_image(query: str, _from: datetime = datetime(2018,10,1), _to: datetime = None, show=False):
    """Pick a random archived image for an exact-phrase query.

    Args:
        query: Phrase to search for; it is wrapped in double quotes before sending.
        _from: Start of the capture window (defaults to 2018-10-01).
        _to: End of the capture window. ``None`` (the default) means "today",
            resolved on every call rather than once when the function is defined.
        show: When true, also render the image in the notebook (300 px wide).

    Returns:
        The ``imgLinkToArchive`` value of a randomly chosen result, or ``None``
        when the search yields no results.
    """
    # Resolve the default at call time: a default of date.today() in the
    # signature would be frozen at definition time in a long-lived kernel.
    if _to is None:
        _to = date.today()

    params = {
        "q": '"%s"' % query,
        "from": arquivo_date(_from),
        "to": arquivo_date(_to)
    }

    response = query_arquivo(params, "https://arquivo.pt/imagesearch", attempts=10)

    # query_arquivo yields [] on a failed request and, being wrapped in
    # @logger.catch, None on an unexpected error; neither can be sampled.
    if not response:
        logger.warning("[IMAGE] No images found for query '%s'" % query)
        return None

    random_image = random.choice(response)['imgLinkToArchive']

    if show:
        display(Image(url=random_image, width=300))
    return random_image


@logger.catch
def query_arquivo(params, endpoint, timeout=30, attempts=1):
    """Send one query to an Arquivo.pt endpoint and return its result items.

    Args:
        params: Query-string parameters for the request.
        endpoint: Full URL of the endpoint to call.
        timeout: Timeout forwarded to ``try_request``.
        attempts: Attempt count forwarded to ``try_request``.

    Returns:
        The list stored under ``response_items`` (or ``responseItems``) in the
        JSON body. ``[]`` is returned when the request yields a falsy response
        or when the body carries neither key.
    """
    logger.info("[FETCHING] %s" % endpoint + '?' + urlencode(params, quote_via=quote_plus))

    r = try_request(endpoint, params, timeout, attempts)
    if not r:
        return []

    # Parse the body once instead of re-decoding it for every lookup.
    payload = r.json()

    # The two key spellings are both checked, snake_case first.
    for items_key in ("response_items", "responseItems"):
        if items_key in payload:
            return payload[items_key]

    # Keep callers' `+=` working: an unexpected payload is reported, not raised.
    logger.warning("[FETCHING] No response items in payload from %s" % endpoint)
    return []

@logger.catch
def process_news_piece(arquivo_item):
    """Download, parse and annotate a single Arquivo.pt search result.

    Args:
        arquivo_item: One result item from the search; it is copied before
            cleaning so the caller's dict is left untouched.

    Returns:
        * ``{"valid": False}`` when ``clean_response_item`` rejects the item.
        * The cleaned article dict with ``"valid": False`` when the archived
          page will never be available, or when parsing raised an exception.
        * The cleaned article dict with ``text``, ``image``, ``summary``,
          ``summary_entities``, ``title_entities`` and ``"valid": True`` on
          success.
        * ``None`` when the page could not be fetched for any other reason.
    """
    # Clean news piece (returns None if some scraping error occurred)
    processing_article = clean_response_item(arquivo_item.copy())
    if processing_article is None:
        return {"valid": False}

    # Check if the arquivo link is up
    html = try_request(processing_article["url"])
    if not html:
        # NOTE(review): presumably try_request returns False for a permanent
        # failure and another falsy value for a transient one — confirm in utils.
        if html is False:  # resource will never be available
            processing_article["valid"] = False
            logger.error("%s will never be available" % (processing_article["url"]))
            # Return the flagged article; a bare return here would throw the flag away.
            return processing_article
        return None

    try:
        # Process using Newspaper3k Package
        # NOTE(review): the keyword is `_language`, not `language` — confirm this is intended.
        article_obj = Article(processing_article["url"], _language="pt")
        article_obj.download(input_html=html.text)
        article_obj.parse()
        article_obj.nlp()

        processing_article["text"] = assert_valid_article(article_obj)
        processing_article["image"] = article_obj.top_image
        processing_article["summary"] = article_obj.summary
        # NOTE(review): "summary_entities" is computed from the full text, not the summary — confirm.
        processing_article["summary_entities"], duplicates = organize_entities(processing_article["text"])
        processing_article["title_entities"], _ = organize_entities(processing_article["title"], duplicates)
        processing_article["valid"] = True

    except Exception as e:
        # Best effort: one unparsable article must not abort the whole batch.
        logger.error("[Exception] " + str(e))
        processing_article["valid"] = False

    return processing_article

# process_news_piece(news[223])

In [39]:

# Start of the capture window shared by every outlet query.
SEARCH_START = datetime(2018, 10, 1)

# Site filters per outlet. The counts are the totals observed when the
# notebook was last run. Filter strings are sent verbatim, trailing spaces included.
NEWS_SITE_FILTERS = {
    # In total 588 Publico news articles.
    "Publico": ['publico.pt noticia -js ', 'acervo.publico.pt noticia -js '],
    # In total 553 Expresso news articles.
    "Expresso": ['expresso.pt 2018', 'expresso.pt 2019', 'expresso.pt 2020', 'expresso.sapo.pt 2018', 'expresso.sapo.pt 2019', 'expresso.sapo.pt 2020'],
    # In total 37 DN news articles.
    "DN": ['www.dn.pt politica', 'www.dn.pt sociedade', 'dn.sapo.pt politica', 'dn.sapo.pt sociedade'],
    # In total 101 Visao news articles.
    "Visao": ['visao.sapo.pt 2018', 'visao.sapo.pt 2019', 'visao.sapo.pt 2020'],
}

arquivo_response = []
for outlet, site_filters in NEWS_SITE_FILTERS.items():
    outlet_items = search_arquivo_news('', SEARCH_START, date.today(), site_filters)
    # search_arquivo_news is wrapped in @logger.catch and therefore returns
    # None after an unexpected error; treat that as "no items" so += keeps working.
    arquivo_response += outlet_items or []

print("In total", len(arquivo_response), "arquivo items. The first 5 items are:\n")
print(arquivo_response[:5])

2021-04-19 12:21:59.575 | DEBUG    | __main__:query_news:20 - [FETCHING] https://arquivo.pt/textsearch?q=%22%22&from=20181001000000&to=20210419000000&siteSearch=publico.pt+noticia+-js+%2Cacervo.publico.pt+noticia+-js+&fields=title%2Ctstamp%2CoriginalURL%2ClinkToNoFrame&type=html&maxItems=2000&itemsPerSite=1000
2021-04-19 12:22:02.054 | DEBUG    | __main__:query_news:20 - [FETCHING] https://arquivo.pt/textsearch?q=%22%22&from=20181001000000&to=20210419000000&siteSearch=expresso.pt+2018%2Cexpresso.pt+2019%2Cexpresso.pt+2020%2Cexpresso.sapo.pt+2018%2Cexpresso.sapo.pt+2019%2Cexpresso.sapo.pt+2020&fields=title%2Ctstamp%2CoriginalURL%2ClinkToNoFrame&type=html&maxItems=2000&itemsPerSite=333
2021-04-19 12:22:32.130 | ERROR    | utils:try_request:34 - [HTTPSConnectionPool(host='arquivo.pt', port=443): Read timed out. (read timeout=30)] for [{'q': '""', 'from': '20181001000000', 'to': '20210419000000', 'siteSearch': 'expresso.pt 2018,expresso.pt 2019,expresso.pt 2020,expresso.sapo.pt 2018,expres

In total 1279 arquivo items. The first 5 items are:

[{'title': 'Praga de baratas “grandes” aflige Parque das Nações | Lisboa | PÚBLICO', 'originalURL': 'http://publico.pt/2019/08/29/local/noticia/baratas-parque-nacoes-1884816', 'tstamp': '20190831150457', 'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20190831150457/http://publico.pt/2019/08/29/local/noticia/baratas-parque-nacoes-1884816'}, {'title': 'Mais de 50 crianças adoptadas foram devolvidas nos últimos três anos | Adopção | PÚBLICO', 'originalURL': 'http://publico.pt/2019/07/30/sociedade/noticia/ultimos-tres-anos-devolvidas-53-criancas-adopcao-1881728', 'tstamp': '20190831150900', 'linkToNoFrame': 'https://arquivo.pt/noFrame/replay/20190831150900/http://publico.pt/2019/07/30/sociedade/noticia/ultimos-tres-anos-devolvidas-53-criancas-adopcao-1881728'}, {'title': 'Empresas de Portugal e Galiza suspeitas de se apropriarem de 20 milhões de fundos europeus | Justiça', 'originalURL': 'http://publico.pt/2019/08/03/sociedade/notic

In [41]:
def default(o):
    """Fallback serialiser handed to ``json.dump`` for non-JSON-native objects.

    Dates and datetimes are written as ISO-8601 strings. Any other type is
    written as ``null``, and a warning is logged so the loss is visible in
    ``process.log`` instead of passing silently.
    """
    if isinstance(o, (datetime, date)):
        return o.isoformat()
    # Best effort: keep the export running, but leave a trace of what was dropped.
    logger.warning("[JSON] Cannot serialise object of type %s; writing null" % type(o).__name__)
    return None


processed = []

logger.info("[START PROCESSING]")
for arquivo_item in tqdm(arquivo_response):
    logger.info("[PROCESS] Article %s" % arquivo_item['linkToNoFrame'])
    # Entries may be None or carry "valid": False; they are kept so that
    # positions stay aligned with arquivo_response.
    processed.append(process_news_piece(arquivo_item))

# Pickle processed news
with open('processed.pkl', 'wb') as f:
    pickle.dump(processed, f)

# Also output it to a JSON file
with open('processed.json', 'w') as outfile:
    json.dump(processed, outfile, default=default)