## Importações e configuração

In [2]:
# ==============================================================
# UE Scraper – Versão Notebook (SEM BD)
# ==============================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import time
import plotly.express as px
import plotly.io as pio
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

# Gráficos interativos na célula
pio.renderers.default = "notebook_connected"

HEADERS = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
}

# Selenium (headless)
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)

print("Pronto para scraping da UE!")

Pronto para scraping da UE!


## Funções auxiliares

In [3]:
# ==============================
# FUNÇÕES AUXILIARES
# ==============================

def carregar_pagina_selenium(url):
    """Abre a página com Selenium e devolve o BeautifulSoup."""
    driver.get(url)
    time.sleep(2)                     # espera carregamento JS
    return BeautifulSoup(driver.page_source, "html.parser")


def extrair_paragrafos(soup):
    """Extrai <p> com pelo menos 12 palavras ou que terminam com ponto."""
    paragrafos = []
    for p in soup.find_all("p"):
        txt = p.get_text(strip=True)
        if txt and (len(txt.split()) >= 12 or txt.endswith(".")):
            paragrafos.append(txt)
    return paragrafos or ["NA"]

## Scraper principal (todas as páginas)

In [5]:
# ==============================
# SCRAPER UE – COM URL CORRIGIDA
# ==============================

def scrape_ue(max_page=5):
    print(f"Coletando páginas de {max_page} até 1 …")
    start = time.time()
    articles = []
    base = "https://european-union.europa.eu/news-and-events/news-and-stories_en?page="

    for pg in range(max_page, 0, -1):
        url = base + str(pg)
        print(f"\nPágina {pg}: {url}")

        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, "html.parser")

            bloco = soup.find("div", class_="ecl-content-item-block")
            if not bloco:
                print("   Bloco não encontrado – fim.")
                break

            itens = bloco.find_all("article", class_="ecl-content-item")
            print(f"   {len(itens)} notícias.")

            for item in itens:
                titulo_tag = item.find("div", class_="ecl-content-block__title")
                titulo = titulo_tag.a.get_text(strip=True) if titulo_tag else "NA"

                # === CORREÇÃO DA URL ===
                href = titulo_tag.a["href"] if titulo_tag and titulo_tag.a else "NA"
                if href.startswith("/"):
                    link = "https://european-union.europa.eu" + href
                elif href.startswith("http"):
                    link = href
                else:
                    link = "https://european-union.europa.eu/" + href
                # === FIM DA CORREÇÃO ===

                data_iso = item.find("time")["datetime"] if item.find("time") else None
                data = "NA"
                if data_iso:
                    try:
                        data = datetime.fromisoformat(data_iso.split("T")[0]).strftime("%d/%m/%Y")
                    except:
                        data = data_iso.split("T")[0]

                horario = data_iso.split("T")[1].split("Z")[0] if data_iso and "T" in data_iso else "NA"

                cat_tag = item.find("li", class_="ecl-content-block__primary-meta-item")
                tags = [cat_tag.get_text(strip=True)] if cat_tag else []

                img_tag = item.find("img")
                imagem = img_tag["src"] if img_tag and img_tag.get("src") else "NA"
                imagens = [{"Imagem": imagem}] if imagem != "NA" else []

                parags = ["NA"]
                if link != "NA" and not link.endswith(('.pdf', '.doc')):
                    try:
                        soup_full = carregar_pagina_selenium(link)
                        parags = extrair_paragrafos(soup_full)
                    except Exception as e:
                        print(f"   Erro ao ler {link}: {e}")

                articles.append({
                    "titulo": titulo,
                    "data": data,
                    "horario": horario,
                    "link": link,
                    "tags": " | ".join(tags),
                    "imagem": imagem,
                    "paragrafos": " || ".join(parags[:5]) + ("..." if len(parags) > 5 else ""),
                    "source": "UE News"
                })

            time.sleep(1.0)

        except Exception as e:
            print(f"   Erro na página {pg}: {e}")
            break

    df = pd.DataFrame(articles)
    print(f"\n{len(df)} notícias coletadas em {time.time()-start:.1f}s")
    display(df.head(20).style.set_properties(**{'text-align': 'left', 'white-space': 'pre-wrap'}))
    return df

# trocar para numero necessario
df_ue = scrape_ue(max_page=5)

Coletando páginas de 5 até 1 …

Página 5: https://european-union.europa.eu/news-and-events/news-and-stories_en?page=5
   20 notícias.

Página 4: https://european-union.europa.eu/news-and-events/news-and-stories_en?page=4
   20 notícias.

Página 3: https://european-union.europa.eu/news-and-events/news-and-stories_en?page=3
   19 notícias.

Página 2: https://european-union.europa.eu/news-and-events/news-and-stories_en?page=2
   20 notícias.

Página 1: https://european-union.europa.eu/news-and-events/news-and-stories_en?page=1
   20 notícias.

99 notícias coletadas em 315.2s


Unnamed: 0,titulo,data,horario,link,tags,imagem,paragrafos,source
0,"Citizens see improvements in justice systems in the EU, finds report",01/07/2025,12:00:00,https://ec.europa.eu/commission/presscorner/detail/en/ip_25_1693,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/avportal/P-049982/00-15.jpg?itok=5kc3iN08,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || Today, the European Commission published the thirteenth edition of theEU Justice Scoreboard, an annual report providing comparative data on the efficiency, quality, and independence of the justice systems among EU Member States. It shows that citizens in most Member States perceive judicial independence as having improved or remained stable, compared to last year. || This year's Scoreboard also presents new indicators relevant for the single market, highlighting the essential role of efficient and independent justice systems in fostering a fair and competitive market environment. For example, it shows that companies in 16 Member States commend the autonomy of their national competition authorities. || The findings of this year's Scoreboard will feed into Commission's2025 Rule of Law Report. || Digitalisation continues to make significant strides: nine Member States allow for the digital submission of evidence in civil, commercial, administrative and criminal cases, a noticeable jump from six in 2024. 26 Member States allow to initiate proceedings or file a claim online in civil and commercial cases....",UE News
1,Denmark assumes Presidency of the Council of the European Union,01/07/2025,12:00:00,https://danish-presidency.consilium.europa.eu/,Supplementary information,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/public/2025-06/EU_DK_2025_RGB.png?h=92788cd1&itok=c4lZ5OT4,We use cookies in order to ensure that you can get the best browsing experience possible on this website. Certain cookies are used to obtain aggregated statistics about website visits to help us constantly improve the site and better serve your needs. Other cookies are used to boost performance and guarantee the security of the website.Read more. || This is a machine translation provided by the European Commission’s eTranslation service to help you understand this page.Please read the conditions of use. || Denmark holds the EU Presidency at a defining moment in time. Working at full speed for a strong Europe in a changing world. || The Danish EU Presidency will work for a strong and resolute EU that takes responsibility for its own security and for strengthening its competitiveness. This calls for the EU to match words with action and deliver on the challenges it faces. The green transition is essential to building a more secure and competitive Europe. || The Danish EU Presidency has two overarching priorities: A secure Europe and a competitive and green Europe....,UE News
2,Europass: 20 years of helping people learn and work in Europe,01/07/2025,12:00:00,https://commission.europa.eu/news-and-media/news/europass-20-years-helping-people-learn-and-work-europe-2025-07-01_en,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/public/2025-07/P061271-649476.jpg?h=ac778ff2&itok=Bd-LOtlb,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || All official European Union website addresses are in theeuropa.eudomain. || For 20 years, Europass has been helping millions of people in Europe to plan their learning and career - empowering them to document their skills, enhancing their mobility and connecting them with education and job opportunities. That’s right,today we celebrate 20 years of Europass. || Created back in 2005, Europass helps with creating CVs, cover letters and in finding jobs and courses in the EU. It is a secure, online platform where you can record all your work, education and training experiences, helping you to plan for the next step in your career. || One of the great features of life in the EU is freedom to move, live and work in any EU country. This freedom not only makes people’s lives easier, it also means more opportunities, and Europass is a free set of online tools to help you more easily seize these opportunities....",UE News
3,Europeans consider climate change a priority and support renewable energy,30/06/2025,12:00:00,https://ec.europa.eu/commission/presscorner/detail/en/ip_25_1376,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/public/2025-06/Climate%20change%20pic.jpg?h=adc48829&itok=E4NCLZZ-,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || A large majority ofEuropeans believeclimate change is a serious problem(85%),according to a newEurobarometer surveypublished today. Among those surveyed,8 in 10(81%) support the EU-wide goal of reaching climate neutrality by 2050. From an economic perspective, more than three quarters (77%) of Europeans agree thatthe cost of damage due to climate change is much higher than the investment needed for a net-zero transition. || Most Europeans (85%) agree thattackling climate change should bea priority to improve public health and quality of life.Likewise, 83% of those surveyed agree thatpreparing better for the adverse impactsof climate change will improve the lives of EU citizens. European citizens also feel the impact of climate change in their daily lives. On average, almost4 in 10 Europeans (38%) feel personally exposed to environmental and climate-related risks and threats. In 8 Member States, more than half of those surveyed feel this way; mostly in Southern Europe, but also in Poland and Hungary. || Close to nine in ten Europeans (88%) think it isimportant that the EU take action toincrease renewable energy, and the same number (88%) believe that it is important for the EU to take action toimprove energy efficiency,for example by encouraging people to insulate their home, install solar panels or buy electric cars. Three quarters (75%) believe thatreducing fossil fuel imports will increase energy securityand benefit the EU economically. 77% of Europeansagree that acting on climate change will foster innovation. More than eight in ten Europeans (84%) agree more support should be given to European companies to compete in the global market for clean technologies, demonstrating public backing for the Clean Industrial Deal. || A large majority ofEU citizens are taking individual climate action(92%) and making sustainable choices in their daily lives. However, when asked who is best placed to tackle climate change, only 28% believe they are best placed to turn the tide through individual actions. Citizens identified national governments (66%), the EU (59%) and business and industry (58%) as best placed to tackle climate change. 44% saw regional and local government as well positioned to take climate action....",UE News
4,EU’s first net-positive emissions building to open in Spain,30/06/2025,12:00:00,https://joint-research-centre.ec.europa.eu/jrc-news-and-updates/spain-will-host-first-net-positive-emissions-eu-building-2025-06-30_en,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/public/2025-06/new_european_bauhaus_JRC_seville_SH.jpg?h=79dbda5e&itok=o9ftbWVa,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || All official European Union website addresses are in theeuropa.eudomain. || New JRC site in Seville will fully embody the New European Bauhaus principles. || The European Commission begins construction of its first net-positive energy building in Seville, Spain, this summer. A symbol of sustainability and innovation, the newSeville siteof the Commission’s Joint Research Centre (JRC) will fully embody theNew European Bauhausprinciples. || The project aims to go beyond carbon neutrality by offsetting CO₂ from the atmosphere, primarily through generating solar energy that far exceeds its own operational needs. It will be the first EU institutional building of this scale to achieve net-positive energy....",UE News
5,Flying the EU flag for 40 years!,27/06/2025,12:00:00,https://european-union.europa.eu/flying-eu-flag-40-years-2025-06-27_en,News article,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/avportal/P-066773/00-18.jpg?itok=eFJCcLrl,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || All official European Union website addresses are in theeuropa.eudomain. || The flag of the European Union is turning 40 this June. Its 12 gold stars on a blue background are instantly recognisable and synonymous with the European project that unites all Europeans. With time, it has also become a symbol of the EU’s ideals of unity, solidarity, and harmony among the peoples of Europe. || The flag was first used as the flag of theCouncil of Europein 1955. Following World War Two, the Council of Europe was looking for a flag that would give Europe a symbol with which its inhabitants could identify. It chose the design which best conveyed neutrality, timelessness, and simplicity. || Contrary to a common misconception, the number 12 does not represent the number of EU countries in our Union but rather are a symbol of perfection and stability, and the circle, a symbol of union. The fixed number means the flag remains unchanged regardless of the European Union’s growth....",UE News
6,EU leaders discuss how to strengthen the EU’s position on the global stage,27/06/2025,12:00:00,https://www.consilium.europa.eu/en/meetings/european-council/2025/06/26/,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/avportal/P-067390/00-01.jpg?itok=Alobrqub,,UE News
7,European Accessibility Act enters into force,27/06/2025,12:00:00,https://accessible-eu-centre.ec.europa.eu/content-corner/news/european-accessibility-act-enters-force-2025-06-27_en,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/public/2025-06/European-Accessibility-Act.jpg?h=ef1d7280&itok=D1lHj5kB,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || All official European Union website addresses are in theeuropa.eudomain. || The emergency number 112, banks, public transport and other products and services must comply with accessibility requirements. || The entry into force of theEuropean Accessibility Act (EU Directive 2019/882)marks an important step towards achieving a truly accessible Europe. As of 28 June, the emergency number 112, banks, public transport and other products and services must be accessible to the more than 440 million European citizens, especially the 100 million people with disabilities living in the EU. || The 112 number should enable people with communication difficulties to access emergency services using voice, text or video in real timeand from anywhere in Europe....",UE News
8,"Global leaders unite in support of immunisation, health security and prosperity",26/06/2025,12:00:00,https://commission.europa.eu/topics/public-health/advancing-global-health-gavi-pledging-summit_en,Supplementary information,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/public/2025-06/Vaccination%20pledging%20summit%20GAVI%20gettyimages-156731127-170667a.jpg?h=5af4d83d&itok=DtcmtYdw,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || All official European Union website addresses are in theeuropa.eudomain. || The impact of immunisation goes beyond health – it is about prosperity, security, and economic development. Ensuring equitable access to vaccines is a global priority, and the European Union (EU) plays a leading role in this effort. || In 2003, the EU first partnered up with Gavi, the Vaccine Alliance, to strengthen global health systems and has continued its successful collaboration ever since. || On 25 June 2025 in Brussels, the EU co-hosted - with the Gates Foundation - the Gavi 6.0 High-Level Pledging Summit, with the close support of other Gavi donors and implementing countries. The summit brought together a record number of global leaders from government, partner organisations, vaccine manufacturers, civil society, and the private sector, to secure crucial investments in vaccination programmes....",UE News
9,European rural areas face higher levels of energy poverty,26/06/2025,12:00:00,https://joint-research-centre.ec.europa.eu/jrc-news-and-updates/european-rural-areas-face-higher-levels-energy-poverty-2025-06-26_en,Press release,https://european-union.europa.eu/sites/default/files/styles/oe_theme_ratio_3_2_medium/avportal/P-062237/00-07.jpg?itok=6JVjo1d2,"This site uses cookies. Visit ourcookies policy pageor click the link in any footer for more information and to change your preferences. || All official European Union website addresses are in theeuropa.eudomain. || Around 48 million Europeans are unable to keep their homes warm, with rural households dedicating around 7% of their spending to energy. These are some of the signs of energy poverty, but we need to better understand all the factors to tackle it. || Being energy poor, for a household, means not being able to access the energy they need for essential services like heating and cooling. This has a serious impact on inhabitants’ health and wellbeing, especially in hot summers and cold winters. || EU rural areas are particularly affected due to lower average incomes, coupled with homes that are generally bigger, older, and less energy efficient than buildings in cities, towns and suburbs. However, rural areas are ahead of urban areas and cities in carrying out energy efficiency improvements. They are particularly suited to installing renewable energy systems, thanks to more available land and the high share of buildings owned by their occupants. In rural areas, rooftop photovoltaics (solar panels) could potentially produce 2 200kWh per inhabitant, enough to cover more than a third of average household energy needs each year....",UE News


## Filtro + Gráficos interativos

In [9]:
# ==============================
# FILTRO & GRÁFICOS (100% FUNCIONAL)
# ==============================

keywords = ['digital', 'governança', 'internet', 'IA', 'privacidade', 'regulação']
pattern = r'|'.join(keywords)

df_filt = df_ue[
    df_ue['titulo'].str.contains(pattern, case=False, na=False, regex=True) |
    df_ue['paragrafos'].str.contains(pattern, case=False, na=False, regex=True)
].copy()

print(f"\n{len(df_filt)} notícias filtradas (de {len(df_ue)})")

def plot_charts(df):
    if df.empty:
        print("Sem dados para gráficos.")
        return

    # 1. Top 15
    top15 = df.head(15).copy()
    top15['rank'] = range(1, len(top15)+1)
    fig1 = px.bar(top15, x='rank', y='titulo', orientation='h',
                  title='Top 15 Notícias')
    fig1.update_layout(height=600,
                       yaxis={'categoryorder':'array',
                              'categoryarray':top15['titulo'][::-1]})
    fig1.show()

    # 2. Pizza por tag — CORRIGIDO
    tag_series = df['tags'].str.split(r' \| ').explode()
    if not tag_series.empty:
        tag_counts = tag_series.value_counts().head(10).reset_index()
        tag_counts.columns = ['tag', 'count']  # ← nomes corretos
        fig2 = px.pie(tag_counts, names='tag', values='count', title='Distribuição de Tags')
        fig2.show()
    else:
        print("Nenhuma tag para exibir no gráfico de pizza.")

    # 3. Nuvem de palavras
    text = ' '.join(df['titulo'] + ' ' + df['paragrafos']).lower()
    words = re.findall(r'\b\w{4,}\b', text)
    if words:
        wc = pd.Series(words).value_counts().head(20).reset_index()
        wc.columns = ['palavra', 'freq']
        fig3 = px.treemap(wc, path=['palavra'], values='freq',
                          title='Nuvem de Palavras')
        fig3.show()
    else:
        print("Nenhuma palavra para nuvem.")

plot_charts(df_filt if not df_filt.empty else df_ue)


83 notícias filtradas (de 99)


## Esqueleto do banco (só referência)

In [None]:
# ==============================
# ESQUELETO DO BANCO (NÃO EXECUTADO)
# ==============================
"""
import sqlite3
DB = 'ue_news.db'
def init_db():
    conn = sqlite3.connect(DB)
    conn.execute('''
        CREATE TABLE IF NOT EXISTS news (
            id INTEGER PRIMARY KEY,
            titulo TEXT, data TEXT, horario TEXT, link TEXT UNIQUE,
            tags TEXT, imagem TEXT, paragrafos TEXT
        )
    ''')
    conn.close()
"""
