<a href="https://colab.research.google.com/github/anamacao/FAPESP-PIBIC-scrapping/blob/main/camaraFederal_ipynb_Ana.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## camaraFederal.ipynb

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import sqlite3

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [2]:
# %%
# Request headers carrying a desktop-browser User-Agent string; the scraping
# cell passes this dict to requests.get when it downloads a listing page.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Address of the institutional-news listing on the Chamber of Deputies site.
# NOTE(review): no visible cell reads BASE_URL directly -- presumably the URL
# builder used by the scraping loop depends on it; confirm.
BASE_URL = "https://www.camara.leg.br/noticias/noticias-institucionais"

print("‚úÖ Scraper da C√¢mara (Institucionais) pronto!")

‚úÖ Scraper da C√¢mara (Institucionais) pronto!


In [3]:
DATABASE_NAME = "internet_governance_news.db"

def create_database():
    """Create the ``articles`` table in the SQLite file ``DATABASE_NAME``.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this more
    than once is harmless. ``url`` is declared ``UNIQUE``; inserting the same
    URL twice therefore raises ``sqlite3.IntegrityError``.

    The connection is always closed, including when the DDL or the commit
    raises; in that case the exception propagates and the success message is
    not printed.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            date TEXT,
            author TEXT,
            url TEXT UNIQUE,
            source TEXT
        )
    """
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        conn.execute(ddl)
        conn.commit()
    finally:
        # Release the database handle even if the statement above failed.
        conn.close()
    print("‚úÖ Banco e tabela 'articles' prontos!")

create_database()

‚úÖ Banco e tabela 'articles' prontos!


‚úÖ Banco e tabela 'articles' prontos!


In [None]:
def insert_article(title, date, author, url, source):
    """Store one article row in the ``articles`` table of ``DATABASE_NAME``.

    Args:
        title, date, author, url, source: values bound, in that order, to the
            columns of the same names.

    Returns:
        ``True`` when the row was inserted and committed, ``False`` when
        SQLite raised ``IntegrityError`` (with the schema built by
        ``create_database`` that is the ``UNIQUE`` constraint on ``url``).

    A new connection is opened for each call and closed before returning.
    """
    row = (title, date, author, url, source)
    db = sqlite3.connect(DATABASE_NAME)
    try:
        db.execute("""
            INSERT INTO articles (title, date, author, url, source)
            VALUES (?, ?, ?, ?, ?)
        """, row)
        db.commit()
    except sqlite3.IntegrityError:
        # Constraint violation: report it to the caller instead of raising.
        inserted = False
    else:
        inserted = True
    finally:
        db.close()
    return inserted

In [None]:
# %%
def load_articles_from_db():
    """Read every row of the ``articles`` table into a DataFrame.

    Returns:
        pandas.DataFrame with all columns of ``articles``, ordered by the
        ``date`` column, descending.

    The connection is closed even when the query fails (for example when the
    table does not exist yet); the exception then propagates to the caller.
    """
    # NOTE(review): ``date`` is a TEXT column, so ORDER BY compares strings.
    # That is only chronological if dates are stored in a sortable format
    # such as ISO 8601 -- confirm what the scraper actually stores.
    query = """
        SELECT *
        FROM articles
        ORDER BY date DESC
    """
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        return pd.read_sql(query, conn)
    finally:
        conn.close()

df_db = load_articles_from_db()
display(df_db.head())
print(f"üì¶ Total no banco: {len(df_db)} registros")

In [None]:
# %%
noticias = []
# Number of listing pages to walk (hard-coded). Pages are visited from
# TOTAL_PAGES down to 1.
TOTAL_PAGES = 78

# Keys of every record appended to `noticias`; also used as the DataFrame
# columns so that an empty crawl still yields the expected schema.
COLUNAS_NOTICIA = ["titulo", "data", "link", "paragrafos", "fonte"]

# NOTE(review): montar_url() and extrair_paragrafos() are not defined in any
# cell visible in this notebook, so this cell raises NameError on a fresh
# kernel. Presumably montar_url(pagina) builds the listing URL for a page
# number and extrair_paragrafos(link) returns the article's paragraphs as an
# iterable of strings -- confirm and restore their definitions above this cell.
for pagina in range(TOTAL_PAGES, 0, -1):
    url = montar_url(pagina)
    print(f"üìÑ Coletando p√°gina {pagina}: {url}")

    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as exc:
        # A timeout or connection error on one page must not abort the
        # whole crawl: report it and move on to the next page.
        print("‚ö†Ô∏è Erro ao acessar p√°gina")
        print(f"   {exc}")
        time.sleep(1)
        continue

    if r.status_code != 200:
        print("‚ö†Ô∏è Erro ao acessar p√°gina")
        # Keep the politeness delay on the error path as well.
        time.sleep(1)
        continue

    soup = BeautifulSoup(r.text, "html.parser")

    # Selectors for the institutional-news listing markup.
    itens = soup.select("li.l-lista-noticias__item")
    print(f"   {len(itens)} not√≠cias encontradas")

    for item in itens:
        artigo = item.select_one("article.g-chamada")
        if not artigo:
            continue

        # Title and link; skip entries without an anchor or without an href.
        titulo_tag = artigo.select_one(".g-chamada__titulo a")
        if not titulo_tag:
            continue

        titulo = titulo_tag.get_text(strip=True)
        link = titulo_tag.get("href")
        if not link:
            continue

        # Date and time as shown on the listing ("NA" when the tag is absent).
        date_tag = artigo.select_one(".g-artigo__data-hora")
        data_raw = date_tag.get_text(strip=True) if date_tag else "NA"

        # Article text, via the helper flagged in the note above.
        paragrafos = extrair_paragrafos(link)

        # Accumulate the record in memory.
        noticias.append({
            "titulo": titulo,
            "data": data_raw,
            "link": link,
            "paragrafos": " || ".join(paragrafos),
            "fonte": "C√¢mara dos Deputados"
        })

        # Persist to the database; insert_article returns False (ignored
        # here) when the insert hits an IntegrityError.
        insert_article(
            title=titulo,
            date=data_raw,
            author="Ag√™ncia C√¢mara",
            url=link,
            source="C√¢mara dos Deputados"
        )

    time.sleep(1)

print(f"\n‚úÖ Total coletado: {len(noticias)} not√≠cias")

# Explicit columns keep df_camara usable by later cells even when nothing
# was collected.
df_camara = pd.DataFrame(noticias, columns=COLUNAS_NOTICIA)
display(df_camara.head())

In [None]:
def load_articles():
    """Read every row of the ``articles`` table into a DataFrame.

    Runs the same query as ``load_articles_from_db`` defined in an earlier
    cell (all columns, ordered by ``date`` descending).

    Returns:
        pandas.DataFrame with the contents of ``articles``.

    The connection is closed even when the query fails; the exception then
    propagates to the caller.
    """
    # NOTE(review): ``date`` is a TEXT column, so ORDER BY compares strings;
    # the order is chronological only for sortable date formats -- confirm.
    query = """
        SELECT * FROM articles
        ORDER BY date DESC
    """
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        return pd.read_sql(query, conn)
    finally:
        conn.close()

df_db = load_articles()
print(f"üì¶ Total no banco: {len(df_db)} registros")
display(df_db.head(20))

In [None]:
keywords = ['digital', 'internet', 'IA', 'tecnologia', 'dados', 'privacidade']

# Build the search regex in two parts:
#   * ordinary keywords: case-insensitive substring match, as before;
#   * all-uppercase keywords (acronyms such as "IA"): whole word only and
#     case-sensitive. Matched as a case-insensitive substring, "IA" hits any
#     text that merely contains the letters "ia", so the filter would keep
#     practically every article.
kw_acronyms = [re.escape(k) for k in keywords if k.isupper()]
kw_terms = [re.escape(k) for k in keywords if not k.isupper()]

pattern_parts = []
if kw_terms:
    # Scoped inline flag: only this group ignores case.
    pattern_parts.append(r'(?i:' + r'|'.join(kw_terms) + r')')
if kw_acronyms:
    pattern_parts.append(r'\b(?:' + r'|'.join(kw_acronyms) + r')\b')
pattern = r'|'.join(pattern_parts)

# Keep rows whose title or body text matches; .copy() detaches the result
# from df_camara. Case handling lives inside the pattern, so the default
# case-sensitive mode of str.contains is used here.
df_filt = df_camara[
    df_camara['titulo'].str.contains(pattern, na=False, regex=True) |
    df_camara['paragrafos'].str.contains(pattern, na=False, regex=True)
].copy()

print(f"{len(df_filt)} not√≠cias filtradas (de {len(df_camara)})")
display(df_filt.head())

In [None]:
def plot_charts(df):
    """Show three Plotly figures built from an articles DataFrame.

    Figures, in display order:
      1. horizontal bar chart of the first 15 rows (``rank`` on x, ``title``
         on y);
      2. pie chart of row counts per ``source`` value;
      3. treemap of the 20 most frequent words of four or more characters
         found in the lower-cased titles.

    Args:
        df: DataFrame providing ``title`` and ``source`` columns. It is not
            modified.

    Returns:
        None. When ``df`` is empty a message is printed and nothing is drawn.
    """
    if df.empty:
        print("‚ùå Sem dados para gr√°ficos")
        return

    # --- Figure 1: first 15 rows, numbered 1..n in a new `rank` column ---
    first_rows = df.head(15)
    top15 = first_rows.assign(rank=range(1, len(first_rows) + 1))
    bar_fig = px.bar(
        top15,
        x='rank',
        y='title',
        orientation='h',
        title='Top 15 Not√≠cias ‚Äì Internet Governance'
    )
    bar_fig.update_layout(height=600)
    bar_fig.show()

    # --- Figure 2: share of rows per source ---
    per_source = (
        df["source"]
        .value_counts()
        .reset_index()
        .set_axis(["source", "count"], axis=1)
    )
    pie_fig = px.pie(
        per_source,
        names="source",
        values="count",
        title="Distribui√ß√£o por Fonte"
    )
    pie_fig.show()

    # --- Figure 3: word frequencies in titles, drawn as a treemap ---
    all_titles = ' '.join(df['title'].astype(str)).lower()
    tokens = re.findall(r'\b\w{4,}\b', all_titles)
    word_freq = (
        pd.Series(tokens)
        .value_counts()
        .head(20)
        .reset_index()
        .set_axis(['palavra', 'freq'], axis=1)
    )
    tree_fig = px.treemap(
        word_freq,
        path=['palavra'],
        values='freq',
        title='Nuvem de Palavras ‚Äì T√≠tulos'
    )
    tree_fig.show()

In [None]:
plot_charts(df_db)