# News Scraper - InfoMoney (Batch Processing)

This notebook uses the `src.scraper` module to collect news from InfoMoney via API.
It iterates over a list of search terms, saves individual CSV files, and then consolidates them into a single dataset.

In [1]:
import glob
import os
import re
import sys
from datetime import datetime

import pandas as pd

# Workaround for Unicode errors on Windows.
# NOTE(review): PYTHONUTF8 is normally read at interpreter startup, so setting it
# here probably only affects child processes started from this kernel — confirm.
os.environ["PYTHONUTF8"] = "1"

# Auto-reload so that edits to the imported scripts are picked up on the next run
%load_ext autoreload
%autoreload 2

# Add the parent of the current working directory (assumed to be the project root)
# to sys.path so the `src` modules can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.scraper import get_news_from_period
from src.utils.logger import setup_logger

# Configure logger to see output in the notebook
logger = setup_logger()

## Parameter Definition
Define the list of search terms, the start and end dates for collection, and the output folder.

In [None]:
# Terms to search for; each one is collected and saved separately.
SEARCH_TERMS = [
    "Itaú",
    "Dólar",
    "Petrobras",
    "Vale",
    "Bolsa de Valores",
    "Bolsa",
    "B3",
    "IBOVESPA",
    "Bradesco",
    "Economia",
]

# Collection window handed to the scraper.
START_DATE = datetime(year=2025, month=1, day=1)
END_DATE = datetime(year=2025, month=12, day=31)

# Folder for the per-term CSV files, relative to the current working directory.
OUTPUT_DIR = os.path.join(os.pardir, "src", "dataset", "scraper", "search")

# Create the folder (and any missing parents); a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Configured to search {len(SEARCH_TERMS)} terms between {START_DATE.date()} and {END_DATE.date()}")
print(f"Results will be saved to: {os.path.abspath(OUTPUT_DIR)}")


## Batch Collection Execution
Iterates over terms, collects data, and saves to CSV in the results folder.

In [None]:
# Collect every search term independently: a failure on one term is reported and
# the loop moves on, and each non-empty result is written to its own CSV file.

# One date stamp for the whole run, so all files of a batch share the same suffix
# even if the collection crosses midnight.
timestamp = datetime.now().strftime('%Y%m%d')

for term in SEARCH_TERMS:
    print(f"\n{'='*50}")
    print(f"Processing term: '{term}'")
    print(f"{'='*50}")

    try:
        # Execute scraper for the current term
        df = get_news_from_period(term=term, start_date=START_DATE, end_date=END_DATE)

        # Treat a missing result like an empty one, in case the scraper returns
        # None instead of an empty DataFrame, rather than reporting it as a failure.
        if df is None or df.empty:
            print(f"\n[WARNING] No news found for '{term}'")
            continue

        # Build a filesystem-safe slug: letters and digits (accented ones included),
        # '-' and '_' are kept; anything else (spaces, '/', ':', '?', ...) becomes '_'.
        safe_term = "".join(
            ch if (ch.isalnum() or ch in "-_") else "_" for ch in term.strip()
        ).lower()
        filename = f"news_{safe_term}_{timestamp}.csv"
        filepath = os.path.join(OUTPUT_DIR, filename)

        # Save to CSV
        df.to_csv(filepath, index=False)
        print(f"\n[SUCCESS] Saved {len(df)} news for '{term}' to: {filepath}")

        # Show the first rows as a quick sanity check of what was collected
        display(df.head(3))

    except Exception as e:
        # Deliberate best effort: report the failure and continue with the next term.
        print(f"\n[ERROR] Failed to collect term '{term}': {e}")


## Consolidate Results
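Reads all CSV files from the results folder, combines them into a single dataset, removes duplicates (based on link), keeps only the news whose title matches the keyword list, and saves the result as a single consolidated CSV.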

In [None]:
# Consolidate the per-term CSV files: concatenate them, drop duplicate links,
# keep only the titles that mention one of KEYWORDS, and write a single CSV.
print("\n" + "="*50 + "\nStarting Consolidation and Filtering...\n" + "="*50)

# Keywords a title must contain (whole word, case-insensitive) to be kept
KEYWORDS = [
    "Ibovespa", "BOVA11", "Bolsa", "Ações", "Mercado", "Câmbio", "Dólar", "Juros", "Selic", "Inflação",
    "IPCA", "Banco Central", "Copom", "Fazenda", "CDI", "Petrobras", "Vale", "Itaú", "Bradesco", "Banco do Brasil",
    "B3", "Ambev", "Eletrobras", "WEG", "Suzano", "Gerdau", "Localiza", "Rumo", "Equatorial", "BTG Pactual"
]

# glob returns paths in arbitrary, OS-dependent order and drop_duplicates keeps the
# first occurrence, so sort the paths to make the surviving duplicate reproducible.
all_files = sorted(glob.glob(os.path.join(OUTPUT_DIR, "news_*.csv")))

if all_files:
    print(f"Found {len(all_files)} files to consolidate.")

    # Read and concatenate all files
    df_list = [pd.read_csv(f) for f in all_files]
    consolidated_df = pd.concat(df_list, ignore_index=True)

    total_rows = len(consolidated_df)
    print(f"Total rows before deduplication and filtering: {total_rows}")

    # Drop duplicates based on 'link' (the same article can be returned by several terms)
    consolidated_df = consolidated_df.drop_duplicates(subset=['link'])
    unique_rows = len(consolidated_df)
    print(f"Total unique after deduplication: {unique_rows}")

    # --- FILTERING ---
    print(f"\nApplying filter with {len(KEYWORDS)} keywords...")
    # \b enforces whole-word matches (avoids partial matches like 'privatiz-ações');
    # re.escape keeps a keyword containing regex metacharacters from altering the pattern.
    pattern = '|'.join(fr'\b{re.escape(k)}\b' for k in KEYWORDS)
    # Convert to string and handle NaN before filtering
    mask = consolidated_df['title'].astype(str).str.contains(pattern, case=False, na=False)
    filtered_df = consolidated_df[mask].copy()

    print(f"Total news after filtering: {len(filtered_df)}")
    print(f"News removed: {unique_rows - len(filtered_df)}")

    # Sort by date, newest first. A stable sort keeps rows with equal dates in a
    # reproducible order.
    # NOTE(review): 'date' is read back from CSV without date parsing, so this is a
    # text sort — chronological only if the stored format sorts lexicographically
    # (e.g. ISO 8601). Confirm against the scraper's output.
    if 'date' in filtered_df.columns:
        filtered_df = filtered_df.sort_values(by='date', ascending=False, kind='mergesort')

    # Save consolidated file
    timestamp = datetime.now().strftime('%Y%m%d')
    # Written to the parent directory of OUTPUT_DIR, so the "news_*.csv" glob above
    # never picks the consolidated file up on a later run.
    parent_dir = os.path.dirname(OUTPUT_DIR)
    output_file = os.path.join(parent_dir, f"consolidated_news_{timestamp}.csv")
    filtered_df.to_csv(output_file, index=False)

    print(f"\n[SUCCESS] Consolidated and filtered file saved to: {output_file}")
    display(filtered_df.head())
else:
    print("No CSV files found in the results folder to consolidate.")