# Descarga de datos con APIs

In [1]:
# === 1. IMPORTS Y CONFIGURACIÓN ===

from newsapi import NewsApiClient
import requests
import feedparser
import pandas as pd
from datetime import datetime, timedelta
import time

# Claves API (reemplaza si cambian)
NEWSAPI_KEY = 'd7c0734c87d04ed69ea623c77e7406b3'
GNEWS_API_KEY = '8b50c1681d21553b25406f228aca6938'
CRYPTOPANIC_API_KEY = 'b4124ef89cfd6fdce685b3218b21a6378dc6ed01'  # Tu key real

# Inicializar NewsAPI
newsapi = NewsApiClient(api_key=NEWSAPI_KEY)

# RSS feeds ampliados (más fuentes = más data gratis)
RSS_FEEDS = {
    'BTC': [
        "https://feeds.feedburner.com/CoinDesk",
        "https://cointelegraph.com/rss",
        "https://www.newsbtc.com/feed/",
        "https://bitcoinmagazine.com/.rss/full/"
    ],
    'TECH': [
        "https://techcrunch.com/feed/",
        "https://www.theverge.com/rss/index.xml"
    ],
    'MACRO': [
        "https://feeds.bbci.co.uk/news/business/rss.xml",
        "https://www.cnbc.com/id/100003114/device/rss/rss.html",  # CNBC World
        "https://www.ft.com/global-economy?format=rss"
    ]
}

print("✅ Configuración lista. APIs y feeds definidos.")

✅ Configuración lista. APIs y feeds definidos.


In [2]:
# === 2. DEFINICIÓN DE PALABRAS CLAVE POR EJE ===

KEYWORDS_BTC = (
    "(bitcoin OR btc OR cryptocurrency OR 'crypto price' OR 'digital gold' OR halving) "
    "NOT (fraud OR scam OR hack OR theft)"
)

KEYWORDS_TECH = (
    "(blockchain OR web3 OR 'RAM memory' OR 'semiconductor shortage' OR 'AI investment' OR 'DeFi' OR 'mining rig')"
)

KEYWORDS_MACRO = (
    "(FED OR 'interest rate' OR inflation OR recession OR 'treasury bond' OR 'stock market crash' OR 'quantitative easing')"
)

keyword_map = {
    'BTC': KEYWORDS_BTC,
    'TECH': KEYWORDS_TECH,
    'MACRO': KEYWORDS_MACRO
}

# Rango de fechas (cámbialo fácil)
FECHA_INICIO = '2025-12-15'
FECHA_FIN = '2025-12-17'

print("✅ Palabras clave definidas para los 3 ejes.")

✅ Palabras clave definidas para los 3 ejes.


In [3]:
# === 3. FUNCIONES PARA CADA FUENTE (CON DEBUG) ===

def fetch_newsapi(query, from_date, to_date, axis):
    articles = []
    try:
        response = newsapi.get_everything(
            q=query,
            language='en',
            from_param=from_date,
            to=to_date,
            sort_by='relevancy',
            page_size=100
        )
        if response['status'] == 'ok':
            for article in response['articles']:
                articles.append({
                    'publishedAt': article['publishedAt'],
                    'title': article['title'],
                    'description': article['description'],
                    'source': article['source']['name'],
                    'axis': axis
                })
    except Exception as e:
        print(f"❌ NewsAPI error ({axis}): {e}")
    return articles

def fetch_gnews(query, from_date, to_date, axis):
    articles = []
    params = {
        "q": query,
        "from": from_date,
        "to": to_date,
        "lang": "en",
        "max": 100,
        "token": GNEWS_API_KEY
    }
    try:
        response = requests.get("https://gnews.io/api/v4/search", params=params, timeout=15)
        data = response.json()
        print(f"Debug GNews ({axis}): Status {response.status_code}")
        if 'information' in data:
            print(f"  Aviso GNews: {data['information']}")
        raw_articles = data.get('articles', [])
        for article in raw_articles:
            articles.append({
                'publishedAt': article['publishedAt'],
                'title': article['title'],
                'description': article.get('description'),
                'source': article['source']['name'],
                'axis': axis
            })
        print(f"  - {axis} GNews: {len(articles)} artículos (con delay 12h en free)")
    except Exception as e:
        print(f"❌ GNews error ({axis}): {e}")
    return articles

def fetch_cryptopanic():
    articles = []
    params = {
        "auth_token": CRYPTOPANIC_API_KEY,
        "public": "true",
        "kind": "news"
    }
    try:
        response = requests.get("https://cryptopanic.com/api/v1/posts/", params=params, timeout=15)
        print(f"Debug CryptoPanic: Status {response.status_code}")
        print(f"  Respuesta cruda (primeros 200 chars): {response.text[:200]}")
        data = response.json()
        for post in data.get("results", []):
            articles.append({
                'publishedAt': post['published_at'],
                'title': post['title'],
                'description': None,
                'source': post.get('source', {}).get('title', 'CryptoPanic'),
                'axis': 'BTC'
            })
    except Exception as e:
        print(f"❌ CryptoPanic error detallado: {e}")
    return articles

def fetch_rss(axis):
    articles = []
    for feed_url in RSS_FEEDS.get(axis, []):
        feed = feedparser.parse(feed_url)
        print(f"Debug RSS ({axis}, {feed_url}): {len(feed.entries)} entradas")
        for entry in feed.entries:
            articles.append({
                'publishedAt': entry.get('published'),
                'title': entry.title,
                'description': entry.get('summary'),
                'source': feed.feed.get('title', 'RSS'),
                'axis': axis
            })
    return articles

print("✅ Funciones de fetch listas (con debug).")

✅ Funciones de fetch listas (con debug).


In [4]:
# === 4. BUCLE PRINCIPAL DE EXTRACCIÓN ===

start_date = datetime.strptime(FECHA_INICIO, '%Y-%m-%d')
end_date = datetime.strptime(FECHA_FIN, '%Y-%m-%d') + timedelta(days=1)

all_news_data = {'BTC': [], 'TECH': [], 'MACRO': []}

# CryptoPanic y RSS una vez
print("Buscando CryptoPanic...")
all_news_data['BTC'].extend(fetch_cryptopanic())

print("Buscando RSS feeds...")
for axis in ['BTC', 'TECH', 'MACRO']:
    rss_articles = fetch_rss(axis)
    all_news_data[axis].extend(rss_articles)

current_date = start_date
print(f"--- INICIANDO EXTRACCIÓN DÍA A DÍA ---\nRANGO: {FECHA_INICIO} a {FECHA_FIN}")

while current_date < end_date:
    day_str = current_date.strftime('%Y-%m-%d')
    print(f"\nProcesando día: {day_str}")
    
    for axis_name, keyword_query in keyword_map.items():
        newsapi_articles = fetch_newsapi(keyword_query, day_str, day_str, axis_name)
        all_news_data[axis_name].extend(newsapi_articles)
        print(f"  - {axis_name} NewsAPI: {len(newsapi_articles)} artículos")
        
        gnews_articles = fetch_gnews(keyword_query, day_str, day_str, axis_name)
        all_news_data[axis_name].extend(gnews_articles)
        
        time.sleep(0.6)  # Pausa segura
    
    current_date += timedelta(days=1)

print("\n--- EXTRACCIÓN FINALIZADA ---")
for axis, data in all_news_data.items():
    print(f"Total {axis}: {len(data)} artículos")

Buscando CryptoPanic...
Debug CryptoPanic: Status 404
  Respuesta cruda (primeros 200 chars): <!DOCTYPE html>
<html lang="en">
<head>
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <meta name="viewport" cont
❌ CryptoPanic error detallado: Expecting value: line 1 column 1 (char 0)
Buscando RSS feeds...
Debug RSS (BTC, https://feeds.feedburner.com/CoinDesk): 25 entradas
Debug RSS (BTC, https://cointelegraph.com/rss): 30 entradas
Debug RSS (BTC, https://www.newsbtc.com/feed/): 10 entradas
Debug RSS (BTC, https://bitcoinmagazine.com/.rss/full/): 10 entradas
Debug RSS (TECH, https://techcrunch.com/feed/): 20 entradas
Debug RSS (TECH, https://www.theverge.com/rss/index.xml): 10 entradas
Debug RSS (MACRO, https://feeds.bbci.co.uk/news/business/rss.xml): 53 entradas
Debug RSS (MACRO, https://www.cnbc.com/id/100003114/device/rss/rss.html): 30 entradas
Debug RSS (MACRO, https://www.ft.com/global-economy?format

In [None]:
# === 5. CONSOLIDAR Y GUARDAR CSV ===

full_articles = all_news_data['BTC'] + all_news_data['TECH'] + all_news_data['MACRO']
df_raw_news = pd.DataFrame(full_articles)

fecha_actual = datetime.now().strftime('%Y.%m.%d')
filename = f'{fecha_actual}.noticias_raw_sentimiento.csv'

df_raw_news.to_csv(filename, index=False)

print(f"\n✅ Datos guardados en '{filename}'")
print("Total artículos:", len(df_raw_news))
print("\nEjemplo:")
display(df_raw_news.head(10))