# 📌 Scraping de Instagram - 8 Ciudades

Este notebook realiza scraping en **Instagram** usando Playwright para recorrer **8 ciudades** y varias **palabras clave**.

Incluye:
- Guardado de sesión en `instagram_sesion.json`.
- Manejo de errores 429 (Too Many Requests) con backoff exponencial.
- Scroll infinito para recolectar posts.
- Extracción de `texto` y `fecha`.
- Pausas aleatorias entre scrolls y visitas a posts para simular una navegación humana.
- Limpieza de duplicados.
- Guardado en un único JSON estructurado por ciudad.

## 📦 Instalación

In [None]:
!pip install playwright
!playwright install





## 1️⃣ Importar librerías

In [None]:
import asyncio
from playwright.async_api import async_playwright
import json
import random
from datetime import datetime

## 2️⃣ Definir ciudades y palabras clave

In [None]:
# Destinations to scrape; each one is combined with every keyword to form a hashtag query.
ciudades = [
    "Tenerife",
    "Barcelona",
    "Madrid",
    "Malaga",
    "Gran Canaria",
    "Sevilla",
    "Valencia",
    "Mallorca",
]

# Search keywords (Spanish first, then their English counterparts) appended to each city name.
palabras_clave = [
    "turismo",
    "viajar",
    "que hacer",
    "actividades",
    "atracciones",
    "tourism",
    "travel",
    "what to do",
]

## 3️⃣ Guardar sesión de Instagram

In [None]:
async def guardar_sesion_instagram():
    """Open a visible browser for a manual Instagram login and persist the session.

    Navigates to the login page, then waits up to two minutes for an
    ``article`` element to appear, which is taken as the sign that the login
    succeeded. On success the browser storage state is written to
    ``instagram_sesion.json``. Any failure while waiting or saving is printed
    rather than raised.
    """
    login_url = "https://www.instagram.com/accounts/login/"
    session_file = "instagram_sesion.json"

    async with async_playwright() as playwright:
        # Headed mode is required: the user types the credentials by hand.
        browser = await playwright.chromium.launch(headless=False)
        ctx = await browser.new_context()
        login_page = await ctx.new_page()

        await login_page.goto(login_url)
        print("👉 Inicia sesión manualmente en la ventana abierta.")

        try:
            # Block until the logged-in page renders an <article>, or time out.
            await login_page.wait_for_selector('article', timeout=120_000)
            print("✅ Sesión iniciada correctamente.")
            await ctx.storage_state(path=session_file)
            print("💾 Sesión guardada en 'instagram_sesion.json'.")
        except Exception as err:
            print(f"Error durante el login: {err}")

        print("Puedes cerrar el navegador manualmente si quieres.")

# Ejecutar una vez para guardar sesión
# await guardar_sesion_instagram()

## 4️⃣ Manejo de error 429 (Too Many Requests)

In [None]:
async def _rate_limit_visible(page):
    """Return True if the page shows a 429 marker, or if the check itself fails."""
    try:
        marker = (
            await page.query_selector('text="429"')
            or await page.query_selector('text="Too Many Requests"')
        )
    except Exception as e:
        # A page we cannot inspect is treated as still blocked so the caller backs off.
        print(f"Error verificando 429: {e}")
        return True
    return bool(marker)


async def handle_429(page, max_attempts=5, base_delay=64):
    """Wait out an HTTP 429 (Too Many Requests) page using exponential backoff.

    The page is checked for a rate-limit marker. While the marker is present
    the coroutine sleeps ``base_delay * 2 ** attempt`` seconds, reloads the
    page and checks again, for at most ``max_attempts`` waits.

    Args:
        page: Playwright page (or any object exposing async
            ``query_selector`` and ``reload``) to inspect.
        max_attempts: Maximum number of backoff waits before giving up.
        base_delay: Seconds to wait on the first attempt; each further
            attempt doubles it. The default of 64 yields 64, 128, 256, ...

    Returns:
        True if the page is free of the rate-limit marker (immediately or
        after retrying), False if it is still rate limited once every
        attempt has been used.
    """
    for attempt in range(max_attempts):
        if not await _rate_limit_visible(page):
            return True

        wait_time = base_delay * 2 ** attempt
        print(f"⚠️ Error 429 detectado. Esperando {wait_time} segundos (intento {attempt+1}/{max_attempts})...")
        await asyncio.sleep(wait_time)

        # Reload so the next check inspects a fresh response instead of the
        # stale error page that triggered the wait.
        try:
            await page.reload()
        except Exception as e:
            print(f"Error recargando la página: {e}")

    # Final check after the last backoff, so a recovery on the last reload counts.
    if not await _rate_limit_visible(page):
        return True

    print("❌ Máximo de reintentos alcanzado para 429.")
    return False

## 5️⃣ Función para scrapear posts de un hashtag

In [None]:
async def _collect_post_links(page, max_posts, max_scrolls=50):
    """Scroll the hashtag grid and return up to ``max_posts`` post hrefs in discovery order."""
    found = {}  # dict used as an insertion-ordered set, so the result order is deterministic
    scrolls = 0
    sin_cambios = 0

    while len(found) < max_posts and scrolls < max_scrolls:
        count_before = len(found)

        # Two wheel movements with randomized pauses to trigger lazy loading.
        await page.mouse.wheel(0, random.randint(1000, 3000))
        await page.wait_for_timeout(random.uniform(5000, 10000))
        await page.mouse.wheel(0, 5000)
        await page.wait_for_timeout(random.uniform(20000, 40000))

        for link in await page.query_selector_all('a[role="link"][tabindex="0"]'):
            href = await link.get_attribute('href')
            if href and ("/p/" in href or "/reel/" in href):
                found[href] = None
                if len(found) >= max_posts:
                    break

        # Stop after three consecutive scrolls that yield nothing new.
        sin_cambios = sin_cambios + 1 if len(found) == count_before else 0
        if sin_cambios >= 3:
            print("No hay más posts nuevos, deteniendo.")
            break

        scrolls += 1
        print(f"Scroll {scrolls}: {len(found)} links recopilados")

    return list(found)


async def _extract_post(context, full_url):
    """Open one post in a new tab and return ``{"texto", "fecha"}``, or None if unusable."""
    # Imported locally because the file-level import only brings in async_playwright.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    post_page = await context.new_page()
    try:
        await post_page.goto(full_url)
        if not await handle_429(post_page):
            return None

        # Best effort only: the page may never become network-idle, and a
        # timeout here must not discard a post whose DOM is already loaded.
        try:
            await post_page.wait_for_load_state('networkidle', timeout=30_000)
        except PlaywrightTimeoutError:
            print(f"Aviso: 'networkidle' no alcanzado en {full_url}; se continúa con la extracción.")
        await post_page.wait_for_timeout(random.uniform(5000, 10000))

        texto_el = await post_page.query_selector('span[class*="x193iq5w"]')
        texto = (await texto_el.inner_text()).strip() if texto_el else ""
        if not texto:
            return None

        fecha_el = await post_page.query_selector('time[datetime]')
        if fecha_el:
            fecha_text = await fecha_el.get_attribute('datetime')
        else:
            # No <time> element found: fall back to the scrape time.
            fecha_text = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
        return {"texto": texto, "fecha": fecha_text}
    except Exception as e:
        print(f"Error extrayendo {full_url}: {e}")
        return None
    finally:
        # Single close point; covers the 429, error and success paths alike.
        await post_page.close()


async def scrape_instagram_posts(query, max_posts=50):
    """Scrape the text and date of posts listed under an Instagram hashtag.

    The hashtag is ``query`` lower-cased with all whitespace removed. The
    function loads the saved session from ``instagram_sesion.json``, opens
    the hashtag page, scrolls to collect post links and then visits each
    post to read its caption text and timestamp.

    Args:
        query: Free-text query, e.g. ``"Tenerife turismo"``.
        max_posts: Maximum number of post links to collect and visit.

    Returns:
        A list of ``{"texto": str, "fecha": str}`` dicts, one per post with
        non-empty text. Returns an empty list if the hashtag page cannot be
        loaded or stays rate limited.
    """
    hashtag = "".join(query.split()).lower()
    hashtag_url = f"https://www.instagram.com/explore/tags/{hashtag}/"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(storage_state="instagram_sesion.json")
            page = await context.new_page()

            try:
                await page.goto(hashtag_url)
                if not await handle_429(page):
                    return []
                print(f"🔎 Navegando a hashtag: #{hashtag}")
            except Exception as e:
                print(f"Error navegando a {hashtag_url}: {e}")
                return []

            try:
                await page.wait_for_selector('main[role="main"]', timeout=30_000)
                await page.wait_for_timeout(random.uniform(10000, 15000))
            except Exception as e:
                print(f"Error cargando página de posts: {e}")
                return []

            post_links = await _collect_post_links(page, max_posts)

            posts = []
            total = len(post_links)
            for i, href in enumerate(post_links, 1):
                full_url = f"https://www.instagram.com{href}"
                print(f"Procesando post {i}/{total}: {full_url}")
                post = await _extract_post(context, full_url)
                if post is not None:
                    posts.append(post)

            print(f"✅ Total posts registrados: {len(posts)}")
            return posts
        finally:
            # One close for every exit path, including unexpected exceptions.
            await browser.close()

## 6️⃣ Ejecutar scraping para todas las ciudades y guardar JSON

In [None]:
def _dedupe_posts(posts):
    """Return ``posts`` without repeats, keeping the first post per (texto, fecha) pair."""
    vistos = set()
    unicos = []
    for post in posts:
        key = (post['texto'], post['fecha'])
        if key not in vistos:
            vistos.add(key)
            unicos.append(post)
    return unicos


def _save_results(resultados, path="instagram_resultados.json"):
    """Write the per-city results to ``path`` as UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(resultados, f, indent=4, ensure_ascii=False)


async def main():
    """Scrape every city/keyword combination and save the posts grouped by city.

    For each entry in ``ciudades`` the function runs one hashtag scrape per
    entry in ``palabras_clave``, merges the results, drops duplicate
    (texto, fecha) pairs and stores the list under the city name. The output
    file ``instagram_resultados.json`` is rewritten after each city so that
    an interrupted run keeps the cities already finished.
    """
    resultados = {}
    for ciudad in ciudades:
        print(f"\n🌍 Procesando ciudad: {ciudad}")
        posts_ciudad = []
        for palabra in palabras_clave:
            query = f"{ciudad} {palabra}"
            print(f"   ➡️ Buscando: {query}")
            try:
                posts = await scrape_instagram_posts(query, max_posts=30)
            except Exception as e:
                # One failed query must not abort the remaining ones.
                print(f"   ❌ Error en la búsqueda '{query}': {e}")
                continue
            posts_ciudad.extend(posts)

        posts_unicos = _dedupe_posts(posts_ciudad)
        resultados[ciudad] = posts_unicos
        print(f"✅ {ciudad}: {len(posts_unicos)} posts únicos guardados")

        # Checkpoint: the full run takes a long time, so persist progress now.
        _save_results(resultados)

    # Final write also covers the case where there were no cities to process.
    _save_results(resultados)
    print("\n🎉 Scraping finalizado. Resultados guardados en instagram_resultados.json")


## 7️⃣ Ejecutar el programa

In [None]:
# Top-level `await` is only valid where an event loop is already running,
# such as a Jupyter/IPython cell. In a plain .py script the equivalent call
# is `asyncio.run(main())`.
await main()


🌍 Procesando ciudad: Tenerife
   ➡️ Buscando: Tenerife turismo
🔎 Navegando a hashtag: #tenerifeturismo
Scroll 1: 30 links recopilados
Procesando post 1/30: https://www.instagram.com/p/CiVJHqqobac/
Error extrayendo https://www.instagram.com/p/CiVJHqqobac/: Timeout 30000ms exceeded.
Procesando post 2/30: https://www.instagram.com/p/CXKDQZfoJI-/
Procesando post 3/30: https://www.instagram.com/p/CTu1E0qqd5r/
Procesando post 4/30: https://www.instagram.com/p/CnM5mvAtkol/
Procesando post 5/30: https://www.instagram.com/p/DGkXzz6o1jr/
Procesando post 6/30: https://www.instagram.com/p/DBmW0lZIs8j/
Procesando post 7/30: https://www.instagram.com/p/CVIAXL0tPQA/
Procesando post 8/30: https://www.instagram.com/p/CrXw70sNUsb/
Procesando post 9/30: https://www.instagram.com/p/CsRPvl_Nkbk/
Procesando post 10/30: https://www.instagram.com/p/Cab105PMKSL/
Procesando post 11/30: https://www.instagram.com/p/C9zQzmdob8b/
Procesando post 12/30: https://www.instagram.com/p/C55q1qgo6Oe/
Procesando post 13/30