In [20]:
# Si no lo has hecho, instala Playwright y los navegadores
# !pip install playwright nest_asyncio sentence-transformers
# !playwright install

import nest_asyncio
nest_asyncio.apply()


In [21]:
import asyncio
import json
import re
from pathlib import Path
from playwright.async_api import async_playwright
from sentence_transformers import SentenceTransformer
import numpy as np

BASE_URL = "https://www.discogs.com"
OUTPUT_FILE = "music_data.json"
MAX_PAGES = 1
EMBED_MODEL = "all-MiniLM-L6-v2"
HEADLESS = True


In [24]:
async def scrape_music_site(max_pages=MAX_PAGES):
    print("🎵 Iniciando scraping musical en Discogs...")
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=HEADLESS)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800},
        )
        page = await context.new_page()
        search_url = f"{BASE_URL}/search/?q=&type=release"
        print(f"🔍 Navegando a: {search_url}")
        await page.goto(search_url)
        await page.wait_for_timeout(3000)

        for page_idx in range(max_pages):
            print(f"\n📄 Procesando página {page_idx + 1}...")
            items = await page.query_selector_all(".card, .search_result, article, .card_release, li")
            if not items:
                print("⚠️ No se encontraron resultados visibles.")
                break

            for idx, it in enumerate(items):
                try:
                    title_el = await it.query_selector("h4, .card__title, .search_result_title, a.card_release_title")
                    title = (await title_el.inner_text()).strip() if title_el else ""

                    artist_el = await it.query_selector(".card__artist, .search_result_artist, .card_release_artist, .artist")
                    artist = (await artist_el.inner_text()).strip() if artist_el else ""

                    anchor = await it.query_selector("a")
                    href = await anchor.get_attribute("href") if anchor else ""
                    if not href or not re.search(r"/release/\d+|/master/\d+", href):
                        continue

                    url = href if href.startswith("http") else (BASE_URL + href)
                    text_blob = " | ".join(filter(None, [title, artist]))
                    doc_id = f"pg{page_idx}_i{idx}"
                    results.append({
                        "doc_id": doc_id,
                        "title": title,
                        "artist": artist,
                        "url": url,
                        "text": text_blob
                    })
                except Exception as e:
                    print(f"⚠️ Error parseando item {idx}: {e}")
                    continue

            try:
                next_btn = await page.query_selector('a[rel="next"], a.pagination_next, .pagination-next, .next')
                if next_btn:
                    next_href = await next_btn.get_attribute("href")
                    if next_href:
                        next_url = next_href if next_href.startswith("http") else (BASE_URL + next_href)
                        print(f"   → Siguiente página: {next_url}")
                        await page.goto(next_url)
                        await page.wait_for_timeout(2000)
                    else:
                        break
                else:
                    break
            except Exception as e:
                print(f"⚠️ Error en paginación: {e}")
                break

        await browser.close()
    print(f"\n✅ Scraping finalizado. Total: {len(results)} elementos extraídos.")
    return results

def embed_music_data(docs, model_name=EMBED_MODEL):
    print("🧠 Generando embeddings con", model_name)
    model = SentenceTransformer(model_name)
    texts = [d.get("text", "") for d in docs]
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    for i, d in enumerate(docs):
        d["embedding"] = embeddings[i].tolist()
    print("✅ Embeddings completados.")
    return docs

def save_json(data, filename=OUTPUT_FILE):
    p = Path(filename)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"💾 Datos guardados en {filename}")


In [25]:
docs = asyncio.run(scrape_music_site(max_pages=MAX_PAGES))
embedded_docs = embed_music_data(docs)
save_json(embedded_docs, OUTPUT_FILE)



Task exception was never retrieved
future: <Task finished name='Task-62' coro=<Connection.run() done, defined at c:\Users\bryan\Desktop\proyectosClase\scraping\Scraping171025\.venv\Lib\site-packages\playwright\_impl\_connection.py:303> exception=NotImplementedError()>
Traceback (most recent call last):
  File "C:\Users\bryan\AppData\Local\Programs\Python\Python313\Lib\asyncio\tasks.py", line 304, in __step_run_and_handle_result
    result = coro.send(None)
  File "c:\Users\bryan\Desktop\proyectosClase\scraping\Scraping171025\.venv\Lib\site-packages\playwright\_impl\_connection.py", line 310, in run
    await self._transport.connect()
  File "c:\Users\bryan\Desktop\proyectosClase\scraping\Scraping171025\.venv\Lib\site-packages\playwright\_impl\_transport.py", line 133, in connect
    raise exc
  File "c:\Users\bryan\Desktop\proyectosClase\scraping\Scraping171025\.venv\Lib\site-packages\playwright\_impl\_transport.py", line 120, in connect
    self._proc = await asyncio.create_subprocess

🎵 Iniciando scraping musical en Discogs...


NotImplementedError: 

In [26]:
def rag_console():
    print("💬 RAG Console iniciada. Escribe 'salir' para terminar.\n")
    model = SentenceTransformer(EMBED_MODEL)
    texts = [d["text"] for d in embedded_docs]
    embeddings = np.array([d["embedding"] for d in embedded_docs])

    while True:
        query = input("Pregunta 🎵: ")
        if query.lower() in ("salir", "exit", "quit"):
            print("👋 Cerrando RAG Console.")
            break

        q_emb = model.encode([query], convert_to_numpy=True)
        scores = (embeddings @ q_emb.T).squeeze()
        top_idx = np.argsort(scores)[::-1][:3]

        print("\n🔹 Respuestas más relevantes:")
        for i in top_idx:
            print(f"- {texts[i]} (URL: {embedded_docs[i]['url']})")
        print("\n" + "-"*50 + "\n")

# Puedes probarlo con:
# rag_console()
