In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
from langchain.document_loaders import DirectoryLoader, TextLoader
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
# import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import os
from urllib.parse import urljoin, urlparse
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Set, List, Dict
import re

class CompleteTerraformScraper:
    def __init__(self, base_url: str = "https://developer.hashicorp.com/terraform"):
        self.base_url = base_url
        self.visited_urls: Set[str] = set()
        self.scraped_data: List[Dict] = []
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def is_valid_terraform_url(self, url: str) -> bool:
        """Verifica si la URL pertenece a cualquier sección de Terraform"""
        valid_patterns = [
            "https://developer.hashicorp.com/terraform/docs",
            "https://developer.hashicorp.com/terraform/tutorials",
            "https://developer.hashicorp.com/terraform/language",
            "https://developer.hashicorp.com/terraform/language/providers",
            "https://developer.hashicorp.com/terraform/cli",
            "https://developer.hashicorp.com/terraform/docs/partnerships",
            "https://developer.hashicorp.com/terraform/cdktf",
            "https://developer.hashicorp.com/terraform/cloud-docs",
            "https://developer.hashicorp.com/terraform/enterprise",
            "https://developer.hashicorp.com/terraform/plugin",
            "https://developer.hashicorp.com/terraform/registry",
            "https://developer.hashicorp.com/terraform/internals",
            "https://developer.hashicorp.com/terraform/migrate",
            "https://developer.hashicorp.com/terraform/intro",
            "https://developer.hashicorp.com/terraform/configuration"
        ]

        # Verificar que la URL comience con el dominio de Terraform
        if not url.startswith("https://developer.hashicorp.com/terraform"):
            return False

        # Excluir URLs no deseadas
        excluded_patterns = [
            "/api/",
            "/downloads",
            "/install",
            "github.com",
            "mailto:",
            ".zip",
            ".tar.gz",
            "#"
        ]

        for pattern in excluded_patterns:
            if pattern in url:
                return False

        return True

    def get_main_sections(self) -> Set[str]:
        """Obtiene todas las secciones principales de Terraform"""
        main_sections = {
            "https://developer.hashicorp.com/terraform/docs",
            "https://developer.hashicorp.com/terraform/tutorials",
            "https://developer.hashicorp.com/terraform/language",
            "https://developer.hashicorp.com/terraform/language/providers",
            "https://developer.hashicorp.com/terraform/cli",
            "https://developer.hashicorp.com/terraform/docs/partnerships",
            "https://developer.hashicorp.com/terraform/cdktf",
            "https://developer.hashicorp.com/terraform/cloud-docs",
            "https://developer.hashicorp.com/terraform/enterprise",
            "https://developer.hashicorp.com/terraform/plugin",
            "https://developer.hashicorp.com/terraform/registry",
            "https://developer.hashicorp.com/terraform/internals",
            "https://developer.hashicorp.com/terraform/migrate",
            "https://developer.hashicorp.com/terraform/intro",
            "https://developer.hashicorp.com/terraform/configuration"
        }

        # También obtener enlaces dinámicamente de la página principal
        try:
            response = self.session.get(self.base_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(self.base_url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0]

                if self.is_valid_terraform_url(clean_url):
                    main_sections.add(clean_url)

        except Exception as e:
            print(f"Error obteniendo secciones principales: {str(e)}")

        return main_sections

    def get_all_links(self, url: str) -> Set[str]:
        """Extrae todos los enlaces de una página"""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            links = set()

            # Buscar enlaces en navegación y contenido
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0]

                if self.is_valid_terraform_url(clean_url):
                    links.add(clean_url)

            # Buscar enlaces específicos en sidebars y menús
            nav_selectors = [
                'nav a[href]',
                '.sidebar a[href]',
                '.navigation a[href]',
                '.menu a[href]',
                '.docs-nav a[href]',
                '[data-testid="sidebar"] a[href]'
            ]

            for selector in nav_selectors:
                for link in soup.select(selector):
                    if link.get('href'):
                        absolute_url = urljoin(url, link['href'])
                        clean_url = absolute_url.split('#')[0].split('?')[0]

                        if self.is_valid_terraform_url(clean_url):
                            links.add(clean_url)

            return links

        except Exception as e:
            print(f"Error obteniendo enlaces de {url}: {str(e)}")
            return set()

    def scrape_page_content(self, url: str) -> Dict:
        """Extrae el contenido principal de una página"""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extraer título
            title = ""
            title_selectors = ['h1', 'title', '.page-title', '.content-title']
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    break

            # Extraer contenido principal con múltiples selectores
            content = ""
            content_selectors = [
                'main',
                'article',
                '.content',
                '.main-content',
                '.page-content',
                '.docs-content',
                '.tutorial-content',
                '[role="main"]'
            ]

            main_content = None
            for selector in content_selectors:
                main_content = soup.select_one(selector)
                if main_content:
                    break

            # Si no encuentra selectores específicos, usar el body
            if not main_content:
                main_content = soup.find('body')

            if main_content:
                # Remover elementos no deseados
                for element in main_content.find_all([
                    'script', 'style', 'nav', 'footer', 'header',
                    '.advertisement', '.ads', '.sidebar', '.navigation',
                    '[data-testid="sidebar"]', '.breadcrumb'
                ]):
                    element.decompose()

                # Extraer texto
                content = main_content.get_text(separator='\n', strip=True)
                # Limpiar texto
                content = re.sub(r'\n{3,}', '\n\n', content)
                content = re.sub(r'[ \t]+', ' ', content)

            # Extraer metadatos
            meta_description = ""
            meta_elem = soup.find('meta', attrs={'name': 'description'})
            if meta_elem:
                meta_description = meta_elem.get('content', '')

            # Extraer sección/categoría de la URL
            url_parts = url.replace(self.base_url, '').strip('/').split('/')
            section = url_parts[0] if url_parts and url_parts[0] else 'main'

            return {
                'url': url,
                'title': title,
                'content': content,
                'meta_description': meta_description,
                'section': section,
                'subsection': '/'.join(url_parts[1:]) if len(url_parts) > 1 else '',
                'word_count': len(content.split()),
                'scraped_at': time.strftime('%Y-%m-%d %H:%M:%S')
            }

        except Exception as e:
            print(f"Error scrapeando {url}: {str(e)}")
            return None

    def discover_all_urls(self, max_depth: int = 4) -> Set[str]:
        """Descubre todas las URLs de todas las secciones de Terraform"""
        # Comenzar con las secciones principales
        initial_urls = self.get_main_sections()
        print(f"Secciones principales encontradas: {len(initial_urls)}")

        urls_to_visit = initial_urls.copy()
        all_urls = set()

        for depth in range(max_depth):
            print(f"Descubriendo URLs - Profundidad {depth + 1}/{max_depth}")
            print(f"URLs por visitar en esta profundidad: {len(urls_to_visit)}")

            new_urls = set()

            for url in urls_to_visit:
                if url not in all_urls:
                    print(f"Explorando: {url}")
                    links = self.get_all_links(url)
                    new_urls.update(links)
                    all_urls.add(url)
                    time.sleep(0.3)  # Pausa más corta

            urls_to_visit = new_urls - all_urls
            print(f"Nuevas URLs encontradas: {len(new_urls)}")

            if not urls_to_visit:
                print("No hay más URLs por descubrir")
                break

        print(f"Total de URLs descubiertas: {len(all_urls)}")
        return all_urls

    def scrape_all_pages(self, urls: Set[str], max_workers: int = 4):
        """Scrapea todas las páginas usando threading"""
        print(f"Iniciando scraping de {len(urls)} páginas...")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {executor.submit(self.scrape_page_content, url): url for url in urls}

            for i, future in enumerate(as_completed(future_to_url), 1):
                url = future_to_url[future]
                try:
                    page_data = future.result()
                    if page_data and page_data['content'] and len(page_data['content']) > 100:
                        self.scraped_data.append(page_data)
                        print(f"Progreso: {i}/{len(urls)} - {page_data['section']} - {page_data['title'][:50]}...")

                except Exception as e:
                    print(f"Error procesando {url}: {str(e)}")

                time.sleep(0.1)

    def save_to_files(self, output_dir: str = "terraform_complete"):
        """Guarda los datos extraídos organizados por sección"""
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Guardar JSON completo
        json_file = os.path.join(output_dir, "terraform_complete.json")
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.scraped_data, f, indent=2, ensure_ascii=False)

        # Organizar por secciones
        sections = {}
        for page in self.scraped_data:
            section = page['section']
            if section not in sections:
                sections[section] = []
            sections[section].append(page)

        # Crear archivos por sección
        for section, pages in sections.items():
            section_dir = os.path.join(output_dir, section)
            if not os.path.exists(section_dir):
                os.makedirs(section_dir)

            # JSON por sección
            section_json = os.path.join(section_dir, f"{section}.json")
            with open(section_json, 'w', encoding='utf-8') as f:
                json.dump(pages, f, indent=2, ensure_ascii=False)

            # Archivos de texto individuales
            text_dir = os.path.join(section_dir, "text_files")
            if not os.path.exists(text_dir):
                os.makedirs(text_dir)

            for i, page in enumerate(pages):
                filename = re.sub(r'[^\w\-_.]', '_', page['title'])[:50]
                filename = f"{i:03d}_{filename}.txt"

                file_path = os.path.join(text_dir, filename)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(page['content'])

        # Estadísticas por sección
        stats_file = os.path.join(output_dir, "estadisticas.txt")
        with open(stats_file, 'w', encoding='utf-8') as f:
            f.write("ESTADÍSTICAS COMPLETAS DE TERRAFORM\n")
            f.write("=" * 50 + "\n\n")

            total_pages = len(self.scraped_data)
            total_words = sum(page['word_count'] for page in self.scraped_data)

            f.write(f"Total de páginas: {total_pages}\n")
            f.write(f"Total de palabras: {total_words}\n")
            f.write(f"Promedio de palabras por página: {total_words // total_pages if total_pages else 0}\n\n")

            f.write("PÁGINAS POR SECCIÓN:\n")
            f.write("-" * 30 + "\n")
            for section, pages in sections.items():
                section_words = sum(page['word_count'] for page in pages)
                f.write(f"{section}: {len(pages)} páginas, {section_words} palabras\n")

        print(f"Datos guardados en {output_dir}")
        print(f"Total de páginas: {len(self.scraped_data)}")
        print(f"Secciones encontradas: {list(sections.keys())}")

def main():
    # Crear el scraper
    scraper = CompleteTerraformScraper()

    # Descubrir todas las URLs de todas las secciones
    all_urls = scraper.discover_all_urls(max_depth=2)

    # Scrapear todas las páginas
    scraper.scrape_all_pages(all_urls, max_workers=3)

    # Guardar los resultados organizados
    scraper.save_to_files("knowledge-base-terraform")

    # Mostrar estadísticas finales
    if scraper.scraped_data:
        sections = {}
        for page in scraper.scraped_data:
            section = page['section']
            sections[section] = sections.get(section, 0) + 1

        total_words = sum(page['word_count'] for page in scraper.scraped_data)
        print(f"\nEstadísticas finales:")
        print(f"Páginas scrapeadas: {len(scraper.scraped_data)}")
        print(f"Total de palabras: {total_words}")
        print(f"Secciones cubiertas: {list(sections.keys())}")
        for section, count in sections.items():
            print(f"  {section}: {count} páginas")

if __name__ == "__main__":
        main()

In [14]:
# sections = glob.glob("knowledge-base/terraform_complete/*")
#
# sections.remove('knowledge-base/terraform_complete\\terraform_complete.json')
# sections.remove('knowledge-base/terraform_complete\\estadisticas.txt')
#
# files = []
# metadata_jsons = []
#
# for sec in sections:
#     file = glob.glob(f"{sec}/text_files/*")
#     files.append(file)
#
#     metadata_json = glob.glob(f"{sec}/*.json")
#     metadata_jsons.append(metadata_json)
#
# print(files)

[['knowledge-base/terraform_complete\\cdktf/text_files\\000_What_is_CDK_for_Terraform_.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\001_CDK_for_Terraform.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\002_CDK_for_Terraform.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\003_HCL_Interoperability.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\004_Releases.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\005_CDK_for_Terraform.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\006_CDK_for_Terraform.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\007_CDK_for_Terraform.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\008_Providers.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\009_CDK_for_Terraform.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\010_Resources.txt', 'knowledge-base/terraform_complete\\cdktf/text_files\\011_Telemetry.txt', 'knowledge-base/terraform_complete\

In [16]:
folders = glob.glob("knowledge-base-terraform/*")
folders.remove('knowledge-base-terraform\\terraform_complete.json')
folders.remove('knowledge-base-terraform\\estadisticas.txt')

text_loader_kwargs = {'encoding': 'utf-8'}

def add_metadata(doc, metadata):
    doc.metadata['url'] = metadata['url']
    doc.metadata['title'] = metadata['title']
    doc.metadata['content'] = metadata['content']
    doc.metadata['meta_description'] = metadata['meta_description']
    doc.metadata['section'] = metadata['section']
    doc.metadata['subsection'] = metadata['subsection']

    return doc

In [17]:
documents = []

for folder in folders:
    metadata_json = glob.glob(f"{folder}/*.json")[0]
    with open(metadata_json, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    loader = DirectoryLoader(folder, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)

    folder_docs = loader.load()

    for doc in folder_docs:
        documents.extend([add_metadata(doc, data) for data in metadata])

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")



Total number of chunks: 30805


In [20]:
# embeddings = OpenAIEmbeddings()


# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = OpenAIEmbeddings(model)


In [25]:
Chroma(
    persist_directory="src/rag/vector_db",
    embedding_function=embeddings
).delete_collection()


In [26]:
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="src/rag/vector_db")

print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 30805 documents


In [32]:
# Let's investigate the vectors

collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

# Obtener los vectores y metadatos
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']

# Usar la sección como tipo de documento
doc_types = [meta.get('section', 'unknown') for meta in metadatas]

# Asignar un color único por sección
unique_sections = list(set(doc_types))
color_map = {section: f"hsl({i * 360 // len(unique_sections)},70%,50%)" for i, section in enumerate(unique_sections)}
colors = [color_map[section] for section in doc_types]

print(f"Secciones encontradas: {unique_sections}")
print(f"Ejemplo de colores asignados: {color_map}")
#
# We humans find it easier to visalize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

There are 30,805 vectors with 384 dimensions in the vector store
Secciones encontradas: ['enterprise', 'migrate', 'language', 'main', 'plugin', 'intro', 'docs', 'internals', 'cloud-docs', 'registry', 'cli', 'cdktf', 'tutorials']
Ejemplo de colores asignados: {'enterprise': 'hsl(0,70%,50%)', 'migrate': 'hsl(27,70%,50%)', 'language': 'hsl(55,70%,50%)', 'main': 'hsl(83,70%,50%)', 'plugin': 'hsl(110,70%,50%)', 'intro': 'hsl(138,70%,50%)', 'docs': 'hsl(166,70%,50%)', 'internals': 'hsl(193,70%,50%)', 'cloud-docs': 'hsl(221,70%,50%)', 'registry': 'hsl(249,70%,50%)', 'cli': 'hsl(276,70%,50%)', 'cdktf': 'hsl(304,70%,50%)', 'tutorials': 'hsl(332,70%,50%)'}


In [5]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory="src/rag/vector_db")
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

NameError: name 'documents' is not defined

In [3]:
llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


NameError: name 'vectorstore' is not defined

In [36]:
query = "Please explain what Insurellm is in a couple of sentences"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Terraform es una herramienta de infraestructura como código (IaC) que permite a los usuarios definir y gestionar infraestructura mediante archivos de configuración. Utiliza un lenguaje de configuración declarativo para describir los recursos que se desean crear, como máquinas virtuales, redes y bases de datos, entre otros. Terraform permite a los usuarios automatizar la provisión y gestión de recursos en múltiples proveedores de servicios en la nube, como AWS, Azure y Google Cloud, así como en entornos locales. La herramienta también facilita la definición de dependencias entre recursos y la creación de múltiples recursos similares a partir de un solo bloque de configuración.


In [None]:
# set up a new conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [None]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

Secciones principales encontradas: 30
Descubriendo URLs - Profundidad 1/2
URLs por visitar en esta profundidad: 30
Explorando: https://developer.hashicorp.com/terraform/cdktf
Explorando: https://developer.hashicorp.com/terraform/tutorials/azure-get-started
Explorando: https://developer.hashicorp.com/terraform/docs
Explorando: https://developer.hashicorp.com/terraform/tutorials/docker-get-started
Explorando: https://developer.hashicorp.com/terraform/plugin
Explorando: https://developer.hashicorp.com/terraform/language/providers
Explorando: https://developer.hashicorp.com/terraform
Explorando: https://developer.hashicorp.com/terraform/internals
Explorando: https://developer.hashicorp.com/terraform/docs/partnerships
Explorando: https://developer.hashicorp.com/terraform/tutorials/networking/multicloud-kubernetes
Explorando: https://developer.hashicorp.com/terraform/tutorials/applications/preview-environments-vercel
Explorando: https://developer.hashicorp.com/terraform/cloud-docs
Explorando