<a href="https://colab.research.google.com/github/alartuka/Arven/blob/main/src/backend/aven_site_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aven Website Comprehensive Crawler
### Run this notebook in Google Colab to crawl and store Aven website content in Pinecone



---



## Install Libraries

In [None]:
!pip install python-dotenv sentence-transformers scikit-learn pinecone-client exa-py requests numpy


## Imports

In [None]:
import hashlib
import os
import random
import time
import xml.etree.ElementTree as ET
from urllib.parse import urlparse

import numpy as np
import requests
from exa_py import Exa
from google.colab import userdata
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer


## Environment Variables Setup

In [None]:
# Read each API key from Colab's `userdata` store and mirror it into the
# process environment under the same name, in the order Pinecone then Exa.
for _secret_name in ("PINECONE_API_KEY", "EXA_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

# Module-level handles used when the clients are constructed.
pinecone_api_key = os.environ["PINECONE_API_KEY"]
exa_api_key = os.environ["EXA_API_KEY"]


## Initialize Clients


In [None]:
# Sentence-transformer model used by get_huggingface_embeddings() to turn text
# chunks into vectors. Loaded once here and shared as a notebook global.
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Exa client used to fetch page contents for the sitemap URLs.
exa_client = Exa(api_key=exa_api_key)

# Pinecone client plus a handle to the index that stores the crawled chunks.
# NOTE(review): presumably the "arven" index must already exist with a dimension
# matching the embedding model — confirm; nothing in this notebook creates it.
pc = Pinecone(api_key=pinecone_api_key)
index_name = "arven"
pinecone_index = pc.Index(index_name)



## Create Text Embeddings

In [None]:
def get_huggingface_embeddings(text):
    """Encode ``text`` with the notebook-wide ``sentence_model``.

    Args:
        text: Input handed straight to ``sentence_model.encode``.

    Returns:
        Whatever ``sentence_model.encode`` returns for that input.
    """
    # The model is only read here, so no ``global`` declaration is required.
    embedding = sentence_model.encode(text)
    return embedding


## Split Text into Chunks

In [None]:
def split_text_into_chunks(text, max_chunk_size=1000, overlap=100):
    """Split text into overlapping chunks for better retrieval.

    Each chunk is at most ``max_chunk_size`` characters. When a chunk would end
    mid-text, the cut is moved back to the last ``.`` (kept in the chunk) or,
    failing that, the last space, provided that boundary lies in the second
    half of the window. Consecutive chunks share up to ``overlap`` characters,
    and no part of the text is skipped between chunks.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk (before
            stripping surrounding whitespace).
        overlap: Number of trailing characters of one chunk repeated at the
            start of the next. Negative values are treated as 0.

    Returns:
        A list of chunk strings. Text no longer than ``max_chunk_size`` is
        returned unchanged as a single-element list.

    Raises:
        ValueError: If the text needs splitting but ``max_chunk_size`` is not
            positive (the loop could never advance).
    """
    if len(text) <= max_chunk_size:
        return [text]

    # Only reachable with text longer than max_chunk_size; a non-positive
    # window can never make progress, so fail loudly instead of spinning.
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    overlap = max(overlap, 0)
    half_window = max_chunk_size // 2
    text_length = len(text)

    chunks = []
    start = 0

    while start < text_length:
        end = start + max_chunk_size

        # if we're not at the end, try to break at a sentence or word boundary
        if end < text_length:
            sentence_break = text.rfind('.', start, end)
            if sentence_break > start + half_window:
                end = sentence_break + 1
            else:
                word_break = text.rfind(' ', start, end)
                if word_break > start + half_window:
                    end = word_break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # The final window has been emitted; stepping back by `overlap` here
        # would only produce a redundant tail chunk.
        if end >= text_length:
            break

        # Step back from the actual cut point so the next chunk overlaps this
        # one and nothing between `end` and the next start is dropped. If the
        # overlap is so large that we would not advance, continue from `end`.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks

## Retrieve Aven's Sitemap URLs

In [None]:
def get_sitemap_urls():
    """Fetch and parse sitemap.xml to get all Aven URLs.

    Downloads ``https://aven.com/sitemap.xml``, collects the text of its
    ``<loc>`` elements and keeps the HTTPS URLs whose host is ``aven.com`` or a
    subdomain of it.

    Returns:
        A list of URL strings in sitemap order. If fetching or parsing fails
        for any reason, the single-element fallback ``["https://aven.com"]``.
    """
    sitemap_url = "https://aven.com/sitemap.xml"

    try:
        print(f"🔍 Fetching sitemap from {sitemap_url}...")
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()

        # parse XML
        root = ET.fromstring(response.content)

        # standard sitemap namespace
        ns = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Try progressively looser lookups: namespaced <url><loc>, then the
        # same without a namespace, then any un-namespaced <loc>.
        url_elements = (
            root.findall('.//sitemap:url/sitemap:loc', ns)
            or root.findall('.//url/loc')
            or root.findall('.//loc')
        )

        urls = []
        for url_elem in url_elements:
            # An empty <loc/> has text None; skip it rather than letting one
            # bad entry abort the whole sitemap via the except below.
            url = (url_elem.text or '').strip()
            if not url:
                continue

            # Compare the parsed host instead of a string prefix, so hosts such
            # as "aven.com.evil.example" are rejected. Same domain rule as the
            # one applied to crawled results.
            parsed = urlparse(url)
            domain = parsed.netloc.lower()
            if parsed.scheme == 'https' and (domain == 'aven.com' or domain.endswith('.aven.com')):
                urls.append(url)

        print(f">>> Found {len(urls)} URLs in sitemap")
        return urls

    except Exception as e:
        # Deliberate best-effort: any failure degrades to crawling the base URL.
        print(f">>> Error fetching sitemap: {e}")
        print(">>> Falling back to base URL crawling...")
        return ["https://aven.com"]

## Crawl Aven's Website with Exa and Store in Pinecone

In [None]:
def _build_page_vectors(result, batch_num):
    """Convert one Exa result into Pinecone vector dicts, or None if the page is skipped."""
    url = result.url
    # The attribute can be present but None; fall back in that case too so the
    # metadata never carries a null title.
    title = getattr(result, 'title', None) or 'Unknown Title'
    text_content = getattr(result, 'text', '')

    # skip if no content
    if not text_content or len(text_content.strip()) < 50:
        print(f">>>  Skipping {url} - insufficient content")
        return None

    # verify it's from Aven domain
    domain = urlparse(url).netloc.lower()
    if not (domain == 'aven.com' or domain.endswith('.aven.com')):
        print(f">>>  Skipping {url} - not from Aven domain")
        return None

    # split content into chunks (for better retrieval)
    chunks = split_text_into_chunks(text_content, max_chunk_size=1000, overlap=100)

    page_vectors = []
    for chunk_idx, chunk in enumerate(chunks):
        chunk_embedding = get_huggingface_embeddings(chunk)

        # ID is derived from URL, position and the chunk's first 100 chars.
        chunk_id = hashlib.md5(f"{url}_{chunk_idx}_{chunk[:100]}".encode()).hexdigest()

        page_vectors.append({
            'id': chunk_id,
            'values': chunk_embedding.tolist(),
            'metadata': {
                'source': url,
                'title': title,
                'text': chunk,
                'page_content': chunk,  # alternative field name
                'content': chunk,       # another alternative field name
                'domain': domain,
                'verified_aven': True,
                'company': 'Aven',
                'chunk_index': chunk_idx,
                'total_chunks': len(chunks),
                'crawl_timestamp': int(time.time()),
                'source_type': 'exa_sitemap_crawl_colab',
                'batch_number': batch_num,
                'crawl_method': 'colab_standalone'
            }
        })

    print(f">>> Processed {url} - {len(chunks)} chunks")
    return page_vectors


def crawl_and_store_website():
    """Crawl Aven website using Exa and store in Pinecone - COMPREHENSIVE SITEMAP.

    Reads the URL list from ``get_sitemap_urls()``, fetches page text from Exa
    in batches of 25 URLs, chunks and embeds every accepted page, then upserts
    all vectors into the ``company-documents`` namespace of ``pinecone_index``
    in batches of 100.

    A page contributes either all of its chunks or none: if anything fails
    while a page is being processed, that page is logged and skipped. A failed
    Exa batch is likewise logged and skipped.

    Returns:
        dict: Always contains ``success``, ``pages_processed`` and
        ``chunks_stored``. On success it also contains ``total_sitemap_urls``,
        ``processed_urls`` and ``timestamp``; on failure it contains ``error``
        (and ``total_sitemap_urls`` when nothing could be stored).
    """
    try:
        print("🕷️ Starting comprehensive website crawl from sitemap...")

        # get all URLs from sitemap
        sitemap_urls = get_sitemap_urls()

        # process URLs in batches to avoid API limits
        batch_size = 25  # process 25 URLs at a time
        total_batches = (len(sitemap_urls) + batch_size - 1) // batch_size
        all_processed_urls = []
        vectors_to_upsert = []

        for i in range(0, len(sitemap_urls), batch_size):
            batch_urls = sitemap_urls[i:i + batch_size]
            batch_num = i // batch_size + 1

            print(f">>> Processing batch {batch_num}/{total_batches} ({len(batch_urls)} URLs)...")

            try:
                # crawl this batch of URLs using Exa
                response = exa_client.get_contents(
                    batch_urls,           # list of URLs to crawl
                    text=True,           # get the full text
                    subpages=0           # don't crawl subpages since we have the sitemap
                )

                batch_results = response.results
                print(f"🔍 Got {len(batch_results)} results from Exa for this batch")

                # process each page in the batch
                for result in batch_results:
                    try:
                        page_vectors = _build_page_vectors(result, batch_num)
                    except Exception as e:
                        print(f">>> Error processing {getattr(result, 'url', 'unknown URL')}: {e}")
                        continue

                    if page_vectors is None:
                        continue

                    vectors_to_upsert.extend(page_vectors)
                    all_processed_urls.append(result.url)

                # small delay between batches to be respectful to APIs
                if i + batch_size < len(sitemap_urls):
                    print(">>>  Waiting 2 seconds before next batch...")
                    time.sleep(2)

            except Exception as e:
                print(f">>> Error processing batch {batch_num}: {e}")
                continue

        if not vectors_to_upsert:
            print(">>>  No content was processed and stored")
            return {
                'success': False,
                'error': 'No content was processed and stored',
                'pages_processed': 0,
                'chunks_stored': 0,
                'total_sitemap_urls': len(sitemap_urls)
            }

        # upsert all vectors to Pinecone in batches
        print(f">>> Upserting {len(vectors_to_upsert)} vectors to Pinecone...")

        pinecone_batch_size = 100
        total_upsert_batches = (len(vectors_to_upsert) + pinecone_batch_size - 1) // pinecone_batch_size
        for i in range(0, len(vectors_to_upsert), pinecone_batch_size):
            batch = vectors_to_upsert[i:i + pinecone_batch_size]
            pinecone_index.upsert(
                vectors=batch,
                namespace="company-documents"
            )
            print(f">>> Upserted batch {i//pinecone_batch_size + 1}/{total_upsert_batches}")

        pages_processed = len(all_processed_urls)
        print(f">>> Successfully crawled and stored {pages_processed} pages ({len(vectors_to_upsert)} chunks)")
        print(f">>> Processed URLs: {len(all_processed_urls)}/{len(sitemap_urls)}")

        # summary statistics
        return {
            'success': True,
            'pages_processed': pages_processed,
            'chunks_stored': len(vectors_to_upsert),
            'total_sitemap_urls': len(sitemap_urls),
            'processed_urls': all_processed_urls,
            'timestamp': int(time.time())
        }

    except Exception as e:
        print(f">>> Website crawl error: {e}")
        return {
            'success': False,
            'error': str(e),
            'pages_processed': 0,
            'chunks_stored': 0
        }

## Pinecone Stored Data Verification

In [None]:
def verify_stored_data():
    """Verify that data was stored correctly in Pinecone.

    Reads the vector count of the ``company-documents`` namespace, then queries
    that namespace with a random vector to pull up to five stored records and
    checks whether each record's ``source`` URL is on ``aven.com`` or one of
    its subdomains.

    Returns:
        dict: On success, the keys ``total_vectors``, ``sample_size``,
        ``aven_sources``, ``aven_percentage`` and ``is_properly_filtered``.
        If anything raises, ``{'error': <message>}``.
    """
    try:
        print(">>> Verifying stored data...")

        # Get index stats
        stats = pinecone_index.describe_index_stats()
        namespace_stats = stats.namespaces.get("company-documents", {})
        vector_count = namespace_stats.get('vector_count', 0)

        print(f">>> Total vectors in 'company-documents' namespace: {vector_count}")

        # Sample some data with a random query vector. The length must equal
        # the index dimension (384 for all-MiniLM-L6-v2 embeddings).
        random_vector = [random.random() for _ in range(384)]

        results = pinecone_index.query(
            vector=random_vector,
            top_k=5,
            namespace="company-documents",
            include_metadata=True
        )
        matches = results.matches
        sample_size = len(matches)

        print(f">>> Sample of stored data:")
        aven_count = 0

        for i, match in enumerate(matches):
            # A record may carry no metadata, or a title stored as None; neither
            # should abort the whole verification run.
            metadata = match.metadata or {}
            source = metadata.get('source', 'Unknown')
            title = (metadata.get('title') or 'Unknown')[:50]
            domain = urlparse(source).netloc.lower() if source else 'unknown'
            is_aven = domain == 'aven.com' or domain.endswith('.aven.com')

            if is_aven:
                aven_count += 1

            print(f"  {i+1}. {title} | {domain} | {'>> Aven' if is_aven else '>> Non-Aven'}")

        verification_result = {
            'total_vectors': vector_count,
            'sample_size': sample_size,
            'aven_sources': aven_count,
            'aven_percentage': (aven_count / sample_size * 100) if matches else 0,
            'is_properly_filtered': aven_count == sample_size
        }

        print(f">>> Verification complete: {aven_count}/{sample_size} sources are from Aven")
        return verification_result

    except Exception as e:
        print(f">>> Verification failed: {e}")
        return {'error': str(e)}

## MAIN - Comprehensive Crawling Execution

In [None]:
# Announce the run before any work starts.
print(">>> Starting comprehensive crawl of Aven website...")
print("This may take several minutes depending on the website size.")
print("\n" + "="*60)

# Time the full crawl-and-store pass.
start_time = time.time()
crawl_results = crawl_and_store_website()
end_time = time.time()
duration = end_time - start_time

print("\n" + "="*60)
print("=== CRAWL COMPLETED! ===")
print("="*60)

# Report the failure case first, then the success summary and verification.
if not crawl_results['success']:
    print(f">> Crawl failed: {crawl_results.get('error', 'Unknown error')}")
else:
    print(f">>> Success! Crawled and stored website content.")
    print(f">>> Pages processed: {crawl_results['pages_processed']}")
    print(f">>> Chunks stored: {crawl_results['chunks_stored']}")
    print(f">>> Total sitemap URLs: {crawl_results['total_sitemap_urls']}")
    print(f">>> Duration: {duration:.1f} seconds")
    print(f">>> Completed at: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Sample the index to confirm what was written; details are printed only
    # when the verification itself did not report an error.
    print("\n>>> Running verification...")
    verification = verify_stored_data()
    if 'error' not in verification:
        print(f">>> Verification Results:")
        print(f"   Total vectors: {verification['total_vectors']}")
        print(f"   Aven sources: {verification['aven_percentage']:.1f}%")
        print(f"   Filtering status: {'>> Success' if verification['is_properly_filtered'] else '>>  Some non-Aven content detected'}")


## Optional: Manual Verification and Sitemap Inspection

In [None]:
# This cell performs no work when run: both snippets below are wrapped in
# triple-quoted string literals, so they are plain string expressions rather
# than executed code. To use a snippet, copy the code out of the quotes (or
# remove the surrounding triple quotes) and run it after the function cells.
# NOTE: the final string is the last expression in the cell, so the notebook
# echoes it as the cell's output.

# =============================================================================
# MANUAL VERIFICATION
# =============================================================================

# Disabled snippet: calls verify_stored_data() and prints every key/value of
# the dictionary it returns.
"""
print(">>>  Manual verification of stored data...")
verification_results = verify_stored_data()
print("\nVerification Results:")
for key, value in verification_results.items():
    print(f"  {key}: {value}")
"""

# =============================================================================
# VIEW SITEMAP URLS
# =============================================================================

# Disabled snippet: calls get_sitemap_urls() and prints the first 20 URLs,
# followed by a count of how many more were found.
"""
print(">>>  Fetching sitemap URLs for inspection...")
urls = get_sitemap_urls()
print(f"\nFound {len(urls)} URLs in sitemap:")
for i, url in enumerate(urls[:20], 1):  # Show first 20 URLs
    print(f"  {i}. {url}")
if len(urls) > 20:
    print(f"  ... and {len(urls) - 20} more URLs")
"""