# Data Ingestion

In [2]:
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import time
import re
import json


# Define countries and their sections

def generate_country_entry(name: str) -> Dict:
    """Generate a country entry with standardized URL format."""
    url_name = name.lower().replace(' ', '-').replace('&', '').replace('(', '').replace(')', '').replace('--', '-')
    return {
        "name": name,
        "base_url": f"https://www.noonsite.com/place/{url_name}/",
        "sections": ["", "view/clearance/", "view/security/"]
    }

# Caribbean
caribbean_countries = [
    "Anguilla", "Antigua & Barbuda", "Aruba", "Bahamas", "Barbados", "Bonaire", 
    "British Virgin Islands", "Cayman Islands", "Cuba", "Curacao", "Dominica", 
    "Dominican Republic", "Grenada", "Guadeloupe", "Haiti", "Jamaica", "Martinique", 
    "Montserrat", "Puerto Rico", "Saba", "Sint Maarten", "Spanish Virgin Islands", 
    "St. Barts", "St. Kitts & Nevis", "St. Lucia", "St. Martin", 
    "St. Vincent & the Grenadines", "Statia", "Trinidad & Tobago", "Turks & Caicos", 
    "US Virgin Islands"
]

# Central & North America
central_north_america = [
    "Belize", "Canada", "Costa Rica", "El Salvador", "Guatemala", "Honduras", 
    "Mexico", "Nicaragua", "Panama", "St. Pierre and Miquelon", "USA"
]

# Mediterranean & Black Sea
mediterranean_black_sea = [
    "Albania", "Algeria", "Bosnia", "Bulgaria", "Croatia", "Cyprus", "Egypt", 
    "France", "Georgia", "Gibraltar", "Greece", "Israel", "Italy", "Lebanon", 
    "Libya", "Malta", "Montenegro", "Morocco", "Principality of Monaco", "Romania", 
    "Russia", "Slovenia", "Spain", "Syria", "Tunisia", "Turkey", "Ukraine"
]

# North Indian Ocean & Red Sea
north_indian_ocean = [
    "Bahrain", "Djibouti", "Egypt", "Eritrea", "India", "Jordan", "Kuwait", 
    "Maldives", "Oman", "Qatar", "Saudi Arabia", "Somalia", "Sri Lanka", "Sudan", 
    "United Arab Emirates", "Yemen"
]

# North Pacific Islands
north_pacific_islands = [
    "Federated States of Micronesia", "Guam", "Hawaii", "Kiribati", 
    "Marshall Islands", "Northern Marianas", "Palau (Belau)"
]

# Combine all countries
all_country_names = (
    caribbean_countries + 
    central_north_america + 
    mediterranean_black_sea + 
    north_indian_ocean + 
    north_pacific_islands
)

COUNTRIES = [generate_country_entry(name) for name in all_country_names]

# -----------------------------
# Function to scrape a single page

def scrape_page(url: str) -> str:
    """Scrape content from a single page and stop at 'Next Section'."""
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Remove all navigation, sidebars, headers, footers, and ads
    for element in soup.find_all(['nav', 'header', 'footer', 'aside']):
        element.decompose()
    
    # Remove unwanted elements
    unwanted_selectors = [
        '.navigation', '.menu', '.sidebar', '.widget', '.advertisement',
        '#navigation', '#menu', '#sidebar', '.footer', '.header',
        '[class*="nav"]', '[id*="nav"]', '[class*="menu"]',
        'form', 'button', '.search', '[role="navigation"]',
        '[class*="discount"]', '[class*="flag"]', '[class*="coupon"]'
    ]
    
    for selector in unwanted_selectors:
        for element in soup.select(selector):
            element.decompose()
    
    # Find main content
    main_content = None
    content_selectors = [
        'article',
        'div[class*="content"]',
        'div[class*="entry"]',
        'main',
        '#content'
    ]
    
    for selector in content_selectors:
        main_content = soup.select_one(selector)
        if main_content and len(main_content.get_text(strip=True)) > 200:
            break
    
    if not main_content:
        main_content = soup.body
    
    # Extract text
    text_parts = []
    
    if main_content:
        for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p', 'li', 'ul', 'ol']):
            text = element.get_text(strip=True)
            
            # Stop if we hit "Next Section"
            if 'next section' in text.lower():
                break
            
            # Skip navigation-like text
            if text and len(text) > 10 and not is_navigation_text(text):
                text_parts.append(text)
    
    content_text = '\n\n'.join(text_parts)
    return clean_text(content_text)

def is_navigation_text(text: str) -> bool:
    """Check if text looks like navigation/menu items."""
    navigation_keywords = [
        'login', 'register', 'search', 'navigate to', 'select country',
        'close', 'menu', 'download', 'subscribe', 'notification',
        'cookie', 'privacy policy', 'terms of service', 'buy now',
        'discount', 'coupon', 'yachtflags', 'courtesy flag'
    ]
    text_lower = text.lower()
    
    if len(text) < 30 and text.count('\n') == 0:
        return True
    
    return any(keyword in text_lower for keyword in navigation_keywords)

def clean_text(text: str) -> str:
    """Clean up extracted text."""
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        line = line.strip()
        if len(line) > 15 or (len(line) > 5 and '.' in line):
            if not (len(line) < 40 and line[0].isupper() and line.replace(' ', '').isalpha()):
                cleaned_lines.append(line)
    
    return '\n'.join(cleaned_lines)

# -----------------------------
# Scrape all sections for a country

def scrape_country(country: Dict) -> List[Dict]:
    """Scrape all sections for a given country and return as separate documents."""
    
    documents = []
    
    for section in country['sections']:
        url = country['base_url'] + section
        section_name = section.replace('view/', '').replace('/', '') or "main"
        
        try:
            content = scrape_page(url)
            
            if content:
                doc = {
                    "id": f"{country['name'].lower().replace(' ', '_')}_{section_name}",
                    "country": country['name'],
                    "section_type": section_name,
                    "url": url,
                    "content": content
                }
                documents.append(doc)
            
            time.sleep(1.5)
            
        except Exception as e:
            print(f"Failed {country['name']} - {section_name}: {e}")
    
    return documents

# -----------------------------
# Scrape all countries

def scrape_all_countries(countries: List[Dict]) -> List[Dict]:
    """Scrape all countries and return list of documents."""
    all_documents = []
    
    for i, country in enumerate(countries, 1):
        print(f"[{i}/{len(countries)}] {country['name']}")
        docs = scrape_country(country)
        all_documents.extend(docs)
    
    return all_documents

# -----------------------------
# Save documents

def save_documents(documents: List[Dict], filename: str = "noonsite_documents.json"):
    """Save documents as JSON."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(documents, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(documents)} documents to {filename}")

def save_elasticsearch_bulk(documents: List[Dict], filename: str = "elasticsearch_bulk.ndjson"):
    """Save as Elasticsearch bulk format."""
    with open(filename, 'w', encoding='utf-8') as f:
        for doc in documents:
            action = {"index": {"_index": "noonsite", "_id": doc["id"]}}
            f.write(json.dumps(action, ensure_ascii=False) + '\n')
            f.write(json.dumps(doc, ensure_ascii=False) + '\n')
    print(f"Saved Elasticsearch bulk to {filename}")

# -----------------------------
# Run 

if __name__ == "__main__":
    print("Starting scraper...\n")
    
    documents = scrape_all_countries(COUNTRIES)
    
    save_documents(documents)
    save_elasticsearch_bulk(documents)
    
    print(f"\nDone! {len(documents)} documents ready")

Starting scraper...

[1/92] Anguilla
[2/92] Antigua & Barbuda
[3/92] Aruba
[4/92] Bahamas
[5/92] Barbados
[6/92] Bonaire
[7/92] British Virgin Islands
[8/92] Cayman Islands
[9/92] Cuba
[10/92] Curacao
[11/92] Dominica
[12/92] Dominican Republic
[13/92] Grenada
[14/92] Guadeloupe
[15/92] Haiti
[16/92] Jamaica
[17/92] Martinique
[18/92] Montserrat
[19/92] Puerto Rico
[20/92] Saba
[21/92] Sint Maarten
[22/92] Spanish Virgin Islands
[23/92] St. Barts
[24/92] St. Kitts & Nevis
[25/92] St. Lucia
[26/92] St. Martin
[27/92] St. Vincent & the Grenadines
[28/92] Statia
[29/92] Trinidad & Tobago
[30/92] Turks & Caicos
[31/92] US Virgin Islands
[32/92] Belize
[33/92] Canada
[34/92] Costa Rica
[35/92] El Salvador
[36/92] Guatemala
[37/92] Honduras
[38/92] Mexico
[39/92] Nicaragua
[40/92] Panama
[41/92] St. Pierre and Miquelon
Failed St. Pierre and Miquelon - main: 404 Client Error: Not Found for url: https://www.noonsite.com/place/st.-pierre-and-miquelon/
Failed St. Pierre and Miquelon - clearance:

# Chunking

In [4]:
# chunk by tokens

import json

with open('noonsite_documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

def tokenize(text):
    return text.split()

# Sliding window 
def sliding_window_tokens(tokens, size, step):
    results = []
    for i in range(0, len(tokens), step):
        chunk_tokens = tokens[i:i+size]
        results.append({
            'start': i,
            'content': ' '.join(chunk_tokens)
        })
        if i + size >= len(tokens):
            break
    return results

# Chunk documents
def chunk_documents(documents, chunk_size=500, overlap=100):
    chunked = []
    
    for doc in documents:
        content = doc.get('content', '')
        tokens = tokenize(content)
        
        if not tokens:
            continue
        
        step = chunk_size - overlap
        chunks = sliding_window_tokens(tokens, size=chunk_size, step=step)
        
        for idx, chunk in enumerate(chunks):
            chunked_doc = {
                'id': f"{doc['id']}_chunk_{idx}",
                'country': doc['country'],
                'section_type': doc['section_type'],
                'url': doc['url'],
                'content': chunk['content'],
                'chunk_index': idx,
                'start_token': chunk['start']
            }
            chunked.append(chunked_doc)
    
    return chunked


chunked_data = chunk_documents(documents, chunk_size=500, overlap=100)
with open('noonsite_chunked.json', 'w', encoding='utf-8') as f:
    json.dump(chosen_chunked, f, indent=2, ensure_ascii=False)



In [5]:
# Indexing

from minsearch import Index

index = Index(
    text_fields=["country", "section_type", "content"],
)
index.fit(chunked_data)

<minsearch.minsearch.Index at 0x7c5301b05880>

In [9]:
search_results = index.search('How safe is Antigue & Barbuda?')
search_results[1]

{'id': 'antigua_&_barbuda_security_chunk_0',
 'country': 'Antigua & Barbuda',
 'section_type': 'security',
 'url': 'https://www.noonsite.com/place/antigua-barbuda/view/security/',
 'content': 'Antigua & Barbuda Security for Yachts Based on reports to Noonsite from cruisers, petty theft is on the rise in the Caribbean in general. Cruisers should take basic safety precautions and use common sense when ashore. Avoid walking alone in isolated areas, especially at night. Don’t carry large amounts of cash around or wear expensive jewelry. The risks of petty theft, pickpocketing and assault increases during regattas and festivals. Do not leave drinks unattended. Antigua has the lowest violent crime rate in the Caribbean and severe crime against tourists is rare. Dinghy thieves operate throughout the Caribbean. Be sure to lift, chain, and lock your dinghy and outboard when not in use, especially at night. When leaving your dinghy ashore, ensure that your dinghy and outboard are securely locked

In [11]:
# Semantic chunking - but i don't need this because its already saved and ingested into minsearch

def chunk_by_sections(documents):
    """Chunk by natural sections (headers, FAQs, etc)"""
    chunked = []
    
    for doc in documents:
        content = doc.get('content', '')
        
       
        sections = [s.strip() for s in content.split('\n\n') if s.strip()]
        
        current_chunk = []
        current_tokens = 0
        chunk_idx = 0
        
        for section in sections:
            section_tokens = len(section.split())
            
            
            if current_tokens + section_tokens > 500 and current_chunk:
                chunked_doc = {
                    'id': f"{doc['id']}_chunk_{chunk_idx}",
                    'country': doc['country'],
                    'section_type': doc['section_type'],
                    'url': doc['url'],
                    'content': '\n\n'.join(current_chunk),
                    'chunk_index': chunk_idx
                }
                chunked.append(chunked_doc)
                
                # Start new chunk with overlap (last section)
                current_chunk = [current_chunk[-1], section]
                current_tokens = len(current_chunk[-2].split()) + section_tokens
                chunk_idx += 1
            else:
                current_chunk.append(section)
                current_tokens += section_tokens
        
        # Save remaining content
        if current_chunk:
            chunked_doc = {
                'id': f"{doc['id']}_chunk_{chunk_idx}",
                'country': doc['country'],
                'section_type': doc['section_type'],
                'url': doc['url'],
                'content': '\n\n'.join(current_chunk),
                'chunk_index': chunk_idx
            }
            chunked.append(chunked_doc)
    
    return chunked

# Compare approaches
semantic_chunks = chunk_by_sections(documents)
print(f"Semantic chunking: {len(semantic_chunks)} chunks")

Semantic chunking: 273 chunks


# Indexing

In [14]:
from minsearch import Index

semantic_index = Index(
    text_fields=["country", "section_type", "content"]
)

semantic_index.fit(semantic_chunks)


query = "How safe is Antigua & Barbuda?"
results = semantic_index.search(query)

for r in results[:3]:  
    print(f"ID: {r['id']}")
    print(f"Country: {r['country']}")
    print(f"Section: {r['section_type']}")
    print(f"Content snippet: {r['content'][:300]}...\n")

ID: antigua_&_barbuda_main_chunk_0
Country: Antigua & Barbuda
Section: main
Content snippet: Antigua & Barbuda Facts for Sailors
Antigua and Barbuda is a state in the West Indies between the Caribbean Sea and Atlantic Ocean, consisting of the two main islands of Antigua and Barbuda and a number of smaller islands including the uninhabited Redonda Island. Barbuda is approx. 30NM north of Ant...

ID: antigua_&_barbuda_clearance_chunk_0
Country: Antigua & Barbuda
Section: clearance
Content snippet: Antigua & Barbuda Pre-Arrival Procedures for Yachts
For a full explanation of Caribbean web clearance services see reportEastern Caribbean Cruising: Clearance Procedures Simplified.
SeeBarbuda Clearance in Codringtonfor more information.
Antigua & Barbuda Arrival Procedures for Yachts
Codrington, Ba...

ID: antigua_&_barbuda_security_chunk_0
Country: Antigua & Barbuda
Section: security
Content snippet: Antigua & Barbuda Security for Yachts
Based on reports to Noonsite from cruisers, petty theft

In [12]:
chunked_data = chunk_documents(documents)
semantic_chunks = chunk_by_sections(documents)

In [13]:
print(len(chunked_data), "token-based chunks")
print(len(semantic_chunks), "semantic chunks")


420 token-based chunks
273 semantic chunks


I'm going to use Elasticsearch for persistant storage. And the data is also already ready to be processed by elasticsearch. Don't really need some of the above code. 

Add that code here