In [53]:
import trafilatura

In [54]:
# Page scraped by the following cells: the NHS "baby's first solid foods" article.
url = "https://www.nhs.uk/conditions/baby/weaning-and-feeding/babys-first-solid-foods/"

In [55]:
# Download the page at `url` and report whether anything usable came back
# (a falsy result is treated as a failed fetch).
html = trafilatura.fetch_url(url)
print("HTML content fetched successfully!" if html else "Failed to fetch HTML content.")




HTML content fetched successfully!


In [56]:
# Run trafilatura's extractor on the downloaded HTML with its default arguments.
text = trafilatura.extract(html)

In [57]:
# Preview only the first 1000 characters of the extracted text.
text[:1000]

"When to start introducing solid foods\nIntroducing your baby to solid foods, sometimes called complementary feeding or weaning, should start when your baby is around 6 months old.\nAt the beginning, how much your baby eats is less important than getting them used to the idea of eating.\nThey'll still be getting most of their energy and nutrients from breast milk or first infant formula.\nGiving your baby a variety of foods, alongside breast or formula milk, from around 6 months of age will help set your child up for a lifetime of healthier eating.\nGradually, you'll be able to increase the amount and variety of food your baby eats until they can eat the same foods as the rest of the family, in smaller portions.\nIf your baby was born prematurely, ask your health visitor or GP for advice on when to start introducing solid foods.\nWhy wait until around 6 months to introduce solids?\nIt's a good idea to wait until around 6 months before introducing solid foods because:\n- breast milk or 

In [58]:
from trafilatura import fetch_url, extract, settings
import json

# Define the URL
url = "https://www.nhs.uk/conditions/baby/weaning-and-feeding/babys-first-solid-foods/"

# Start from trafilatura's default configuration.
# NOTE: link extraction and the output format are keyword arguments of extract(),
# not settings-file options. Writing "include-links" / "output-format" into the
# config is not something trafilatura reads, so those calls were dropped and
# include_links=True is passed to extract() instead.
config = settings.use_config()

# Fetch and extract the content
html = fetch_url(url)
if html is None:
    # fetch_url() signals a failed download with None; skip extraction in that case.
    result_json = None
else:
    result_json = extract(html, config=config, output_format="json", include_links=True)

# extract() yields None when nothing could be extracted, hence the fallback to {}.
data = json.loads(result_json) if result_json else {}
print("Extracted keys:", list(data.keys()))


Extracted keys: ['text', 'comments']


In [61]:
# Quick helper cell (not polished): it only collects the article links that are
# pasted into the next script.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urldefrag, urljoin

url = "https://www.nhs.uk/conditions/baby/weaning-and-feeding/"

# Fetch the page content. The timeout prevents an indefinite hang, and
# raise_for_status() stops us from parsing an HTTP error page as if it were the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Extract all <a> tags and filter for those with href attributes
all_links = [a.get("href") for a in soup.find_all("a") if a.get("href")]

# Filter to include only internal links (containing 'nhs.uk' or starting with '/')
internal_links = [link for link in all_links if "nhs.uk" in link or link.startswith("/")]

# Keep only pages that live below `url`:
#   - urljoin() resolves relative hrefs against the index page,
#   - urldefrag() drops "#fragment" parts so in-page anchors collapse onto their page,
#   - the index page itself is excluded,
#   - dict.fromkeys() removes duplicates while preserving first-seen order.
# (The previous filter also accepted every link containing "nhs.uk", which made
# the prefix test irrelevant.)
absolute_links = (urldefrag(urljoin(url, link)).url for link in all_links)
relevant_links = list(dict.fromkeys(
    link for link in absolute_links if link.startswith(url) and link != url
))
print("Found links:")
for link in relevant_links:
    print(link)

# Copy the printed links ("https://www.nhs.uk/conditions/baby/weaning-and-feeding/...")
# from this output into the next script.

Found links:
https://www.nhs.uk/conditions/baby/weaning-and-feeding/babys-first-solid-foods/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/help-your-baby-enjoy-new-foods/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/baby-and-toddler-meal-ideas/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/childrens-food-safety-and-hygiene/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/drinks-and-cups-for-babies-and-young-children/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/food-allergies-in-babies-and-young-children/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/foods-to-avoid-giving-babies-and-young-children/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/fussy-eaters/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/vitamins-for-children/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/what-to-feed-young-children/
https://www.nhs.uk/conditions/baby/weaning-and-feeding/young-children-and-food-common-questions/


In [27]:
import os
import re
import json
import logging
from datetime import datetime
from urllib.parse import urljoin

import trafilatura

# Optional: Use nltk for sentence tokenization if available.
# USE_SENT_TOKENIZATION ends up True only when sent_tokenize() actually works:
# nltk must be importable AND its Punkt sentence model must be installed.
try:
    import nltk
    from nltk.tokenize import sent_tokenize

    _PROBE_TEXT = "First probe sentence. Second probe sentence."
    try:
        # Cheap check that the tokenizer data is already present locally.
        sent_tokenize(_PROBE_TEXT)
    except LookupError:
        # Model missing: fetch it. Older NLTK releases load 'punkt', newer ones
        # load 'punkt_tab', so request both and probe again. A second LookupError
        # (e.g. no network) is handled by the outer except clause.
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        sent_tokenize(_PROBE_TEXT)
    USE_SENT_TOKENIZATION = True
except (ImportError, LookupError):
    USE_SENT_TOKENIZATION = False

# === SETTINGS ===
DATA_DIR = "data/raw"  # Store raw scraped data separately
# Create the output directory at definition time; exist_ok=True makes re-running
# this cell harmless when the directory is already there.
os.makedirs(DATA_DIR, exist_ok=True)

# Set up logging
# INFO level and above, formatted as "<timestamp> <LEVEL>: <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

# === HELPERS ===

def fetch_clean_text(url):
    """
    Download a page and return its extracted text with links included.

    Args:
        url: Address of the page to download. It is also used as the base
            against which relative links in the extracted text are resolved.

    Returns:
        The string produced by ``trafilatura.extract(html, include_links=True)``
        after being passed through ``normalize_links``.

    Raises:
        ValueError: If the page cannot be fetched, or if trafilatura could not
            extract any text from the downloaded HTML.
    """
    html = trafilatura.fetch_url(url)
    if html is None:
        raise ValueError(f"Could not fetch: {url}")
    extracted = trafilatura.extract(html, include_links=True)
    # extract() returns None when it finds nothing usable. Fail with a clear
    # message here instead of letting re.sub() raise a TypeError on None later.
    if extracted is None:
        raise ValueError(f"Could not extract any text from: {url}")
    return normalize_links(extracted, base_url=url)

def normalize_links(text, base_url):
    """
    Convert root-relative Markdown link targets into absolute URLs.

    Only the ``(/path)`` part of a Markdown-style link ``[label](/path)`` is
    rewritten. Ordinary parenthesised prose that merely starts with a slash is
    left untouched, as are targets that are already absolute.

    Args:
        text: Text containing Markdown-style links.
        base_url: URL against which root-relative targets are resolved.

    Returns:
        ``text`` with every ``](/...)`` target replaced by
        ``urljoin(base_url, target)``.
    """
    # The lookbehind requires the "(" to directly follow the "]" that closes a
    # link label, so plain parentheticals such as "(/ per day)" are not matched.
    link_pattern = re.compile(r'(?<=\])\((/[^)]+)\)')
    return link_pattern.sub(lambda m: f'({urljoin(base_url, m.group(1))})', text)

def estimate_tokens(text):
    """
    Approximate the number of tokens in ``text``.

    The estimate is 1.3 tokens per whitespace-separated word, truncated to an
    integer.
    """
    word_count = len(text.split())
    scaled = word_count * 1.3
    return int(scaled)

def chunk_text(text, max_words=100, overlap=20, use_sentence_splitting=False):
    """
    Split ``text`` into overlapping chunks of roughly ``max_words`` words.

    Two strategies are available:

    - Sentence mode (``use_sentence_splitting`` is True and nltk sentence
      tokenization is available): whole sentences are accumulated until adding
      the next one would exceed ``max_words``. The chunk is then closed and the
      next chunk starts with the last ``overlap`` words of the closed chunk.
      A single sentence longer than ``max_words`` is kept intact, so chunks can
      exceed the limit.
    - Word mode (otherwise): a window of ``max_words`` words slides forward by
      ``max_words - overlap`` words. The last window always ends at the last
      word; no extra chunk consisting only of repeated overlap words is emitted.

    Args:
        text: Text to split.
        max_words: Target maximum number of words per chunk; must be >= 1.
        overlap: Number of words shared between consecutive chunks. Values
            outside ``0 .. max_words - 1`` are clamped into that range so the
            window always advances.
        use_sentence_splitting: Request sentence mode (see above).

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    # An overlap >= max_words would make the word window stand still (endless
    # loop); a negative one would silently skip words. Clamp instead.
    overlap = max(0, min(overlap, max_words - 1))

    if use_sentence_splitting and USE_SENT_TOKENIZATION:
        chunks = []
        parts = []   # optional overlap prefix followed by whole sentences
        length = 0   # number of words currently held in `parts`
        for sentence in sent_tokenize(text):
            sentence_word_count = len(sentence.split())
            # Close the current chunk if this sentence would push it over the limit.
            if parts and length + sentence_word_count > max_words:
                finished = " ".join(parts)
                chunks.append(finished)
                # Overlap is measured in WORDS taken from the end of the closed chunk.
                tail = finished.split()[-overlap:] if overlap else []
                parts = [" ".join(tail)] if tail else []
                length = len(tail)
            parts.append(sentence)
            length += sentence_word_count
        if parts:
            chunks.append(" ".join(parts))
        return chunks

    # Simple word-based splitting
    words = text.split()
    step = max_words - overlap  # always >= 1 thanks to the clamp above
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_words
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; anything further would
            # only repeat words that are already part of this chunk.
            break
    return chunks

def extract_sections(text):
    """
    Group the lines of ``text`` into ``(heading, body)`` pairs.

    A line counts as a heading when, after stripping whitespace, it:
    - is not empty,
    - has at most 8 words,
    - begins with an uppercase character, and
    - contains none of the characters ``.``, ``?``, ``!`` or ``:``.

    Every other line is body text for the most recent heading. Lines that
    appear before the first heading are filed under "Unknown". A heading that
    is followed immediately by another heading (no lines in between) produces
    no entry.

    Returns:
        A list of ``(heading, body)`` tuples in document order, where ``body``
        is the collected lines joined with newlines and stripped.
    """
    def looks_like_heading(candidate):
        if not candidate:
            return False
        if len(candidate.split()) > 8:
            return False
        if not candidate[0].isupper():
            return False
        return all(mark not in candidate for mark in ".?!:")

    collected = []
    heading = "Unknown"
    pending = []

    for raw_line in text.split("\n"):
        candidate = raw_line.strip()
        if not looks_like_heading(candidate):
            pending.append(raw_line)
            continue
        # A new heading starts: close the section gathered so far, if any.
        if pending:
            collected.append((heading, "\n".join(pending).strip()))
            pending = []
        heading = candidate

    if pending:
        collected.append((heading, "\n".join(pending).strip()))
    return collected

def process_url(url, title, output_filename, max_words=100, overlap=20, use_sentence_splitting=False):
    """
    Scrape one page, split it into chunk records and save them as JSON.

    Steps:
      - fetch and clean the page text with ``fetch_clean_text``,
      - split it into sections with ``extract_sections``,
      - chunk every section with ``chunk_text``,
      - attach metadata to each chunk and write the list to
        ``DATA_DIR/output_filename``.

    Args:
        url: Page to scrape; stored as ``source_url`` on every chunk.
        title: Human-readable page title; stored on every chunk and used as the
            first part of each ``chunk_id``.
        output_filename: File name (inside ``DATA_DIR``) for the JSON output.
        max_words, overlap, use_sentence_splitting: Forwarded to ``chunk_text``.

    Returns:
        The list of chunk dictionaries. An empty list is returned when the page
        could not be fetched or cleaned. A failure to write the file is logged
        and the chunks are still returned.
    """
    logging.info(f"Processing URL: {url}")
    try:
        clean_text = fetch_clean_text(url)
    except Exception as e:
        logging.error(f"Error fetching or cleaning text from {url}: {e}")
        return []

    # Naive UTC timestamp (no offset suffix), shared by all chunks of this page.
    timestamp = datetime.utcnow().isoformat()
    section_blocks = extract_sections(clean_text)
    title_slug = title.replace(' ', '_').lower()
    used_ids = set()
    all_chunks = []

    for section, sec_text in section_blocks:
        section_slug = section.replace(' ', '_').lower()
        chunks = chunk_text(sec_text, max_words=max_words, overlap=overlap, use_sentence_splitting=use_sentence_splitting)
        for i, chunk in enumerate(chunks):
            base_id = f"{title_slug}_{section_slug}_{i}"
            # Two sections can share a heading (or both fall under "Unknown"),
            # which used to yield identical ids. Keep the first id unchanged and
            # give later collisions a "_dupN" suffix so every id is unique.
            chunk_id = base_id
            duplicate = 1
            while chunk_id in used_ids:
                duplicate += 1
                chunk_id = f"{base_id}_dup{duplicate}"
            used_ids.add(chunk_id)
            all_chunks.append({
                "chunk": chunk,
                "section": section,
                "source_url": url,
                "title": title,
                "chunk_id": chunk_id,
                "scraped_at": timestamp,
                "tokens": estimate_tokens(chunk)
            })

    output_path = os.path.join(DATA_DIR, output_filename)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(all_chunks, f, indent=2, ensure_ascii=False)
        logging.info(f"Saved {len(all_chunks)} chunks to {output_path}")
    except Exception as e:
        logging.error(f"Error saving file {output_path}: {e}")
    return all_chunks

# === BATCH PROCESSING ===
# Pages to scrape, as {key: URL}. batch_process_urls derives both the page title
# (underscores replaced by spaces, title-cased) and the output file name
# ("<key>.json") from the key.
urls = {
    "first_solid_food": "https://www.nhs.uk/conditions/baby/weaning-and-feeding/babys-first-solid-foods/",
}

def batch_process_urls(url_map, max_words=100, overlap=20, use_sentence_splitting=False):
    """
    Run ``process_url`` for every entry of ``url_map``.

    Args:
        url_map: Mapping of key -> page URL. The key supplies the page title
            (underscores become spaces, then title-cased) and the output file
            name (``"<key>.json"``).
        max_words, overlap, use_sentence_splitting: Chunking options passed
            unchanged to ``process_url``.

    Returns:
        None. The per-page results are not collected.
    """
    chunking_options = {
        "max_words": max_words,
        "overlap": overlap,
        "use_sentence_splitting": use_sentence_splitting,
    }
    for slug, page_url in url_map.items():
        page_title = slug.replace("_", " ").title()
        process_url(page_url, title=page_title, output_filename=f"{slug}.json", **chunking_options)

# Run batch processing (adjust parameters as needed)
# Processes every entry of `urls` with max_words=150 and overlap=30, and requests
# sentence-based splitting (chunk_text only honours it when nltk is available).
if __name__ == "__main__":
    batch_process_urls(urls, max_words=150, overlap=30, use_sentence_splitting=True)


2025-04-03 19:37:19,764 INFO: Processing URL: https://www.nhs.uk/conditions/baby/weaning-and-feeding/babys-first-solid-foods/
2025-04-03 19:37:19,947 INFO: Saved 32 chunks to data/raw/first_solid_food.json


In [36]:
# Scratch check: same article URL as the one listed in `urls`.
url = "https://www.nhs.uk/conditions/baby/weaning-and-feeding/babys-first-solid-foods/"

In [37]:
# Download the page; fetch_clean_text treats a None result here as a failed fetch.
html = trafilatura.fetch_url(url)

In [38]:
# Same extraction call that fetch_clean_text makes (links included), before link normalization.
text = trafilatura.extract(html, include_links=True)

In [39]:
# Display the full extracted text for manual inspection.
text

'When to start introducing solid foods\nIntroducing your baby to solid foods, sometimes called complementary feeding or weaning, should start when your baby is around 6 months old.\nAt the beginning, how much your baby eats is less important than getting them used to the idea of eating.\nThey\'ll still be getting most of their energy and nutrients from breast milk or first infant formula.\nGiving your baby a variety of foods, alongside breast or formula milk, from around 6 months of age will help set your child up for a lifetime of healthier eating.\nGradually, you\'ll be able to increase the amount and variety of food your baby eats until they can eat the same foods as the rest of the family, in smaller portions.\nIf your baby was born prematurely, ask your health visitor or GP for advice on when to start introducing solid foods.\nWhy wait until around 6 months to introduce solids?\nIt\'s a good idea to wait until around 6 months before introducing solid foods because:\n- breast milk 

﻿<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="How to move on to solid foods and drinks, meal ideas and food safety." name="description"/>
<meta content="a4yrvgi5ZlBnKWfqFKkQ3_mEjqow_fpwbtF2bUTmZgc" name="google-site-verification">
<link href="https://www.nhs.uk/conditions/baby/weaning-and-feeding/" rel="canonical"/>
<title>Weaning and feeding - NHS</title>
<link crossorigin="" href="https://assets.nhs.uk/" rel="preconnect"/>
<link as="font" crossorigin="" href="https://assets.nhs.uk/fonts/FrutigerLTW01-55Roman.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="" href="https://assets.nhs.uk/fonts/FrutigerLTW01-65Bold.woff2" rel="preload" type="font/woff2"/>
<link href="/static/nhsuk/css/main.65c2224cbdec.css" rel="stylesheet" type="text/css">
<script type="application/ld+json">{"@context": "ht

In [50]:
import sys
# Print the path of the Python interpreter this kernel is running on.
print(sys.executable)

/usr/local/opt/python@3.11/bin/python3.11
