In [None]:
pip install requests beautifulsoup4

In [1]:
import requests
from bs4 import BeautifulSoup
import time

In [2]:
# Step 1: Scrape the Webpage
url = 'https://en.wikipedia.org/wiki/Hanford_Engineer_Works'
# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely. 10 seconds is an arbitrary upper bound.
response = requests.get(url, timeout=10)

In [3]:
# Extract the page text only when the request succeeded (HTTP 200);
# otherwise report the status code and continue with an empty string.
if response.status_code != 200:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    text_content = ""
else:
    soup = BeautifulSoup(response.content, 'html.parser')
    text_content = soup.get_text()

In [4]:
# Step 2: Define Chunking Methods
def fixed_length_chunking(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping slices of ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Number of characters per chunk; must be positive.

    Returns:
        A list of substrings in order. The last one may be shorter than
        ``chunk_size``; an empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step silently yields no chunks, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def semantic_chunking(text, start):
    """Return the paragraphs of ``text`` from character offset ``start`` onward.

    Everything before ``start`` is discarded; the remainder is split wherever
    two newline characters occur in a row. Empty pieces are kept.
    """
    return text[start:].split('\n\n')

def overlapping_chunking(text, chunk_size, overlap):
    """Split ``text`` into windows of ``chunk_size`` characters that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window begins with the last ``overlap`` characters of the previous
    one. Windows near the end of ``text`` may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Number of characters per window; must be positive.
        overlap: Number of characters shared by neighbouring windows; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings in order; an empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap == chunk_size would make the range() step zero, and a larger
    # overlap would make it negative and silently produce no chunks at all.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )
    stride = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), stride)]

In [5]:
# Define a simple function to calculate coherence using sentence boundaries
def calculate_sentence_boundary_coherence(chunk):
    """Count the sentence-ending characters ('.', '!' and '?') in ``chunk``.

    The score is the total number of occurrences of those three characters.
    """
    return sum(chunk.count(mark) for mark in ('.', '!', '?'))

In [15]:
# Step 3: Experiment with Different Chunk Sizes and Methods
chunk_sizes = [256, 512, 1024, 2048]
overlap = 50  # Example overlap for overlapping chunking
start_position = 5000  # Example start position for semantic chunking

# Every entry can be called as f(text, size), so the loop below needs no
# per-method branching and a new method only has to be added here.
chunking_methods = {
    'Fixed-Length': fixed_length_chunking,
    # Paragraph splitting ignores the size argument, so the Semantic results
    # are the same for every chunk size.
    'Semantic': lambda text, size: semantic_chunking(text, start_position),
    # overlap defaults to the value configured above but may still be passed
    # explicitly as a third argument.
    'Overlapping': lambda text, chunk_size, overlap=overlap: overlapping_chunking(text, chunk_size, overlap),
}

# results[method_name][chunk_size] -> dict of metrics for that combination
results = {}

for method_name, chunking_function in chunking_methods.items():
    results[method_name] = {}
    for size in chunk_sizes:
        chunks = chunking_function(text_content, size)

        # Only the scoring pass is timed; building the chunks is excluded.
        # perf_counter() is used because the measured intervals are well
        # under a millisecond, which is too short for the wall clock.
        start_time = time.perf_counter()

        # Perform the task on each chunk and collect metrics
        coherence_scores = [calculate_sentence_boundary_coherence(chunk) for chunk in chunks]

        end_time = time.perf_counter()
        processing_time = (end_time - start_time) * 1000  # seconds -> milliseconds

        total_coherence = sum(coherence_scores)

        # Store the results
        results[method_name][size] = {
            'num_chunks': len(chunks),
            'total_coherence': total_coherence,
            # Guard against division by zero when no chunks were produced
            'avg_coherence': total_coherence / len(coherence_scores) if coherence_scores else 0,
            'processing_time': processing_time
        }

In [17]:
# Step 4: Evaluate Results
for method_name, method_results in results.items():
    # Assemble the whole report for this method, then write it out at once.
    report_lines = [f"Chunking Method: {method_name}"]
    for size, metrics in method_results.items():
        report_lines.extend([
            f"  Chunk Size: {size}",
            f"    Number of Chunks: {metrics['num_chunks']}",
            f"    Total Coherence: {metrics['total_coherence']}",
            f"    Average Coherence: {metrics['avg_coherence']:.2f}",
            f"    Processing Time: {metrics['processing_time']:.2f} ms",
        ])
    # The trailing "\n" entry yields the two blank lines that separate methods.
    report_lines.append("\n")
    print("\n".join(report_lines))

Chunking Method: Fixed-Length
  Chunk Size: 256
    Number of Chunks: 372
    Total Coherence: 1478
    Average Coherence: 3.97
    Processing Time: 0.58 ms
  Chunk Size: 512
    Number of Chunks: 186
    Total Coherence: 1478
    Average Coherence: 7.95
    Processing Time: 0.45 ms
  Chunk Size: 1024
    Number of Chunks: 93
    Total Coherence: 1478
    Average Coherence: 15.89
    Processing Time: 0.35 ms
  Chunk Size: 2048
    Number of Chunks: 47
    Total Coherence: 1478
    Average Coherence: 31.45
    Processing Time: 0.29 ms


Chunking Method: Semantic
  Chunk Size: 256
    Number of Chunks: 290
    Total Coherence: 1440
    Average Coherence: 4.97
    Processing Time: 0.52 ms
  Chunk Size: 512
    Number of Chunks: 290
    Total Coherence: 1440
    Average Coherence: 4.97
    Processing Time: 0.32 ms
  Chunk Size: 1024
    Number of Chunks: 290
    Total Coherence: 1440
    Average Coherence: 4.97
    Processing Time: 0.25 ms
  Chunk Size: 2048
    Number of Chunks: 290
    T