# Web Content Parser

This notebook extracts text content from web pages given URLs and outputs the results as `url, content_string` pairs.

In [1]:
# Install required packages (run this cell first if packages are not installed)
!pip install requests beautifulsoup4 pandas lxml



In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin, urlparse
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
def extract_web_content(url, delay=1, timeout=15):
    """
    Extract text content from a web page
    
    Args:
        url (str): URL to parse
        delay (int): Delay between requests in seconds
        timeout (int): Request timeout in seconds
        
    Returns:
        dict: Dictionary containing url and content_string
    """
    result = {
        'url': url,
        'content_string': '',
        'status': 'success',
        'error': None
    }
    
    try:
        print(f"Fetching content from: {url}")
        
        # Set headers to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Make the request
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style", "nav", "footer", "header"]):
            script.decompose()
        
        # Extract text content
        # Try to get main content areas first
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main', re.I))
        
        if main_content:
            content_text = main_content.get_text()
        else:
            # Fall back to body content
            content_text = soup.get_text()
        
        # Clean the extracted text
        # Remove extra whitespace and newlines
        content_text = re.sub(r'\s+', ' ', content_text)
        content_text = content_text.strip()
        
        result['content_string'] = content_text
        
        print(f"Successfully extracted {len(result['content_string'])} characters")
        
    except requests.exceptions.RequestException as e:
        result['status'] = 'error'
        result['error'] = f"Request error: {str(e)}"
        print(f"Request error: {e}")
        
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f"Parsing error: {str(e)}"
        print(f"Parsing error: {e}")
    
    # Add delay to be respectful to the server
    time.sleep(delay)
    
    return result

## Parse Single URL

In [4]:
# Test with the provided URL
test_url = "https://www.cmegroup.com/markets/equities/sp/e-mini-sandp500.contractSpecs.options.html"

# Extract content
result = extract_web_content(test_url)

# Display results
print(f"\nURL: {result['url']}")
print(f"Status: {result['status']}")

if result['status'] == 'success':
    print(f"Content Length: {len(result['content_string'])} characters")
    print(f"\nContent Preview (first 500 characters):")
    print(result['content_string'][:500] + "..." if len(result['content_string']) > 500 else result['content_string'])
else:
    print(f"Error: {result['error']}")

Fetching content from: https://www.cmegroup.com/markets/equities/sp/e-mini-sandp500.contractSpecs.options.html
Successfully extracted 19322 characters

URL: https://www.cmegroup.com/markets/equities/sp/e-mini-sandp500.contractSpecs.options.html
Status: success
Content Length: 19322 characters

Content Preview (first 500 characters):
Capitalize on the around-the-clock liquidity of E-mini S&P 500 futures (ES), and take advantage of one of the most efficient and cost-effective ways to gain market exposure to the S&P 500 Index, a broad-based, capitalization-weighted index that tracks 500 of the largest companies of the US economy and a key indicator of the stock market’s health. With ES futures, you can take positions on S&P 500 performance electronically. Capitalize on the around-the-clock liquidity of E-mini S&P 500 futures (...


## Create DataFrame and Save Results

In [5]:
# Create DataFrame with the result
if result['status'] == 'success':
    df = pd.DataFrame([{
        'url': result['url'],
        'content_string': result['content_string']
    }])
    
    print("DataFrame created:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    
    # Display DataFrame info
    print("\nDataFrame info:")
    print(df.info())
    
    # Save to CSV
    output_file = 'extracted_web_content.csv'
    df.to_csv(output_file, index=False)
    print(f"\nData saved to: {output_file}")
else:
    print("Cannot create DataFrame due to extraction error")

DataFrame created:
Shape: (1, 2)
Columns: ['url', 'content_string']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             1 non-null      object
 1   content_string  1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes
None

Data saved to: extracted_web_content.csv


## Parse Multiple URLs

In [7]:
# Example with multiple URLs
urls_to_parse = [
    "https://www.cmegroup.com/markets/equities/sp/e-mini-sandp500.contractSpecs.options.html",
    "https://www.cmegroup.com/markets/equities/sp/e-mini-sandp500.html",
    # Add more URLs as needed
]

def parse_multiple_urls(urls, delay=2):
    """
    Parse multiple URLs and return DataFrame
    """
    results = []
    
    for i, url in enumerate(urls, 1):
        print(f"\nProcessing URL {i}/{len(urls)}")
        result = extract_web_content(url, delay=delay)
        
        results.append({
            'url': result['url'],
            'content_string': result['content_string'] if result['status'] == 'success' else '',
            'status': result['status'],
            'content_length': len(result['content_string']) if result['status'] == 'success' else 0
        })
    
    return pd.DataFrame(results)

# Parse multiple URLs (uncomment to run)
# multi_df = parse_multiple_urls(urls_to_parse)
# print("\nMultiple URLs Results:")
# print(multi_df[['url', 'status', 'content_length']])
# multi_df.to_csv('multiple_urls_content.csv', index=False)
# print("Results saved to: multiple_urls_content.csv")

## Parse URLs from CSV File

In [10]:
def parse_urls_from_csv(csv_file, url_column='url'):
    """
    Read URLs from CSV file and extract content
    """
    try:
        # Read URLs from CSV
        urls_df = pd.read_csv(csv_file)
        print(f"Loaded CSV with {len(urls_df)} rows")
        print(f"Columns: {list(urls_df.columns)}")
        
        if url_column not in urls_df.columns:
            print(f"Error: Column '{url_column}' not found in CSV")
            return None
        
        urls_list = urls_df[url_column].dropna().tolist()
        print(f"Found {len(urls_list)} URLs to parse")
        
        # Parse URLs
        results_df = parse_multiple_urls(urls_list)
        
        # Save results
        output_file = 'parsed_content_from_csv.csv'
        results_df.to_csv(output_file, index=False)
        print(f"\nResults saved to: {output_file}")
        
        return results_df
        
    except FileNotFoundError:
        print(f"Error: CSV file '{csv_file}' not found")
        return None
    except Exception as e:
        print(f"Error processing CSV: {e}")
        return None



In [None]:
# Example usage (uncomment and update path to use)
csv_results = parse_urls_from_csv('cme_pages_url.csv', 'url')
if csv_results is not None:
   # print(csv_results.head())
   csv_results.to_csv('parsed_content.csv', index=False)

Error: CSV file 'cme_pages_urls.csv' not found


## Content Analysis

In [None]:
# Analyze the extracted content
if 'df' in locals() and not df.empty:
    content = df['content_string'].iloc[0]
    
    print("Content Analysis:")
    print(f"Total characters: {len(content)}")
    print(f"Total words: {len(content.split())}")
    print(f"Total lines: {len(content.split('\n'))}")
    
    # Find common words (basic analysis)
    words = content.lower().split()
    word_freq = {}
    for word in words:
        word = re.sub(r'[^\w]', '', word)  # Remove punctuation
        if len(word) > 3:  # Only words longer than 3 characters
            word_freq[word] = word_freq.get(word, 0) + 1
    
    # Top 10 most frequent words
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
    print("\nTop 10 most frequent words:")
    for word, count in top_words:
        print(f"  {word}: {count}")
else:
    print("No content available for analysis")