# HTML Parsing and Token Counting
This notebook fetches a web page, counts tokens before and after parsing with BeautifulSoup, and shows a snippet comparison.

In [1]:
from bs4 import BeautifulSoup
import requests
from token_count import TokenCount

# Model name handed to TokenCount; kept as a constant so it is easy to spot and change.
# NOTE(review): "gpt-4o-turbo" does not look like a published OpenAI model name —
# confirm which tokenizer TokenCount actually resolves for it.
MODEL_NAME = "gpt-4o-turbo"

# Single counter instance shared by the token-counting cells below.
tc = TokenCount(model_name=MODEL_NAME)

In [2]:
# Fetch the raw HTML and count its tokens before any parsing is applied.
source = "https://x.com/CommitYourCode"  # CommitYourCode conference in Dallas
# requests has no default timeout; without one this cell can hang indefinitely.
response = requests.get(source, timeout=30)
# Stop on 4xx/5xx so an error page is never tokenized as if it were the target page.
response.raise_for_status()
raw_text = response.text
raw_token_count = tc.num_tokens_from_string(raw_text)
print(f"Token count before parsing: {raw_token_count}")
# The slice now matches the stated intent (1000 chars); it previously dumped up to 100000.
print(f"raw_text: {raw_text[:1000]}...")  # Print first 1000 characters for brevity

Token count before parsing: 58988
raw_text: <!DOCTYPE html><html dir="ltr" lang="en"><head><meta charset="utf-8" /><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=0,viewport-fit=cover" /><link rel="preconnect" href="//abs.twimg.com" /><link rel="dns-prefetch" href="//abs.twimg.com" /><link rel="preconnect" href="//api.twitter.com" /><link rel="dns-prefetch" href="//api.twitter.com" /><link rel="preconnect" href="//api.x.com" /><link rel="dns-prefetch" href="//api.x.com" /><link rel="preconnect" href="//pbs.twimg.com" /><link rel="dns-prefetch" href="//pbs.twimg.com" /><link rel="preconnect" href="//t.co" /><link rel="dns-prefetch" href="//t.co" /><link rel="preconnect" href="//video.twimg.com" /><link rel="dns-prefetch" href="//video.twimg.com" /><link nonce="ZWJmNjdlZTktNDQ1OC00YjU1LWIzOTgtYzZjMzBmNDRjN2Zk" rel="preload" as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/client-web-legacy/polyfills.45d1f09a.js

In [None]:
# Build a parse tree from the response bytes using Python's built-in HTML parser,
# then pull out the text content and count its tokens.
soup = BeautifulSoup(markup=response.content, features="html.parser")
parsed_text = soup.get_text()
parsed_token_count = tc.num_tokens_from_string(parsed_text)

# Report the post-parsing count followed by a short preview of the extracted text.
print(f"Token count after parsing: {parsed_token_count}")
print(f"parsed_text: {parsed_text[:1000]}...")  # Preview limited to the first 1000 characters

In [None]:
# Side-by-side preview: the first 500 characters of the raw HTML, then of the parsed text.
# Each print emits its heading and snippet on separate lines via sep="\n".
print("=== Before parsing (first 500 chars) ===", raw_text[:500], sep="\n")
print("\n=== After parsing (first 500 chars) ===", parsed_text[:500], sep="\n")