# Tokenization

In [67]:
%pip install nltk

import nltk
nltk.download('punkt_tab')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/arantarokade/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Download Corpus

In [68]:
import os
# Use absolute path to project
PROJECT_DIR = '/Users/arantarokade/Documents/speedrun-neural-networks'
OUTPUT_DIR = f'{PROJECT_DIR}/output/tokens'
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")

def dump_data(file_name, data):
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(data)

Output directory: /Users/arantarokade/Documents/speedrun-neural-networks/output/tokens


In [69]:
# Download the 20 Newsgroups dataset - widely used for NLP evaluation
from sklearn.datasets import fetch_20newsgroups

# Select categories with domain-specific language, emails, technical jargon
categories = ['comp.sys.ibm.pc.hardware']

newsgroups = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=(),  # Keep headers, footers, quotes - contains email addresses, signatures
    random_state=42
)

# Combine multiple posts for a rich sample
text = '\n\n'.join(newsgroups.data)
dump_data(os.path.join(OUTPUT_DIR, '20NewsGroupsDataset-comp.sys.ibm.pc.hardware.txt'), text)

print("=" * 80)
print("CORPUS: 20 Newsgroups Dataset")
print("DESCRIPTION: Email-style Usenet posts with headers, signatures, quotes")
print(f"CATEGORIES: {', '.join(categories)}")
print(f"SAMPLE SIZE: {len(text):,} characters from {len(newsgroups.data[:50])} posts")
print("FEATURES: Email addresses, apostrophes, technical jargon, varied punctuation")
print("REFERENCE: http://qwone.com/~jason/20Newsgroups/")
print("CITATION: Lang, K. (1995). Newsweeder: Learning to filter netnews.")
print("          Proceedings of ICML-95.")
print("=" * 80)
print("\nSample text (first 1000 chars):\n")
print(text[:1000])
print("\n" + "=" * 80)

CORPUS: 20 Newsgroups Dataset
DESCRIPTION: Email-style Usenet posts with headers, signatures, quotes
CATEGORIES: comp.sys.ibm.pc.hardware
SAMPLE SIZE: 799,264 characters from 50 posts
FEATURES: Email addresses, apostrophes, technical jargon, varied punctuation
REFERENCE: http://qwone.com/~jason/20Newsgroups/
CITATION: Lang, K. (1995). Newsweeder: Learning to filter netnews.
          Proceedings of ICML-95.

Sample text (first 1000 chars):

From: badry@cs.UAlberta.CA (Badry Jason Theodore)
Subject: Chaining IDE drives
Summary: Trouble with Master/Slave drives
Nntp-Posting-Host: cab009.cs.ualberta.ca
Organization: University Of Alberta, Edmonton Canada
Lines: 16

Hi.  I am trying to set up a Conner 3184 and a Quantum 80AT drive.  I have
the conner set to the master, and the quantum set to the slave (doesn't work
the other way around).  I am able to access both drives if I boot from a 
floppy, but the drives will not boot themselves.  I am running MSDOS 6, and
have the Conner partitioned

## Tokenize!

In [70]:
import time
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, TreebankWordTokenizer, RegexpTokenizer
import os
import pandas as pd

# Define tokenizers to compare
tokenizers = {
    'sent_tokenize': lambda t: sent_tokenize(t),
    'word_tokenize': lambda t: word_tokenize(t),
    'WordPunctTokenizer': lambda t: WordPunctTokenizer().tokenize(t),
    'TreebankWordTokenizer': lambda t: TreebankWordTokenizer().tokenize(t),
    'RegexpTokenizer (\\w+)': lambda t: RegexpTokenizer(r'\w+').tokenize(t),
}

# Multiple iterations to record accurate timing
n_iterations = 10

results = []
original_chars = len(text)

for name, tokenizer_func in tokenizers.items():
    # Measure processing time (average over multiple runs)
    start_time = time.perf_counter()
    for itr in range(n_iterations):
        tokens = tokenizer_func(text)
        # dump tokens for inspection
        os.makedirs(os.path.join(OUTPUT_DIR, name), exist_ok=True)
        dump_data(os.path.join(OUTPUT_DIR, name, f'tokens_{itr}.txt'), '\n'.join(tokens))
    end_time = time.perf_counter()
    
    avg_time_ms = ((end_time - start_time) / n_iterations) * 1000
    
    # Calculate metrics
    token_count = len(tokens)
    vocab_size = len(set(tokens))
    avg_token_len = sum(len(t) for t in tokens) / token_count if token_count > 0 else 0
    
    # Compression Ratio = Original characters / Token count
    # Higher ratio means fewer tokens to represent the same text
    compression_ratio = original_chars / token_count if token_count > 0 else 0
    
    # Type-Token Ratio (lexical diversity)
    ttr = vocab_size / token_count if token_count > 0 else 0
    
    results.append({
        'Tokenizer': name,
        'Token Count': token_count,
        'Vocab Size': vocab_size,
        'Avg Token Length': round(avg_token_len, 2),
        'Processing Time (ms)': round(avg_time_ms, 2),
        'Compression Ratio': round(compression_ratio, 2),
        'Type-Token Ratio': round(ttr, 4),
    })

# Create DataFrame and display
df = pd.DataFrame(results)
print(f"Corpus size: {original_chars:,} characters\n")
print(df.to_string(index=False))

Corpus size: 799,264 characters

            Tokenizer  Token Count  Vocab Size  Avg Token Length  Processing Time (ms)  Compression Ratio  Type-Token Ratio
        sent_tokenize         6496        6120            120.69                 76.39             123.04            0.9421
        word_tokenize       163828       15256              3.86                416.20               4.88            0.0931
   WordPunctTokenizer       178238       14819              3.54                 37.20               4.48            0.0831
TreebankWordTokenizer       156431       16604              4.04                234.49               5.11            0.1061
RegexpTokenizer (\w+)       133099       14237              4.24                 23.95               6.01            0.1070


## Analysis

### Emails

In [71]:
# get emails from corpus
import re
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
emails = re.findall(email_pattern, text)
unique_emails = set(emails)
print(f"\nExtracted {len(unique_emails)} unique email addresses:\n")
print(f'First 10 emails: {"\n".join(list(unique_emails)[:10])}')


Extracted 1118 unique email addresses:

First 10 emails: hamilton@osiris.cso.uiuc.edu
1qsa97INNm7b@dns1.NMSU.Edu
michael@jester.GUN.de
CASEY.93Apr9115458@grace.wharton.upenn.edu
JMARTTILA@FINABO.ABO.FI
SEXTON@CLAES.SPACE.LOCKHEED.COM
ladanyi@cs.cornell.edu
egzondag@prl.philips.nl
k4bnc@cbnewsh.att.com
ryvg90@email.sps.mot.com


In [75]:
emails_dict = {}
for dir in os.listdir(OUTPUT_DIR):
    dir_path = os.path.join(OUTPUT_DIR, dir)
    if os.path.isdir(dir_path):
        print(f"\nChecking tokenizer: {dir}")
        emails_found = set()
        # check first file in the directory
        file = os.listdir(dir_path)[0]
        file_path = os.path.join(dir_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            tokens = f.read().splitlines()
            for token in tokens:
                if re.match(email_pattern, token):
                    emails_found.add(token)
        print(f"  Found {len(emails_found)} unique email addresses in tokens.")
        print(f"  Sample emails: ", list(emails_found)[:5])
        emails_dict[dir] = emails_found


Checking tokenizer: RegexpTokenizer (\w+)
  Found 139 unique email addresses in tokens.
  Sample emails:  ['huot@cray.com ', "kevin@kosman.uucp (Kevin O'Gorman) writes:", 'egzondag@prl.philips.nl', 'k4bnc@cbnewsh.att.com', 'satam@saathi.ernet.in']

Checking tokenizer: WordPunctTokenizer
  Found 0 unique email addresses in tokens.
  Sample emails:  []

Checking tokenizer: sent_tokenize
  Found 147 unique email addresses in tokens.
  Sample emails:  ['huot@cray.com ', '+----s913579@minyos.xx.rmit.OZ.AU---Royal Melbourne Institute of Technology---+', "kevin@kosman.uucp (Kevin O'Gorman) writes:", 'egzondag@prl.philips.nl', 'k4bnc@cbnewsh.att.com']

Checking tokenizer: .ipynb_checkpoints
  Found 139 unique email addresses in tokens.
  Sample emails:  ['huot@cray.com ', "kevin@kosman.uucp (Kevin O'Gorman) writes:", 'egzondag@prl.philips.nl', 'k4bnc@cbnewsh.att.com', 'satam@saathi.ernet.in']

Checking tokenizer: word_tokenize
  Found 0 unique email addresses in tokens.
  Sample emails:  []



### Contractions

In [77]:
contractions = ["don't", "can't", "won't", "I'm", "it's", "they're", "you're", "we're", "isn't", "aren't", "couldn't", "shouldn't", "wouldn't", "didn't", "hasn't", "haven't", "doesn't", "there's", "here's", "that's", "you'll", "we'll", "they'll", "she's", "he's", "y'all"]

contractions_dict = {}

for dir in os.listdir(OUTPUT_DIR):
    dir_path = os.path.join(OUTPUT_DIR, dir)
    if os.path.isdir(dir_path):
        print(f"\nChecking tokenizer: {dir}")
        contractions_found = set()
        # check first file in the directory
        file = os.listdir(dir_path)[0]
        file_path = os.path.join(dir_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            tokens = f.read().splitlines()
            for token in tokens:
                if token in contractions:
                    contractions_found.add(token)
        print(f"  Found {len(contractions_found)} unique contractions in tokens.")
        print(f"  Sample contractions: ", list(contractions_found)[:5])
        contractions_dict[dir] = contractions_found


Checking tokenizer: RegexpTokenizer (\w+)
  Found 0 unique contractions in tokens.
  Sample contractions:  []

Checking tokenizer: WordPunctTokenizer
  Found 0 unique contractions in tokens.
  Sample contractions:  []

Checking tokenizer: sent_tokenize
  Found 2 unique contractions in tokens.
  Sample contractions:  ["haven't", "I'm"]

Checking tokenizer: .ipynb_checkpoints
  Found 0 unique contractions in tokens.
  Sample contractions:  []

Checking tokenizer: word_tokenize
  Found 0 unique contractions in tokens.
  Sample contractions:  []

Checking tokenizer: TreebankWordTokenizer
  Found 14 unique contractions in tokens.
  Sample contractions:  ["aren't", "doesn't", "you're", "haven't", "I'm"]


### Top Tokens

In [78]:
# get the top 10 most common tokens from each tokenizer
from collections import Counter
for dir in os.listdir(OUTPUT_DIR):
    dir_path = os.path.join(OUTPUT_DIR, dir)
    if os.path.isdir(dir_path):
        print(f"\nChecking tokenizer: {dir}")
        token_counter = Counter()
        # check first file in the directory
        file = os.listdir(dir_path)[0]
        file_path = os.path.join(dir_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            tokens = f.read().splitlines()
            token_counter.update(tokens)
        most_common = token_counter.most_common(10)
        print(f"  Top 10 most common tokens:")
        for token, count in most_common:
            print(f"    '{token}': {count} occurrences")



Checking tokenizer: RegexpTokenizer (\w+)
  Top 10 most common tokens:
    '': 4955 occurrences
    '>': 191 occurrences
    '-- ': 154 occurrences
    'Distribution: world': 59 occurrences
    '--': 58 occurrences
    '> ': 54 occurrences
    '          ': 46 occurrences
    ' ': 40 occurrences
    'Distribution: usa': 39 occurrences
    'Subject: Re: IDE vs SCSI': 33 occurrences

Checking tokenizer: WordPunctTokenizer
  Top 10 most common tokens:
    '.': 11502 occurrences
    ':': 5140 occurrences
    ',': 4942 occurrences
    'the': 4349 occurrences
    '-': 3541 occurrences
    'I': 2804 occurrences
    '(': 2457 occurrences
    'a': 2338 occurrences
    'to': 2313 occurrences
    '>': 2143 occurrences

Checking tokenizer: sent_tokenize
  Top 10 most common tokens:
    '': 2938 occurrences
    '>': 191 occurrences
    '-- ': 154 occurrences
    'Distribution: world': 59 occurrences
    '--': 58 occurrences
    '> ': 54 occurrences
    'Distribution: usa': 39 occurrences
    'Subj