In [None]:
#!/usr/bin/env python3
import os
import random
import subprocess
import fasttext
from typing import Optional, List
from warcio.archiveiterator import ArchiveIterator

from cs336_data.harmful import classify_nsfw, classify_toxic_speech
from cs336_data.language_id import identify_language
from cs336_data.parsing import extract_text_from_html_bytes
from cs336_data.privacy import mask_emails, mask_ips, mask_phone_numbers
from cs336_data.quality import gopher_quality_filter

def subsample_urls(input_file: str, output_file: str, sample_size: int, seed: Optional[int] = None) -> None:
    """
    Subsample URLs from input file and write to output file.

    Every non-blank line of ``input_file`` is treated as one URL. The sampled
    URLs are written one per line, each terminated by a newline.

    Args:
        input_file: Path to input file containing URLs (one per line)
        output_file: Path to output file to write subsampled URLs
        sample_size: Number of URLs to sample; capped at the number of URLs
            available in ``input_file``
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same sample is drawn on every call; when ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: If ``sample_size`` is negative (raised by ``random.sample``).
    """
    with open(input_file, 'r') as f:
        # Strip line endings and drop blank lines. Keeping the raw lines would
        # glue two URLs together whenever the file's last line has no trailing
        # newline and is sampled into a non-final position.
        urls = [line.strip() for line in f if line.strip()]

    # Subsample
    rng = random.Random(seed) if seed is not None else random
    sampled_urls = rng.sample(urls, min(sample_size, len(urls)))

    with open(output_file, 'w') as f:
        f.writelines(f"{url}\n" for url in sampled_urls)

    print(f"Subsampled {len(sampled_urls)} URLs from {len(urls)} total URLs")


def fetch_urls(url_file: str, warc_file: str) -> None:
    """
    Fetch URLs using wget and save to WARC file.

    The command is passed to ``subprocess.run`` as an argument list (no shell),
    so paths containing spaces or shell metacharacters are handed to wget
    verbatim instead of being re-interpreted by a shell.

    Failures are reported with a printed message rather than raised.
    NOTE(review): wget can exit non-zero when only some URLs failed; a partial
    WARC may still have been written in that case -- confirm before treating
    the printed error as "nothing was fetched".

    Args:
        url_file: Path to file containing URLs to fetch
        warc_file: Path to output WARC file (without the ``.warc.gz`` suffix)
    """
    import shlex  # local import: only used to render the command for the log line

    cmd = [
        "wget",
        "--timeout=5",
        "-i", url_file,
        "--tries=3",
        f"--warc-file={warc_file}",
        "-O", "/dev/null",
    ]
    print(f"Running command: {shlex.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)
        print(f"Successfully fetched URLs and saved to {warc_file}.warc.gz")
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError: without a shell, a missing wget binary surfaces as
        # an OS-level error instead of a non-zero exit status.
        print(f"Error fetching URLs: {e}")


def _iter_html_texts(warc_path: str, extract_text_from_html_bytes):
    """Yield the non-empty extracted text of each HTML response record in a WARC file."""
    with open(warc_path, 'rb') as f:
        for record in ArchiveIterator(f):
            if record.rec_type != 'response':
                continue
            headers = record.http_headers
            if headers is None:
                # No parsed HTTP headers on this record; nothing to inspect.
                continue
            content_type = headers.get_header('Content-Type', '') or ''
            # Media types are case-insensitive, so normalise before matching.
            if not content_type.strip().lower().startswith('text/html'):
                continue
            text = extract_text_from_html_bytes(record.content_stream().read())
            if text:
                yield text


def extract_wiki_texts_from_warc(warc_file: str, extract_text_from_html_bytes, quality_filter) -> List[tuple]:
    """
    Extract texts from WARC file and apply quality filter.

    Only ``response`` records whose ``Content-Type`` starts with ``text/html``
    are considered; records without parsed HTTP headers are skipped.

    Args:
        warc_file: Path to WARC file, without the ``.warc.gz`` suffix
        extract_text_from_html_bytes: Function to extract text from HTML bytes
        quality_filter: Function called with the extracted text; the text is
            kept only when the result is truthy

    Returns:
        List of tuples (text, label) where label is always "__label__wiki"
    """
    warc_path = f"{warc_file}.warc.gz"
    texts = [
        (text, "__label__wiki")
        for text in _iter_html_texts(warc_path, extract_text_from_html_bytes)
        if quality_filter(text)
    ]

    print(f"Extracted {len(texts)} quality text samples from WARC file")
    return texts


def extract_cc_texts_from_warc(warc_file: str, extract_text_from_html_bytes) -> List[tuple]:
    """
    Extract texts from WARC file.

    Only ``response`` records whose ``Content-Type`` starts with ``text/html``
    are considered; records without parsed HTTP headers are skipped.

    Args:
        warc_file: Path to WARC file, without the ``.warc.gz`` suffix
        extract_text_from_html_bytes: Function to extract text from HTML bytes

    Returns:
        List of tuples (text, label) where label is always "__label__cc"
    """
    warc_path = f"{warc_file}.warc.gz"
    texts = [
        (text, "__label__cc")
        for text in _iter_html_texts(warc_path, extract_text_from_html_bytes)
    ]

    print(f"Extracted {len(texts)} cc text samples from WARC file")
    return texts


def prepare_fasttext_data(texts: List[tuple], output_file: str) -> None:
    """
    Prepare data for FastText training.

    Writes one sample per line in the form ``<label> <text>``. Line breaks
    inside a text are replaced by spaces so each sample stays on one line.

    Args:
        texts: List of tuples (text, label)
        output_file: Path to output file for FastText training
    """
    # Write UTF-8 with '\n' line endings regardless of platform defaults, so
    # the file content does not depend on the machine's locale.
    with open(output_file, 'w', encoding='utf-8', newline='\n') as f:
        for text, label in texts:
            # Replace both '\r' and '\n': a stray carriage return would still
            # be read as a line break by tools using universal newlines.
            text_line = text.replace('\r', ' ').replace('\n', ' ').strip()
            f.write(f"{label} {text_line}\n")

    print(f"Prepared FastText training data with {len(texts)} samples")

In [None]:
# Define variables
input_url_file = "../data/enwiki-20240420-extracted_urls.txt"
subsampled_url_file = "../processed/subsampled_urls.txt"
wiki_warc_file = "../processed/subsampled_urls"
cc_warc_file = "../data/CC-MAIN-20180420081400-20180420101400-00118"
fasttext_data_file = "../processed/fasttext_training_data.txt"
model_file = "../models/wikipedia_quality_model.bin"
sample_size = 1000  # Number of URLs to sample
random_seed = 42  # Seed for the URL subsample so a re-run picks the same URLs

# Create output directories up front so no step fails on a missing folder
for output_path in (subsampled_url_file, fasttext_data_file, model_file):
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

# Subsample URLs
random.seed(random_seed)
subsample_urls(input_url_file, subsampled_url_file, sample_size)

# Fetch URLs using wget
fetch_urls(subsampled_url_file, warc_file=wiki_warc_file)

# Extract texts from WARC file
wiki_texts = extract_wiki_texts_from_warc(wiki_warc_file, extract_text_from_html_bytes, gopher_quality_filter)
cc_texts = extract_cc_texts_from_warc(cc_warc_file, extract_text_from_html_bytes)

# Balance the number of bad data to be in the ballpark of the good data
trimmed_cc_len = min(len(wiki_texts) * 2, len(cc_texts))
cc_texts = cc_texts[:trimmed_cc_len]
# Build a new list instead of extending wiki_texts in place: aliasing would
# silently add the cc samples to wiki_texts and inflate len(wiki_texts) if
# the balancing lines are run again.
texts = wiki_texts + cc_texts

# Prepare FastText training data
prepare_fasttext_data(texts, fasttext_data_file)

Extracted 321 quality text samples from WARC file
Extracted 47823 cc text samples from WARC file
Prepared FastText training data with 963 samples


### Train model

In [None]:
# Train FastText model
# Make sure the destination directory exists before training: a save that
# fails on a missing folder would throw away the whole training run.
os.makedirs(os.path.dirname(model_file) or ".", exist_ok=True)

model = fasttext.train_supervised(input=fasttext_data_file, epoch=500)
model.save_model(model_file)

print(f"Model saved to {model_file}")
print(f"Number of words: {len(model.words)}")
print(f"Number of labels: {len(model.labels)}")

Read 0M words
Number of words:  171596
Number of labels: 2
Progress:  99.3% words/sec/thread: 4033280 lr:  0.000662 avg.loss:  0.015998 ETA:   0h 0m 0s

Model trained and saved to ../models/wikipedia_quality_model.bin
Number of words: 171596
Number of labels: 2


Progress: 100.0% words/sec/thread: 4016661 lr:  0.000000 avg.loss:  0.015936 ETA:   0h 0m 0s


### Test model

In [None]:
# Sample input: an encyclopedia-style passage about penguins. The pasted text
# repeats its paragraphs and is missing a space at two of the joins; that is
# reproduced here so the classifier sees the same characters as before.
# It is assembled from pieces so the final string contains no newline
# (fastText's predict handles one line of text per sample).
penguin_appearance = (
    "The stocky, short-legged appearance of penguins has endeared them to people worldwide. "
    "They range from about 35 cm (14 inches) in height and approximately 1 kg (about 2 pounds) "
    "in weight in the blue, or fairy, penguin (Eudyptula minor) to 115 cm (45 inches) and "
    "25 to 40 kg (55 to 90 pounds) in the emperor penguin (Aptenodytes forsteri). "
    "Most are black on the back and white below, often with lines of black across the upper "
    "breast or spots of white on the head. "
    "Colour is rare, being limited to red or yellow irises of the eye in some species; "
    "red beaks or feet in a few; yellow brow tufts in the three species of Eudyptes; "
    "and orange and yellow on the head, neck, and breast in the emperor and king "
    "(A. patagonica) penguins."
)
penguin_populations = (
    "The total populations of some species, such as the emperor, are estimated in the hundreds "
    "of thousands, but most species of smaller penguins certainly run into the millions. "
    "Immense island breeding colonies, some teeming with hundreds of thousands of nesting pairs, "
    "represent a large potential food resource, but the economic importance of penguins is negligible. "
    "Nineteenth-century whalers and seal hunters visited some colonies for meat and eggs, "
    "and a penguin oil industry once took large numbers of the birds."
)
penguin_exploitation = (
    "By the early 20th century, however, this exploitation was no longer profitable, "
    "and most colonies were left alone or actively protected. "
    "Some species are now increasing in numbers, apparently as a result of the mid-20th century’s "
    "decimation of Antarctic whales, which compete with penguins for the krill (minute crustaceans) "
    "on which both feed. "
    "Penguin populations, however, are highly vulnerable to changes in climate and ocean temperature, "
    "including recent global warming. "
    "Penguins also are very sensitive to depletion of local fish populations by humans."
)
text = (
    penguin_appearance + penguin_appearance + " "
    + penguin_populations + " " + penguin_exploitation
    + penguin_populations + " " + penguin_exploitation
)

try:
    model = fasttext.load_model(str(model_file))
except Exception as e:
    # Chain the original exception so its traceback is not lost.
    raise RuntimeError(f"Failed to load the FastText model: {e}") from e

predictions = model.predict([text])

label = predictions[0][0][0]  # Top label for the first (only) input text
# predictions[1][0] holds the per-label probabilities for the first text; take
# the top entry so float() gets a scalar instead of a 1-element array (which
# newer NumPy versions warn about when converted).
confidence = predictions[1][0][0]

label_prefix = "__label__"
if label.startswith(label_prefix):
    label = label[len(label_prefix):]  # Remove __label__

# NOTE(review): the reported value can be marginally above 1.0 (see the
# recorded output); clamp it if a strict probability is required downstream.
print(f"{label}, {float(confidence)}")

wiki, 1.0000072717666626


  print(f"{label}, {float(confidence)}")


In [6]:
# Sample input: an incoherent bag of words, expected to look unlike Wikipedia prose.
text = "data the her rick highschool pineapple donuts recipe vacation bee movie"
try:
    model = fasttext.load_model(str(model_file))
except Exception as e:
    # Chain the original exception so its traceback is not lost.
    raise RuntimeError(f"Failed to load the FastText model: {e}") from e

predictions = model.predict([text])

label = predictions[0][0][0]  # Top label for the first (only) input text
# predictions[1][0] holds the per-label probabilities for the first text; take
# the top entry so float() gets a scalar instead of a 1-element array (which
# newer NumPy versions warn about when converted).
confidence = predictions[1][0][0]

label_prefix = "__label__"
if label.startswith(label_prefix):
    label = label[len(label_prefix):]  # Remove __label__

print(f"{label}, {float(confidence)}")

cc, 1.0000077486038208


  print(f"{label}, {float(confidence)}")
