# ECE405 Assignment 2 — Data Filtering for Language Modeling

Based on CS336 Assignment 4 (Stanford, Spring 2025)

## 2.1 Looking at the data

### Setup: Download sample WARC and WET files

In [None]:
import os, shutil

# --- Configuration ---
REPO_URL = "https://github.com/bushuyeu/LLM-from-scratch.git"
REPO_DIR = "/content/LLM-from-scratch"
ASSIGNMENT_DIR = os.path.join(REPO_DIR, "ece405_assignment2")
BRANCH = "main"

# Clone / update the repo on the Colab kernel
if os.path.exists(os.path.join(REPO_DIR, ".git")):
    !git -C {REPO_DIR} pull
else:
    # Clean up any leftover non-git directory
    if os.path.exists(REPO_DIR):
        shutil.rmtree(REPO_DIR)
    !git clone -b {BRANCH} {REPO_URL} {REPO_DIR}

# Install dependencies:
# 1. Common pip packages needed by the notebook and cs336-data
!pip install -q warcio resiliparse fasttext tldextract xopen "numpy<2.0" tqdm nltk mmh3

# 2. cs336-basics (local sub-package required by cs336-data)
!pip install -e {ASSIGNMENT_DIR}/cs336-basics

# 3. cs336-data itself (--no-deps because pip can't resolve the uv-specific
#    local-path source for cs336-basics; we already installed it above)
!pip install --no-deps -e {ASSIGNMENT_DIR}

# Verify install
!pip show cs336-data

# Set up data directory
DATA_DIR = os.path.join(ASSIGNMENT_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)

In [2]:
# Remote locations of one Common Crawl segment in both formats.
WARC_URL = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-18/segments/1744889135610.12/warc/CC-MAIN-20250417135010-20250417165010-00065.warc.gz"
WET_URL = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-18/segments/1744889135610.12/wet/CC-MAIN-20250417135010-20250417165010-00065.warc.wet.gz"

# Local copies live side by side inside DATA_DIR.
WARC_PATH, WET_PATH = (
    os.path.join(DATA_DIR, local_name)
    for local_name in ("example.warc.gz", "example.warc.wet.gz")
)

In [3]:
# Download WARC file (approx 1GB compressed)
if not os.path.exists(WARC_PATH):
    !wget -O {WARC_PATH} "{WARC_URL}"
else:
    print(f"WARC file already exists at {WARC_PATH}")

WARC file already exists at /content/LLM-from-scratch/ece405_assignment2/data/example.warc.gz


In [4]:
# Download WET file
if not os.path.exists(WET_PATH):
    !wget -O {WET_PATH} "{WET_URL}"
else:
    print(f"WET file already exists at {WET_PATH}")

WET file already exists at /content/LLM-from-scratch/ece405_assignment2/data/example.warc.wet.gz


### (a) Examine the first page in the WARC file

In [5]:
from warcio.archiveiterator import ArchiveIterator

# Walk the WARC file, printing each record's type until the first HTTP
# response is reached, then show that response's metadata and a preview of
# its body.
with open(WARC_PATH, "rb") as f:
    for i, record in enumerate(ArchiveIterator(f)):
        print(f"Record {i}: type={record.rec_type}")
        if record.rec_type == "response":
            print(f"  URL: {record.rec_headers.get_header('WARC-Target-URI')}")
            print(f"  Date: {record.rec_headers.get_header('WARC-Date')}")
            # http_headers may be missing on a record; fall back to 'N/A'.
            print(f"  Content-Type: {record.http_headers.get_header('Content-Type') if record.http_headers else 'N/A'}")
            print()
            # Decode before slicing so the preview really is 3000 characters
            # (as labelled) rather than 3000 bytes; slicing the raw bytes could
            # also cut a multi-byte UTF-8 sequence in half.
            content = record.content_stream().read().decode('utf-8', errors='replace')
            print("--- First 3000 chars of raw content ---")
            print(content[:3000])
            break

Record 0: type=warcinfo
Record 1: type=request
Record 2: type=response
  URL: http://0371rykj.com/ipfhsb/34.html
  Date: 2025-04-17T14:56:33Z
  Content-Type: text/html

--- First 3000 chars of raw content ---
<!DOCTYPE html>
<html>
 <head> 
   <meta charset="UTF-8">
  
  <title>&#x4EBA;&#x59BB;&#x2C;&#x56FD;&#x5185;&#x8001;&#x719F;&#x5987;&#x5BF9;&#x767D;&#x48;&#x44;&#x58;&#x58;&#x58;&#x58;&#x2C;&#x4E9A;&#x6D32;&#x41;&#x56;&#x65E0;&#x7801;&#x4E00;&#x533A;&#x4E1C;&#x4EAC;&#x70ED;&#x4E45;&#x4E45;&#x0D;</title>
  <meta name="keywords" content="&#x4EBA;&#x59BB;&#x2C;&#x56FD;&#x5185;&#x8001;&#x719F;&#x5987;&#x5BF9;&#x767D;&#x48;&#x44;&#x58;&#x58;&#x58;&#x58;&#x2C;&#x4E9A;&#x6D32;&#x41;&#x56;&#x65E0;&#x7801;&#x4E00;&#x533A;&#x4E1C;&#x4EAC;&#x70ED;&#x4E45;&#x4E45;&#x0D;" />
  <meta name="description" content="&#x4EBA;&#x59BB;&#x2C;&#x56FD;&#x5185;&#x8001;&#x719F;&#x5987;&#x5BF9;&#x767D;&#x48;&#x44;&#x58;&#x58;&#x58;&#x58;&#x2C;&#x4E9A;&#x6D32;&#x41;&#x56;&#x65E0;&#x7801;&#x4E00;&#x533A;&#x4E1

### (b) Examine the corresponding WET file

In [6]:
# Print the URL, length and a 3000-character preview of the first extracted
# page found in the WET file.
with open(WET_PATH, "rb") as wet_file:
    for wet_record in ArchiveIterator(wet_file):
        # Only "conversion" records are inspected here; skip everything else.
        if wet_record.rec_type != "conversion":
            continue
        page_url = wet_record.rec_headers.get_header('WARC-Target-URI')
        page_text = wet_record.content_stream().read().decode('utf-8', errors='replace')
        print(
            f"URL: {page_url}",
            f"Content length: {len(page_text)} chars",
            "",
            "--- Extracted text (first 3000 chars) ---",
            page_text[:3000],
            sep="\n",
        )
        break

URL: http://0371rykj.com/ipfhsb/34.html
Content length: 3496 chars

--- Extracted text (first 3000 chars) ---
人妻,国内老熟妇对白HDXXXX,亚洲AV无码一区东京热久久
久久久久女人精品毛片,99久久精品无码一区二区毛片,被老外的又粗又大日出了水,一边吃奶一边哭乱抻又乱扭
恒溫恒濕試驗(yàn)箱
在線(xiàn)咨詢(xún)
上海林頻儀器股份有限公司Shanghai Linpin Instrument Stock Co Ltd
服務(wù)熱線(xiàn)：4000 662 888
手機(jī)咨詢(xún)：13818467052
首頁(yè)
林頻產(chǎn)品
試驗(yàn)箱系列
老化箱系列
非標(biāo)定制系列
ip防護(hù)系列
振動(dòng)跌落系列
成功案例
新聞中心
林頻新聞
行業(yè)新聞
常見(jiàn)問(wèn)題
解決方案
關(guān)于林頻
服務(wù)支持
聯(lián)系我們
您所在的位置：
恒溫恒濕試驗(yàn)箱 > 林頻產(chǎn)品 > ip防護(hù)系列 >
產(chǎn)品詳情/ products details
恒溫恒濕試驗(yàn)箱
產(chǎn)品用途
恒溫恒濕試驗(yàn)箱是航空、汽車(chē)、家電、科研等領(lǐng)域必備的測(cè)試設(shè)備，用于測(cè)試和確定電工、電子及其他產(chǎn)品及材料進(jìn)行高溫、低溫、濕熱度或恒定試驗(yàn)的溫度環(huán)境變化后的參數(shù)及性能。...
了解詳情 立即咨詢(xún)
產(chǎn)品參數(shù) / parameter
設(shè)備型號(hào) 工作室尺寸(D*W*H)mm 外型尺寸(D*W*H)mm
LRHS-101-LH 450×450×500 1160×1000×1610
LRHS-225-LH 500×600×750 1210×1150×1870
LRHS-504-LH 700×800×900 1260×1340×2070
LRHS-800-LH 800×1000×1000 1370×1550×2170
LRHS-1000-LH 1000×1000×1000 1560×155

### (d) Annotate 25 WET records

In [7]:
# Dump URL, length and the first 500 characters of the first 25 conversion
# records so they can be annotated by hand.
with open(WET_PATH, "rb") as wet_file:
    # Lazily filter the archive down to conversion records; each record's body
    # is read before the generator advances to the next one.
    conversions = (
        rec for rec in ArchiveIterator(wet_file) if rec.rec_type == "conversion"
    )
    for record_number, rec in enumerate(conversions, start=1):
        page_url = rec.rec_headers.get_header('WARC-Target-URI')
        page_text = rec.content_stream().read().decode('utf-8', errors='replace')
        banner = '=' * 80
        print(f"\n{banner}")
        print(f"Record {record_number}")
        print(f"URL: {page_url}")
        print(f"Content length: {len(page_text)} chars")
        print("First 500 chars:")
        print(page_text[:500])
        print(banner)
        if record_number >= 25:
            break


Record 1
URL: http://0371rykj.com/ipfhsb/34.html
Content length: 3496 chars
First 500 chars:
人妻,国内老熟妇对白HDXXXX,亚洲AV无码一区东京热久久
久久久久女人精品毛片,99久久精品无码一区二区毛片,被老外的又粗又大日出了水,一边吃奶一边哭乱抻又乱扭
恒溫恒濕試驗(yàn)箱
在線(xiàn)咨詢(xún)
上海林頻儀器股份有限公司Shanghai Linpin Instrument Stock Co Ltd
服務(wù)熱線(xiàn)：4000 662 888
手機(jī)咨詢(xún)：13818467052
首頁(yè)
林頻產(chǎn)品
試驗(yàn)箱系列
老化箱系列
非標(biāo)定制系列
ip防護(hù)系列
振動(dòng)跌落系列
成功案例
新聞中心
林頻新聞
行業(yè)新聞
常見(jiàn)問(wèn)題
解決方案
關(guān)于林頻
服務(wù)支持
聯(lián)系我們
您所在的位置：
恒溫恒濕試驗(yàn)箱 > 林頻產(chǎn)品 > ip防護(hù)系列 >
產(chǎn)品詳情/ products details
恒溫恒濕試驗(yàn)箱
產(chǎn)品用途
恒溫恒濕試驗(yàn)箱是航空、汽車(chē)、家電、科研等領(

Record 2
URL: http://10www.chinatikfans.com/home.php?mod=space&uid=4693&do=blog&classid=104&view=me
Content length: 2066 chars
First 500 chars:
lily_zl的日志 - &#1769;杰西达邦中国影迷会&#1769; - Powered by Discuz!
设为首页收藏本站
开启辅助访问 切换到窄版
帐号 自动登录 找回密码
密码
登录
加入我们
只需一步，快速开始
快捷导航
广场BBS
家园Space
每日签到
排行榜Ranklist
手机论坛
搜索
搜索
用户
&#1769;杰西达邦中国影迷会&#1769; › 日志
际遇
发布 日志
上传 相册
添加 分享
记录
日志
好友的日志
我的日志
随便看看
发表新日志
discussion on plot

## 2.2 HTML to text conversion

### (a) Extract text from HTML bytes using Resiliparse

In [8]:
from cs336_data.extract import extract_text_from_html_bytes

### (b) Compare Resiliparse extraction vs WET extraction on the first page

In [9]:
# Extract text from the first WARC response using our function, and compare it
# with the WET extraction of the *same* page (matched by WARC-Target-URI).
from warcio.archiveiterator import ArchiveIterator

# Get Resiliparse extraction from WARC
resiliparse_text = None
warc_url = None
with open(WARC_PATH, "rb") as f:
    for record in ArchiveIterator(f):
        if record.rec_type == "response":
            html_bytes = record.content_stream().read()
            resiliparse_text = extract_text_from_html_bytes(html_bytes)
            warc_url = record.rec_headers.get_header('WARC-Target-URI')
            break

# Get WET extraction for that same URL. Taking simply the first conversion
# record would silently compare two different pages whenever the first WARC
# response has no counterpart at the start of the WET file.
wet_text = None
with open(WET_PATH, "rb") as f:
    for record in ArchiveIterator(f):
        if (record.rec_type == "conversion"
                and record.rec_headers.get_header('WARC-Target-URI') == warc_url):
            wet_text = record.content_stream().read().decode('utf-8', errors='replace')
            break

# Fail with a clear message instead of a NameError / TypeError further down.
if resiliparse_text is None:
    raise RuntimeError(f"No extractable response record found in {WARC_PATH}")
if wet_text is None:
    raise RuntimeError(f"No WET conversion record found for {warc_url} in {WET_PATH}")

print(f"URL: {warc_url}")
print(f"Resiliparse output length: {len(resiliparse_text)} chars")
print(f"WET output length:         {len(wet_text)} chars")
print()
print("=" * 80)
print("RESILIPARSE EXTRACTION (first 2000 chars)")
print("=" * 80)
print(resiliparse_text[:2000])
print()
print("=" * 80)
print("WET EXTRACTION (first 2000 chars)")
print("=" * 80)
print(wet_text[:2000])

URL: http://0371rykj.com/ipfhsb/34.html
Resiliparse output length: 10165 chars
WET output length:         3496 chars

RESILIPARSE EXTRACTION (first 2000 chars)
久久久久女人精品毛片,99久久精品无码一区二区毛片,被老外的又粗又大日出了水,一边吃奶一边哭乱抻又乱扭

        • <th id="gckmo"></th>
        • <ul id="gckmo"><center id="gckmo"></center></ul>
      •  
  
 
 
 
         
   
         
    
         
    
        恒溫恒濕試驗(yàn)箱
        
	在線(xiàn)咨詢(xún)
    
         
    
         
      淋雨試驗(yàn)箱 
     
        上海林頻儀器股份有限公司Shanghai Linpin Instrument Stock Co Ltd
         
     
        服務(wù)熱線(xiàn)：4000 662 888 
         手機(jī)咨詢(xún)：13818467052
        
    
         
    
         
     
           
	
      
        • 首頁(yè)
          •  
	
	
        • 
	  林頻產(chǎn)品

	  
            
		
          • 試驗(yàn)箱系列
          • 老化箱系列
          • 非標(biāo)定制系列
          • ip防護(hù)系列
          • 振動(dòng)跌落系列
            •  
	  
          

	  
        • 
	  成功案例

	  
            
		 
	  
          

	  
        • 
	  新聞中心

	  
   

## 2.3 Language identification

### Setup: Download fastText language ID model

In [10]:
LID_MODEL_PATH = os.path.join(DATA_DIR, "lid.176.bin")
if not os.path.exists(LID_MODEL_PATH):
    !wget -O {LID_MODEL_PATH} "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
else:
    print(f"Language ID model already exists at {LID_MODEL_PATH}")

Language ID model already exists at /content/LLM-from-scratch/ece405_assignment2/data/lid.176.bin


### (a) Language identification function

In [11]:
from cs336_data.language_identification import identify_language, set_lid_model_path

# Tell the identifier which model file to use before the first prediction.
set_lid_model_path(LID_MODEL_PATH)

# Quick sanity checks: one short sentence each in English, Chinese and French.
for probe_sentence in (
    "Hello, this is a test in English.",
    "欢迎来到我们的网站",
    "Bonjour, comment allez-vous?",
):
    print(identify_language(probe_sentence))

('en', 0.9398366808891296)
('zh', 1.000056266784668)
('fr', 0.9740094542503357)


### (c) Run language ID on 20 random WET records, compare to manual judgment

In [12]:
import random
from warcio.archiveiterator import ArchiveIterator

# Collect the first 200 WET conversion records that have more than 50
# non-whitespace-trimmed characters, then sample 20 of them.
records = []
with open(WET_PATH, "rb") as f:
    for record in ArchiveIterator(f):
        if record.rec_type == "conversion":
            url = record.rec_headers.get_header('WARC-Target-URI')
            content = record.content_stream().read().decode('utf-8', errors='replace')
            if len(content.strip()) > 50:  # skip near-empty records
                records.append((url, content))
            if len(records) >= 200:
                break

# Fixed seed so the same records are drawn on every run.
random.seed(42)
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds (e.g. with a short or truncated WET file).
sample = random.sample(records, min(20, len(records)))

print(f"Sampled {len(sample)} records from {len(records)} total\n")

for i, (url, content) in enumerate(sample):
    lang, score = identify_language(content)
    print(f"Record {i+1:2d} | lang={lang:5s} score={score:.4f} | {url}")
    # chr(10) is "\n": flatten newlines so the preview stays on one line.
    print(f"           First 120 chars: {content[:120].replace(chr(10), ' ')}")
    print()

Sampled 20 records from 200 total

Record  1 | lang=id    score=0.5478 | http://bagsguccistoer.com/rise%20of%20apollo%20pg
           First 120 chars: แนะนำ 10 rise of apollo pg ไม่ผ่านเอเย่นต์ งบน้อยเล่นได้ ฝากถอนไม่มีขั้นต่ำ INTERNAL FEEDBACK rise of apollo pg สล็อตเว็

Record  2 | lang=zh    score=0.9596 | http://50899.cn/news/16755.html
           First 120 chars: A级毛片无码真人久久久,欧美性饥渴少妇BBB.BBB片 午夜亚洲影院在线观看,日韩欧美精品视频一区二区三区,久免费视频,黄网人妻视频,99视频在线观看精品,午夜A级性爱 BJFQY 經(jīng)營品牌 風(fēng)富圖 佳能 大疆 寶麗

Record  3 | lang=zh    score=0.9951 | http://18sex.v340.info/?&R2=&P=11&OP=&CHANNEL=
           First 120 chars: 本土自拍性愛影片 回 首 頁 │ 點數購買 │ 付款方式 │ 加入會員 │會員登入 │主持人登入 │ 線上客服 │使用說明 │ 使用條款 │ 台妹區 | 內地主播區 | 業績排行 | 會員評價 | 包廂線上人數 | 一對多點數 | 一對一點

Record  4 | lang=en    score=0.8122 | http://beyparkotel.com/
           First 120 chars: Beypark Otel Your browser does not support frames. Untitled Page 

Record  5 | lang=ru    score=0.9911 | http://agrokenya.org/2024/02/10/kak-mozhno-zarabotat-realnye-pin-up-skachat-n

## 2.4 PII masking

### (a)–(c) PII masking functions

In [None]:
from cs336_data.pii import mask_emails, mask_phone_numbers, mask_ips

# Sanity checks: run each masker on one hand-written example and print
# whatever it returns.
for masker, example in (
    (mask_emails, "Contact me at test@gmail.com or admin@example.org"),
    (mask_phone_numbers, "Call (283)-182-3829 or 2831823829"),
    (mask_ips, "Server at 192.0.2.146 and 10.0.0.1"),
):
    print(masker(example))

### (5) Run PII masking on extracted text, examine 20 random replacements

In [None]:
import random
from warcio.archiveiterator import ArchiveIterator
from cs336_data.extract import extract_text_from_html_bytes
from cs336_data.pii import mask_emails, mask_phone_numbers, mask_ips

# Extract text from WARC pages and apply PII masking.
# Scanning stops once 100 pages with at least one replacement have been
# collected, so the number of responses actually read is counted separately
# and reported below (it is generally larger than 100).
pii_results = []
n_responses = 0
with open(WARC_PATH, "rb") as f:
    for record in ArchiveIterator(f):
        if record.rec_type == "response":
            n_responses += 1
            html_bytes = record.content_stream().read()
            text = extract_text_from_html_bytes(html_bytes)
            # Skip pages with no extracted text or 100 characters or fewer.
            if text and len(text.strip()) > 100:
                # Each masker returns (masked_text, replacement_count); they are
                # chained so masked_all carries all three kinds of marker.
                masked_email, n_emails = mask_emails(text)
                masked_phone, n_phones = mask_phone_numbers(masked_email)
                masked_all, n_ips = mask_ips(masked_phone)
                total = n_emails + n_phones + n_ips
                if total > 0:
                    pii_results.append({
                        "url": record.rec_headers.get_header('WARC-Target-URI'),
                        "original": text,
                        "masked": masked_all,
                        "n_emails": n_emails,
                        "n_phones": n_phones,
                        "n_ips": n_ips,
                    })
        if len(pii_results) >= 100:
            break

# Report the real denominator instead of a guessed "~100+".
print(f"Found {len(pii_results)} pages with PII after scanning {n_responses} WARC responses\n")

# Show 20 random examples (fewer if not enough pages were found)
random.seed(42)
sample = random.sample(pii_results, min(20, len(pii_results)))

for i, r in enumerate(sample):
    print(f"{'='*80}")
    print(f"Example {i+1} | URL: {r['url']}")
    print(f"  Emails: {r['n_emails']}, Phones: {r['n_phones']}, IPs: {r['n_ips']}")
    # Show context around each replacement: up to 40 characters on either
    # side of every marker occurrence, with newlines flattened to spaces.
    for marker in ["|||EMAIL_ADDRESS|||", "|||PHONE_NUMBER|||", "|||IP_ADDRESS|||"]:
        idx = 0
        while True:
            pos = r["masked"].find(marker, idx)
            if pos == -1:
                break
            start = max(0, pos - 40)
            end = min(len(r["masked"]), pos + len(marker) + 40)
            context = r["masked"][start:end].replace("\n", " ")
            print(f"  ...{context}...")
            # Resume the search just past this occurrence.
            idx = pos + len(marker)
    print()

## 2.5 Harmful content classification

### Setup: Download Dolma fastText classifiers

In [None]:
NSFW_MODEL_PATH = os.path.join(DATA_DIR, "dolma_fasttext_nsfw_jigsaw_model.bin")
TOXIC_MODEL_PATH = os.path.join(DATA_DIR, "dolma_fasttext_hatespeech_jigsaw_model.bin")

if not os.path.exists(NSFW_MODEL_PATH):
    !wget -O {NSFW_MODEL_PATH} "https://huggingface.co/allenai/dolma-jigsaw-fasttext-bigrams-nsfw/resolve/main/model.bin"
else:
    print(f"NSFW model already exists at {NSFW_MODEL_PATH}")

if not os.path.exists(TOXIC_MODEL_PATH):
    !wget -O {TOXIC_MODEL_PATH} "https://huggingface.co/allenai/dolma-jigsaw-fasttext-bigrams-hatespeech/resolve/main/model.bin"
else:
    print(f"Toxic speech model already exists at {TOXIC_MODEL_PATH}")

### (1)–(2) NSFW and toxic speech classifiers

In [None]:
from cs336_data.harmful_content import classify_nsfw, classify_toxic_speech, set_nsfw_model_path, set_toxic_model_path

# Register both model files before running any classifier.
set_nsfw_model_path(NSFW_MODEL_PATH)
set_toxic_model_path(TOXIC_MODEL_PATH)

# Sanity checks: for each classifier, one benign and one offensive example.
nsfw_examples = (
    "This is a normal sentence about cooking.",
    "SUCK MY C*CK WIKIPEDIA EDITORS...F*CKING *SSH*LE DORKS.",
)
toxic_examples = (
    "The weather is nice today.",
    "What a rude fuck. Arrogant twat who doesn't know what he's talking about.",
)

print("NSFW classifier:")
for example in nsfw_examples:
    print(classify_nsfw(example))
print()
print("Toxic speech classifier:")
for example in toxic_examples:
    print(classify_toxic_speech(example))

### (4) Run on extracted text, compare 20 predictions to own judgment

In [None]:
import random
from warcio.archiveiterator import ArchiveIterator
from cs336_data.extract import extract_text_from_html_bytes

# Collect the first 200 WARC responses whose extracted text is longer than
# 100 characters (shorter or empty extractions are skipped, so more than 200
# responses may be read).
texts = []
with open(WARC_PATH, "rb") as f:
    for record in ArchiveIterator(f):
        if record.rec_type == "response":
            html_bytes = record.content_stream().read()
            text = extract_text_from_html_bytes(html_bytes)
            url = record.rec_headers.get_header('WARC-Target-URI')
            if text and len(text.strip()) > 100:
                texts.append((url, text))
        if len(texts) >= 200:
            break

# Fixed seed so the same pages are drawn on every run.
random.seed(42)
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample = random.sample(texts, min(20, len(texts)))

print(f"Sampled {len(sample)} pages from {len(texts)} total\n")

for i, (url, text) in enumerate(sample):
    # Each classifier returns a (label, score) pair.
    nsfw_label, nsfw_score = classify_nsfw(text)
    toxic_label, toxic_score = classify_toxic_speech(text)
    print(f"Record {i+1:2d} | nsfw={nsfw_label:8s} ({nsfw_score:.4f}) | toxic={toxic_label:9s} ({toxic_score:.4f}) | {url}")
    # chr(10) is "\n": flatten newlines so the preview stays on one line.
    print(f"           First 120 chars: {text[:120].replace(chr(10), ' ')}")
    print()