
## Files

- `./cnn-7.csv`
- `./cnn-8.csv`
- `./foxnews-transcript-urls-2025.csv`
- `./foxnews-html/` *extracted from* `fnc_transcripts_html_2025.tar.gz.part{1-4}`

In [None]:
import pandas as pd

In [None]:
# Read both CNN transcript dumps and stack them into one frame; the
# per-file frames are never bound to names, so nothing needs deleting.
cnn = pd.concat([pd.read_csv(csv_path) for csv_path in ("cnn-7.csv", "cnn-8.csv")])
cnn.info()

In [None]:
# Half-open UTC window [start_date, end_date) applied to the transcript
# timestamps of both networks.
start_date, end_date = [
    pd.Timestamp(day, tz="UTC") for day in ("2015-01-01", "2025-03-01")
]

In [None]:
# Assemble a wall-clock string per row from the split date columns; the
# ".0f" formats render the numeric parts without decimals (2015.0 -> "2015").
cnn["timestamp"] = cnn.apply(
    lambda x: f"{x['year']:.0f}-{x['month']:02.0f}-{x['date']:02.0f} {x['time']}",
    axis=1,
)
# Unparseable strings become NaT and are dropped by the window filter below.
cnn["ts"] = pd.to_datetime(cnn["timestamp"], errors="coerce")
# Interpret the wall-clock times as US Eastern and convert to UTC.
# ambiguous=True resolves the repeated fall-back hour as DST; wall times that
# do not exist on spring-forward days are shifted forward instead of raising.
cnn["ts"] = (
    cnn["ts"]
    .dt.tz_localize("America/New_York", ambiguous=True, nonexistent="shift_forward")
    .dt.tz_convert("UTC")
)
cnn_cleaned = cnn[["ts", "subhead", "text"]].rename(columns={"subhead": "head"})
cnn_in_window = (cnn_cleaned["ts"] >= start_date) & (cnn_cleaned["ts"] < end_date)
# .copy() so the column assignments below act on an independent frame
# rather than on a filtered view (avoids SettingWithCopyWarning).
cnn_cleaned = cnn_cleaned[cnn_in_window].copy()
# Prefix every transcript with its subhead. A missing subhead contributes an
# empty prefix, so the transcript text is kept instead of turning into NaN.
cnn_head_prefix = (cnn_cleaned["head"] + ".  ").fillna("")
cnn_cleaned["text"] = cnn_head_prefix + cnn_cleaned["text"]
del cnn_cleaned["head"]
cnn_cleaned.describe()

In [None]:
# Persist the cleaned CNN transcripts; the row index is not written.
cnn_cleaned.to_csv(path_or_buf="cnn.csv", index=False)

In [None]:
# Drop both CNN frames before the Fox data is loaded.
del cnn, cnn_cleaned

In [None]:
# Load the Fox transcript index, keeping only the columns used later on.
fox = pd.read_csv("foxnews-transcript-urls-2025.csv")
fox = fox[["publicationDate", "title", "html_file"]]
# Strip ".html" so the value can serve as the join key against the
# extracted text files.
fox["html_file"] = fox["html_file"].map(lambda name: name.replace(".html", ""))
fox

In [None]:
import os
import gzip
import shutil
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# INPUT_DIR holds the gzip-compressed pages; HTML_DIR receives the
# decompressed copies.
INPUT_DIR = "./foxnews-html"
HTML_DIR = "./foxnews-html-decompressed"
# exist_ok=True makes the cell safely re-runnable without a separate
# (race-prone) existence check.
os.makedirs(HTML_DIR, exist_ok=True)

def extract_gz(filename):
    """Decompress one gzip file from INPUT_DIR into HTML_DIR.

    The output file keeps the input's name minus its last three characters
    (presumably the ".gz" suffix). Always returns True, so the caller gets
    one result per processed file.
    """
    source_path = os.path.join(INPUT_DIR, filename)
    target_path = os.path.join(HTML_DIR, filename[:-3])
    with gzip.open(source_path, 'rb') as compressed, open(target_path, 'wb') as plain:
        # Stream the content so large files are never held fully in memory.
        shutil.copyfileobj(compressed, plain)
    return True

# Take only the entries directly inside INPUT_DIR: extract_gz joins each name
# onto INPUT_DIR itself, so names from subdirectories could not be opened.
# Reading just the first os.walk() result also stops a later (possibly empty)
# subdirectory from replacing the list. A missing directory yields no files.
_gz_top, _gz_subdirs, _gz_names = next(os.walk(INPUT_DIR), (INPUT_DIR, [], []))
# Keep ".gz" files only; the decompression step strips exactly that suffix.
gz_files = [name for name in _gz_names if name.endswith(".gz")]

# Decompress in parallel with one worker per CPU. The results carry no
# information; the iterator is drained only to advance the progress bar.
with Pool(processes=cpu_count()) as pool:
    gz_progress = tqdm(
        pool.imap_unordered(extract_gz, gz_files),
        desc="Decompressing .gz files",
        total=len(gz_files),
    )
    for _ in gz_progress:
        pass

In [None]:
from bs4 import BeautifulSoup
import json

# Destination for the plain-text transcripts extracted from the HTML pages.
TEXT_DIR = "./foxnews-text"
# exist_ok=True makes the cell safely re-runnable without a separate
# (race-prone) existence check.
os.makedirs(TEXT_DIR, exist_ok=True)

def extract_fox_transcript_from_html(html: str | bytes) -> str | None:
    """Return the transcript text embedded in a page's JSON-LD metadata.

    Scans every ``<script type="application/ld+json">`` element for a
    ``NewsArticle`` object that carries a string ``articleBody`` and returns
    the first one found, with every run of whitespace collapsed to a single
    space.

    Args:
        html: Page markup as text or raw bytes; passed to BeautifulSoup as-is.

    Returns:
        The normalized article body, or ``None`` when no script element
        yields one.
    """
    soup = BeautifulSoup(html, "lxml")

    for script in soup.find_all("script", type="application/ld+json"):
        payload = script.string
        if not payload:
            continue

        try:
            data = json.loads(payload)
        except json.JSONDecodeError:
            # One malformed metadata block must not hide a valid later one.
            continue

        # The payload may be a single object or a list of objects.
        if isinstance(data, dict):
            candidates = [data]
        elif isinstance(data, list):
            candidates = data
        else:
            candidates = []

        for node in candidates:
            if not isinstance(node, dict):
                continue
            # JSON-LD allows "@type" to be one string or a list of strings.
            node_type = node.get("@type")
            node_types = node_type if isinstance(node_type, list) else [node_type]
            if "NewsArticle" not in node_types:
                continue
            body = node.get("articleBody")
            # Skip null / non-text bodies instead of failing on .split().
            if isinstance(body, str):
                return " ".join(body.split())

    return None

def extract_text(filename):
    """Extract the transcript of one decompressed page and save it as text.

    Reads HTML_DIR/<filename> as bytes and extracts the article body. Only
    when a non-empty body was found, it is written to TEXT_DIR under the same
    name with ".html" replaced by ".txt". Always returns True.
    """
    source_path = os.path.join(HTML_DIR, filename)
    with open(source_path, "rb") as page:
        raw_html = page.read()

    transcript = extract_fox_transcript_from_html(raw_html)
    if not transcript:
        # Nothing extractable: no output file is created for this page.
        return True

    target_path = os.path.join(TEXT_DIR, filename.replace(".html", ".txt"))
    # NOTE(review): written with the platform's default text encoding.
    with open(target_path, "w") as out:
        out.write(transcript)
    return True

# Take only the entries directly inside HTML_DIR: extract_text joins each name
# onto HTML_DIR itself, so names from subdirectories could not be opened.
# Reading just the first os.walk() result also stops a later (possibly empty)
# subdirectory from replacing the list. A missing directory yields no files.
_html_top, _html_subdirs, html_files = next(os.walk(HTML_DIR), (HTML_DIR, [], []))

# Extract transcripts in parallel with one worker per CPU. The results carry
# no information; the iterator is drained only to advance the progress bar.
with Pool(processes=cpu_count()) as pool:
    html_progress = tqdm(
        pool.imap_unordered(extract_text, html_files),
        desc="Extracting articleBody from .html files",
        total=len(html_files),
    )
    for _ in html_progress:
        pass

In [None]:
# Load every extracted transcript into a frame keyed by the page name.
# Only files directly inside the text directory are read; a missing
# directory yields no rows.
fox_text_dir = "./foxnews-text"
_txt_top, _txt_subdirs, fox_text_files = next(
    os.walk(fox_text_dir), (fox_text_dir, [], [])
)
fox_text_rows = []
for txt_name in fox_text_files:
    with open(os.path.join(fox_text_dir, txt_name), "r") as txt_file:
        fox_text_rows.append({
            "text": txt_file.read(),
            # Strip ".txt" to recover the key shared with the URL index.
            "html_file": txt_name.replace(".txt", ""),
        })
# Explicit columns keep "html_file" present even when no files were found,
# so a later merge on that key does not fail on an empty frame.
fox_text = pd.DataFrame(fox_text_rows, columns=["text", "html_file"])
fox_text

In [None]:
# Attach each transcript to its index row; the inner join drops index rows
# without an extracted text and texts without an index row.
fox_cleaned = fox.merge(fox_text, on="html_file", how="inner")
fox_cleaned

In [None]:
# utc=True yields one tz-aware UTC column even when the source strings carry
# differing UTC offsets or none at all, so the values are comparable with the
# tz-aware window bounds.
fox_cleaned["publicationDate"] = pd.to_datetime(
    fox_cleaned["publicationDate"], utc=True
)
# Keep rows inside the half-open window [start_date, end_date).
fox_in_window = (
    (fox_cleaned["publicationDate"] >= start_date)
    & (fox_cleaned["publicationDate"] < end_date)
)
fox_cleaned = fox_cleaned[fox_in_window]
# Reduce to the output columns, naming the timestamp column "ts".
fox_cleaned = fox_cleaned[["publicationDate", "text"]].rename(
    columns={"publicationDate": "ts"}
)
fox_cleaned

In [None]:
# Persist the cleaned Fox transcripts; the row index is not written.
fox_cleaned.to_csv(path_or_buf="fox.csv", index=False)

## Expected cleansed output

- `cnn.csv`
- `fox.csv`
