
## Files

- `./cnn-7.csv`
- `./cnn-8.csv`
- `./foxnews-transcript-urls-2025.csv`
- `./foxnews-html/` *extracted from* `fnc_transcripts_html_2025.tar.gz.part{1-4}`

In [1]:
import pandas as pd

In [2]:
cnn_7 = pd.read_csv("cnn-7.csv")
cnn_8 = pd.read_csv("cnn-8.csv")
cnn = pd.concat([cnn_7, cnn_8])
del cnn_7
del cnn_8
cnn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146020 entries, 0 to 43561
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   url           146020 non-null  object 
 1   channel.name  145202 non-null  object 
 2   program.name  145202 non-null  object 
 3   uid           43562 non-null   float64
 4   duration      0 non-null       float64
 5   year          145139 non-null  float64
 6   month         145139 non-null  float64
 7   date          145139 non-null  float64
 8   time          145139 non-null  object 
 9   timezone      145139 non-null  object 
 10  path          43562 non-null   object 
 11  wordcount     43562 non-null   float64
 12  subhead       145184 non-null  object 
 13  text          145126 non-null  object 
dtypes: float64(6), object(8)
memory usage: 16.7+ MB


In [3]:
start_date=pd.Timestamp("2015-01-01", tz="UTC")
end_date=pd.Timestamp("2025-03-01", tz="UTC")

In [4]:
cnn["timestamp"] = cnn.apply(lambda x: f"{x['year']:.0f}-{x['month']:02.0f}-{x['date']:02.0f} {x['time']}", axis=1)
cnn["ts"]=pd.to_datetime(cnn["timestamp"], errors='coerce')
cnn["ts"] = cnn["ts"].dt.tz_localize("America/New_York", ambiguous=True).dt.tz_convert("UTC")
cnn_cleaned=cnn[["ts", "subhead", "text"]].rename(columns={"ts":"date", "subhead":"head"})
cnn_cleaned=cnn_cleaned[(cnn_cleaned["date"]>=start_date) & (cnn_cleaned["date"]<end_date)]
cnn_cleaned["text"]=cnn_cleaned["head"]+".  "+cnn_cleaned["text"]
del cnn_cleaned["head"]
cnn_cleaned.describe()

Unnamed: 0,date,text
count,138148,138068
unique,,132382
top,,Did Not Air 4-5a ET. Did Not Air 4-5a ET
freq,,362
mean,2020-03-15 08:34:18.027622400+00:00,
min,2015-01-01 00:00:00+00:00,
25%,2017-09-10 00:45:00+00:00,
50%,2020-04-14 04:30:00+00:00,
75%,2022-09-14 09:00:00+00:00,
max,2025-02-28 23:00:00+00:00,


In [5]:
cnn_cleaned.to_csv("cnn.csv", index=False)

In [6]:
del cnn
del cnn_cleaned

In [7]:
fox=pd.read_csv("foxnews-transcript-urls-2025.csv")[["publicationDate", "title", "html_file"]]
fox["html_file"]=fox["html_file"].apply(lambda x: x.replace(".html", ""))
fox

Unnamed: 0,publicationDate,title,html_file
0,2024-10-27 18:14:43+00:00,"'Fox News Sunday' on October 20, 2024",fox-news-sunday-october-20-2024
1,2024-10-07 13:01:55+00:00,"'Fox News Sunday' on September 15, 2024",fox-news-sunday-september-15-2024
2,2024-09-10 22:04:22+00:00,"'Fox News Sunday' on September 8, 2024",fox-news-sunday-september-8-2024
3,2024-08-25 17:40:15+00:00,"'Fox News Sunday' on August 25, 2024",fox-news-sunday-august-25-2024
4,2024-07-21 17:53:48+00:00,"'Fox News Sunday' on July 21, 2024",fox-news-sunday-july-21-2024
...,...,...,...
87613,2016-12-21 13:23:21+00:00,The transition from President Obama to Preside...,the-transition-from-president-obama-to-preside...
87614,2016-12-22 13:41:48+00:00,Confronting terror around the world,confronting-terror-around-the-world
87615,2016-12-23 13:34:30+00:00,Europe under siege from Islamic terror,europe-under-siege-from-islamic-terror
87616,2016-12-29 13:49:26+00:00,Huckabee blasts Kerry's speech: 'Betrayal' to ...,huckabee-blasts-kerrys-speech-betrayal-to-israel


In [8]:
import os
import gzip
import shutil
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

INPUT_DIR = "./foxnews-html"
HTML_DIR = "./foxnews-html-decompressed"
if not os.path.exists(HTML_DIR):
    os.makedirs(HTML_DIR)

def extract_gz(filename):
    with gzip.open(os.path.join(INPUT_DIR, filename), 'rb') as f_in:
        with open(os.path.join(HTML_DIR, filename[:-3]), 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    return True

gz_files=[]
for root, dirs, files in os.walk(INPUT_DIR):
    gz_files = files

with Pool(cpu_count()) as pool:
    for _ in tqdm(
        pool.imap_unordered(extract_gz, gz_files),
        total=len(gz_files),
        desc="Decompressing .gz files"
    ):
        pass

Decompressing .gz files: 100%|██████████| 87403/87403 [00:09<00:00, 9023.16it/s]


In [9]:
from bs4 import BeautifulSoup
import json

TEXT_DIR = "./foxnews-text"
if not os.path.exists(TEXT_DIR):
    os.makedirs(TEXT_DIR)

def extract_fox_transcript_from_html(html: str) -> str | None:
    soup = BeautifulSoup(html, "lxml")

    for script in soup.find_all("script", type="application/ld+json"):
        if not script.string:
            continue

        try:
            data = json.loads(script.string)
        except json.JSONDecodeError:
            continue

        candidates = []
        if isinstance(data, dict):
            candidates = [data]
        elif isinstance(data, list):
            candidates = data

        for node in candidates:
            if not isinstance(node, dict):
                continue
            if node.get("@type") == "NewsArticle" and "articleBody" in node:
                text = node["articleBody"]
                return " ".join(text.split())

    return None

def extract_text(filename):
    with open(os.path.join(HTML_DIR, filename), "rb") as f:
        html = f.read()
    text=extract_fox_transcript_from_html(html)
    if text:
        with open(os.path.join(TEXT_DIR, filename.replace(".html", ".txt")), "w") as f:
            f.write(text)
    return True

html_files=[]
for root, dirs, files in os.walk(HTML_DIR):
    html_files = files

with Pool(cpu_count()) as pool:
    for _ in tqdm(
        pool.imap_unordered(extract_text, html_files),
        total=len(html_files),
        desc="Extracting articleBody from .html files"
    ):
        pass

Extracting articleBody from .html files: 100%|██████████| 87403/87403 [02:06<00:00, 688.95it/s]


In [10]:
fox_text=[]
for _, __, files in os.walk("./foxnews-text"):
    for file in files:
        with open("./foxnews-text/"+file, "r") as f:
            fox_text.append({
                "text": f.read(),
                "html_file": file.replace(".txt", "")
            })
fox_text = pd.DataFrame(fox_text)
fox_text

Unnamed: 0,text,html_file
0,Federal agents are dumping thousands of illega...,tucker-carlson-our-leaders-will-never-defend-o...
1,"""Duck Dynasty"" star Phil Robertson took some c...",duck-dynasty-star-phil-robertson-on-aiding-tru...
2,"This is a rush transcript from ""Special Report...",special-report-all-star-panel-senate-push-new-...
3,The 40-year-old Houston policeman was buried t...,saying-good-bye-to-officer-rodney-johnson
4,If the IRS won't cut employees for hassling ta...,cavuto-time-for-irs-to-roll-in-a-little-less-cash
...,...,...
42399,"Surfside, Florida Mayor Charles Burkett update...",surfside-florida-mayor-building-collapse-third...
42400,"This is a rush transcript from ""On the Record,...",full-speed-ahead-on-obamas-new-auto-rules
42401,"This is a rush transcript of ""Special Report w...",special-report-all-star-panel-senates-effort-a...
42402,The media continue&nbsp;to win the brain-dead ...,gutfeld-media-falling-dog-meme


In [11]:
fox_cleaned = pd.merge(fox, fox_text, how="inner", on="html_file")
fox_cleaned

Unnamed: 0,publicationDate,title,html_file,text
0,2024-10-27 18:14:43+00:00,"'Fox News Sunday' on October 20, 2024",fox-news-sunday-october-20-2024,This is a rush transcript of ‘Fox News Sunday’...
1,2024-10-07 13:01:55+00:00,"'Fox News Sunday' on September 15, 2024",fox-news-sunday-september-15-2024,This is a rush transcript of 'Fox News Sunday'...
2,2024-09-10 22:04:22+00:00,"'Fox News Sunday' on September 8, 2024",fox-news-sunday-september-8-2024,This is a rush transcript of ‘Fox News Sunday’...
3,2024-08-25 17:40:15+00:00,"'Fox News Sunday' on August 25, 2024",fox-news-sunday-august-25-2024,This is a rush transcript of ‘Fox News Sunday’...
4,2024-07-21 17:53:48+00:00,"'Fox News Sunday' on July 21, 2024",fox-news-sunday-july-21-2024,"This is a rush transcript of ""Fox News Sunday""..."
...,...,...,...,...
42402,2016-12-21 13:23:21+00:00,The transition from President Obama to Preside...,the-transition-from-president-obama-to-preside...,"This is a RUSH transcript from ""The O'Reilly F..."
42403,2016-12-22 13:41:48+00:00,Confronting terror around the world,confronting-terror-around-the-world,"This is a RUSH transcript from ""The O'Reilly F..."
42404,2016-12-23 13:34:30+00:00,Europe under siege from Islamic terror,europe-under-siege-from-islamic-terror,"This is a RUSH transcript from ""The O'Reilly F..."
42405,2016-12-29 13:49:26+00:00,Huckabee blasts Kerry's speech: 'Betrayal' to ...,huckabee-blasts-kerrys-speech-betrayal-to-israel,"This is a RUSH transcript from ""The O'Reilly F..."


In [12]:
fox_cleaned["publicationDate"]=pd.to_datetime(fox_cleaned["publicationDate"])
fox_cleaned=fox_cleaned[(fox_cleaned["publicationDate"]>=start_date) & (fox_cleaned["publicationDate"]<end_date)]
fox_cleaned=fox_cleaned[["publicationDate", "text"]].rename(columns={"publicationDate": "ts"})
fox_cleaned

Unnamed: 0,ts,text
0,2024-10-27 18:14:43+00:00,This is a rush transcript of ‘Fox News Sunday’...
1,2024-10-07 13:01:55+00:00,This is a rush transcript of 'Fox News Sunday'...
2,2024-09-10 22:04:22+00:00,This is a rush transcript of ‘Fox News Sunday’...
3,2024-08-25 17:40:15+00:00,This is a rush transcript of ‘Fox News Sunday’...
4,2024-07-21 17:53:48+00:00,"This is a rush transcript of ""Fox News Sunday""..."
...,...,...
42402,2016-12-21 13:23:21+00:00,"This is a RUSH transcript from ""The O'Reilly F..."
42403,2016-12-22 13:41:48+00:00,"This is a RUSH transcript from ""The O'Reilly F..."
42404,2016-12-23 13:34:30+00:00,"This is a RUSH transcript from ""The O'Reilly F..."
42405,2016-12-29 13:49:26+00:00,"This is a RUSH transcript from ""The O'Reilly F..."


In [13]:
fox_cleaned.to_csv("fox.csv", index=False)

## Expected cleansed output

- `cnn.csv`
- `fox.csv`
