# Scrape transcript data from PBS NOVA documentaries
https://www.pbs.org/wgbh/nova/transcripts/

In [6]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [48]:
def save_topic(transcript_urls, savepath, timeout=30):
    """Scrape the transcript pages in ``transcript_urls`` into one text file.

    Each page is fetched and parsed. Inside ``#d_feature_copy`` the
    ``.caption`` elements and every ``<p>`` that follows an ``<h2>`` sibling
    (the credits) are removed; the text of each remaining ``<p>`` is written
    as one line with runs of whitespace collapsed to single spaces.

    Pages that answer with an HTTP error status are reported and skipped so
    that error pages do not end up in the corpus.

    Args:
        transcript_urls: Iterable of transcript page URLs.
        savepath: Output file path; overwritten if it already exists.
            Missing parent directories are created.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (seconds),
            so a stalled server cannot hang the scrape forever.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level (connection error, timeout, ...).
    """
    import os  # local import so this cell stays runnable on its own

    # Collect lines in a list and write once; repeated ``str +=`` is quadratic.
    lines = []
    for single_url in tqdm(transcript_urls):
        single_page = requests.get(single_url, timeout=timeout)
        if not single_page.ok:
            print(f"Skipping {single_url}: HTTP {single_page.status_code}")
            continue
        single_soup = BeautifulSoup(single_page.content, 'html.parser')

        # Remove unwanted data (note the comma: it joins two CSS selectors)
        to_remove = single_soup.select('#d_feature_copy .caption,' +  # remove caption
                                       '#d_feature_copy h2 ~ p')  # remove credits
        for item_to_remove in to_remove:
            item_to_remove.extract()

        for paragraph in single_soup.select('#d_feature_copy p'):
            # split()/join() collapses newlines, tabs and repeated spaces.
            lines.append(" ".join(paragraph.text.split()) + "\n")

    # Make sure the target directory exists before writing, otherwise the
    # whole scrape is lost to a FileNotFoundError at the very end.
    out_dir = os.path.dirname(savepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(savepath, 'w', encoding="UTF") as f:
        f.writelines(lines)

In [49]:
# One-off run that writes the anthropology transcripts file.
# NOTE(review): `transcript_urls` is not defined in any cell visible here --
# presumably left over from an earlier exploratory cell; confirm before re-running.
save_topic(transcript_urls, 'data/nova_anthropology_transcripts.txt')

100%|██████████| 49/49 [00:10<00:00,  5.06it/s]


In [53]:
def by_topic():
    """Scrape every NOVA topic index page and save one text file per topic.

    For each ``(index URL, output path)`` pair the index page is fetched, the
    links found under ``#d_feature_copy p`` are resolved to absolute URLs, and
    the resulting list is handed to ``save_topic`` together with the output
    path.

    An index page that answers with an HTTP error status is reported and
    skipped, so an existing output file is not overwritten with an empty one.

    Raises:
        requests.exceptions.RequestException: If an index request fails at
            the network level (connection error, timeout, ...).
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    pairs = [
        ('https://www.pbs.org/wgbh/nova/transcripts/int_anth.html', 'data/nova_anthropology_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_disa.html', 'data/nova_disaster_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_eart.html', 'data/nova_earth_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_expl.html', 'data/nova_exploration_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_flig.html', 'data/nova_flight_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_heal.html', 'data/nova_health_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_hist.html', 'data/nova_history_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_inve.html', 'data/nova_investigations_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_natu.html', 'data/nova_nature_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_phys.html', 'data/nova_physicsmath_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_spac.html', 'data/nova_space_transcripts.txt'),
        ('https://www.pbs.org/wgbh/nova/transcripts/int_tech.html', 'data/nova_technology_transcripts.txt'),
    ]
    for index_url, savepath in pairs:
        # A timeout keeps one stalled request from hanging the whole run.
        page = requests.get(index_url, timeout=30)
        if not page.ok:
            print(f"Skipping {index_url}: HTTP {page.status_code}")
            continue

        soup = BeautifulSoup(page.content, 'html.parser')

        # soup.find() is one interface
        # soup.select uses css selection which I'm more familiar with
        # `a[href]` skips anchors without an href (a['href'] would raise
        # KeyError); urljoin resolves root-relative, page-relative and
        # already-absolute links correctly against the index page URL.
        transcript_urls = [urljoin(index_url, a['href'])
                           for a in soup.select('#d_feature_copy p a[href]')]
        print(f"{index_url}: Total of {len(transcript_urls)} links to scrape")
        save_topic(transcript_urls, savepath)
# Run the per-topic scrape (network requests; writes one file per topic).
by_topic()

https://www.pbs.org/wgbh/nova/transcripts/int_anth.html: Total of 49 links to scrape


100%|██████████| 49/49 [00:10<00:00,  4.82it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_disa.html: Total of 24 links to scrape


100%|██████████| 24/24 [00:06<00:00,  3.25it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_eart.html: Total of 45 links to scrape


100%|██████████| 45/45 [00:12<00:00,  4.31it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_expl.html: Total of 35 links to scrape


100%|██████████| 35/35 [00:07<00:00,  4.91it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_flig.html: Total of 21 links to scrape


100%|██████████| 21/21 [00:04<00:00,  4.81it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_heal.html: Total of 60 links to scrape


100%|██████████| 60/60 [00:24<00:00,  2.71it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_hist.html: Total of 87 links to scrape


100%|██████████| 87/87 [00:22<00:00,  4.07it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_inve.html: Total of 29 links to scrape


100%|██████████| 29/29 [00:09<00:00,  3.93it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_natu.html: Total of 65 links to scrape


100%|██████████| 65/65 [00:19<00:00,  4.44it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_phys.html: Total of 42 links to scrape


100%|██████████| 42/42 [00:10<00:00,  4.35it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_spac.html: Total of 35 links to scrape


100%|██████████| 35/35 [00:09<00:00,  4.48it/s]


https://www.pbs.org/wgbh/nova/transcripts/int_tech.html: Total of 94 links to scrape


100%|██████████| 94/94 [00:27<00:00,  2.77it/s]


In [57]:
def get_transcript(transcript_urls, timeout=30):
    """Return the cleaned text of all transcript pages in ``transcript_urls``.

    Each page is fetched and parsed. Inside ``#d_feature_copy`` the
    ``.caption`` elements and every ``<p>`` that follows an ``<h2>`` sibling
    (the credits) are removed; the text of each remaining ``<p>`` becomes one
    line with runs of whitespace collapsed to single spaces.

    Pages that answer with an HTTP error status are reported and skipped so
    that error pages do not end up in the result.

    Args:
        transcript_urls: Iterable of transcript page URLs.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (seconds),
            so a stalled server cannot hang the scrape forever.

    Returns:
        A single string with one newline-terminated line per paragraph, in
        page order; the empty string if nothing was scraped.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level (connection error, timeout, ...).
    """
    # Collect lines in a list and join once; repeated ``str +=`` is quadratic.
    lines = []
    for single_url in tqdm(transcript_urls):
        single_page = requests.get(single_url, timeout=timeout)
        if not single_page.ok:
            print(f"Skipping {single_url}: HTTP {single_page.status_code}")
            continue
        single_soup = BeautifulSoup(single_page.content, 'html.parser')

        # Remove unwanted data (note the comma: it joins two CSS selectors)
        to_remove = single_soup.select('#d_feature_copy .caption,' +  # remove caption
                                       '#d_feature_copy h2 ~ p')  # remove credits
        for item_to_remove in to_remove:
            item_to_remove.extract()

        for paragraph in single_soup.select('#d_feature_copy p'):
            # split()/join() collapses newlines, tabs and repeated spaces.
            lines.append(" ".join(paragraph.text.split()) + "\n")
    return "".join(lines)

In [61]:
def by_year():
    """Scrape every NOVA per-year index page into one combined text file.

    For each year index page the links found under ``#d_feature_copy p`` are
    resolved to absolute URLs and passed to ``get_transcript``; the text
    returned for each year is followed by a newline and everything is written
    to ``data/nova_ALL_transcripts.txt``.

    An index page that answers with an HTTP error status is reported and
    skipped.

    Raises:
        requests.exceptions.RequestException: If an index request fails at
            the network level (connection error, timeout, ...).
    """
    # Local imports keep this cell runnable on its own.
    import os
    from urllib.parse import urljoin

    urls = ['https://www.pbs.org/wgbh/nova/transcripts/year_1995.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_1996.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_1997.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_1998.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_1999.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2000.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2001.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2002.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2003.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2004.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2005.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2006.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2007.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2008.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2009.html',
            'https://www.pbs.org/wgbh/nova/transcripts/year_2010.html'
           ]
    outpath = "data/nova_ALL_transcripts.txt"

    # One chunk per year, joined once at the end (avoids quadratic ``str +=``).
    chunks = []
    for index_url in urls:
        # A timeout keeps one stalled request from hanging the whole run.
        page = requests.get(index_url, timeout=30)
        if not page.ok:
            print(f"Skipping {index_url}: HTTP {page.status_code}")
            continue

        soup = BeautifulSoup(page.content, 'html.parser')

        # soup.find() is one interface
        # soup.select uses css selection which I'm more familiar with
        # `a[href]` skips anchors without an href (a['href'] would raise
        # KeyError); urljoin resolves root-relative, page-relative and
        # already-absolute links correctly against the index page URL.
        transcript_urls = [urljoin(index_url, a['href'])
                           for a in soup.select('#d_feature_copy p a[href]')]
        print(f"{index_url}: Total of {len(transcript_urls)} links to scrape")
        chunks.append(get_transcript(transcript_urls) + "\n")

    # Create the output directory up front so the scraped text is not lost
    # to a FileNotFoundError at write time.
    os.makedirs(os.path.dirname(outpath), exist_ok=True)
    with open(outpath, 'w', encoding="UTF") as f:
        f.write("".join(chunks))
        
# Run the per-year scrape (network requests; writes data/nova_ALL_transcripts.txt).
by_year()

https://www.pbs.org/wgbh/nova/transcripts/year_1995.html: Total of 1 links to scrape


100%|██████████| 1/1 [00:00<00:00,  4.37it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_1996.html: Total of 9 links to scrape


100%|██████████| 9/9 [00:01<00:00,  5.16it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_1997.html: Total of 33 links to scrape


100%|██████████| 33/33 [00:06<00:00,  5.18it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_1998.html: Total of 19 links to scrape


100%|██████████| 19/19 [00:06<00:00,  5.09it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_1999.html: Total of 19 links to scrape


100%|██████████| 19/19 [00:03<00:00,  4.88it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2000.html: Total of 21 links to scrape


100%|██████████| 21/21 [00:04<00:00,  4.73it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2001.html: Total of 20 links to scrape


100%|██████████| 20/20 [00:04<00:00,  4.87it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2002.html: Total of 15 links to scrape


100%|██████████| 15/15 [00:03<00:00,  4.84it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2003.html: Total of 15 links to scrape


100%|██████████| 15/15 [00:03<00:00,  4.83it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2004.html: Total of 14 links to scrape


100%|██████████| 14/14 [00:03<00:00,  4.85it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2005.html: Total of 18 links to scrape


100%|██████████| 18/18 [00:04<00:00,  4.33it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2006.html: Total of 19 links to scrape


100%|██████████| 19/19 [00:04<00:00,  4.60it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2007.html: Total of 18 links to scrape


100%|██████████| 18/18 [00:06<00:00,  3.24it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2008.html: Total of 23 links to scrape


100%|██████████| 23/23 [00:07<00:00,  3.52it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2009.html: Total of 20 links to scrape


100%|██████████| 20/20 [00:37<00:00,  2.24it/s]


https://www.pbs.org/wgbh/nova/transcripts/year_2010.html: Total of 6 links to scrape


100%|██████████| 6/6 [00:01<00:00,  4.30it/s]
