In [26]:
# import statements for cells
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
from time import time

In [27]:
def extract_season_num(season_text):
    """Parse the season number from a heading such as ``'Season 12'``.

    The number is taken to be whatever follows the last space in the text,
    after trailing white space has been removed.

    Args:
        season_text: Heading text, e.g. ``'Season 1'``. May be ``None`` or
            an empty string.

    Returns:
        The season number as an ``int``, or ``-1`` when the text is empty,
        contains no space, or does not end in a run of decimal digits.
    """
    if not season_text:
        return -1

    # Trailing white space (e.g. a newline picked up from the HTML) would
    # otherwise become part of the last token and hide a valid number.
    trimmed = season_text.rstrip()
    last_space = trimmed.rfind(' ')
    if last_space == -1:
        return -1

    season_num_text = trimmed[last_space + 1:]
    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as superscripts and fractions, which int() rejects with ValueError.
    if season_num_text.isdecimal():
        return int(season_num_text)

    return -1

def extract_ep_info(ep_text):
    """Split episode link text such as ``'3. Some title'`` into number and title.

    The text is split at the first dot: the part before it is the episode
    number, the part after it is the title.

    Args:
        ep_text: Link text of the form ``'<number>.<title>'``. May be
            ``None`` or an empty string.

    Returns:
        A ``(ep_num, ep_title)`` tuple. ``(-1, None)`` is returned when the
        text is empty, contains no dot, has nothing after the first dot, or
        the part before the first dot is not a decimal number. The title has
        surrounding white space removed.
    """
    if not ep_text:
        return -1, None

    num_text, dot, rest = ep_text.partition('.')
    # A dot must be present and at least one character must follow it.
    if not dot or not rest:
        return -1, None

    num_text = num_text.strip()
    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as superscripts and fractions, which int() rejects with ValueError.
    if not num_text.isdecimal():
        return -1, None

    return int(num_text), rest.strip()

def clean_script(text):
    """Collapse white space to single spaces and unescape apostrophes.

    Args:
        text: Raw text to clean.

    Returns:
        The text with every run of white space replaced by one space,
        leading and trailing white space removed, and every apostrophe that
        is preceded by a backslash replaced by a plain apostrophe.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    # "\\'" is a literal backslash followed by an apostrophe. Writing it as
    # "\'" yields just "'" (the string literal consumes the backslash), which
    # turns the replace into a no-op.
    text = text.replace("\\'", "'")

    return text

In [28]:
# Sanity checks for the parsing helpers defined above

# (heading text, expected season number)
for heading, expected_num in [
    ('Season 1', 1),
    ('Season 23', 23),
    ('Season43', -1),
    ('asdfasdas werqwq', -1),
]:
    assert extract_season_num(heading) == expected_num

# (episode link text, expected (number, title) pair)
for link_text, expected_info in [
    ('1. Some episode', (1, 'Some episode')),
    ('2.NoSpaceForSomeReason', (2, 'NoSpaceForSomeReason')),
    ('Someone. messed up!', (-1, None)),
]:
    assert extract_ep_info(link_text) == expected_info

In [29]:
# Import the raw scripts from "Springfield, Springfield!" site
script_dl_start = time()

script_base_url = 'https://www.springfieldspringfield.co.uk'
episodes = {}

# Load up the main episodes page. The response is used as a context manager
# so the HTTP connection is released as soon as the page has been parsed.
with urlopen(script_base_url + '/episode_scripts.php?tv-show=the-simpsons') as eps_pg:
    eps_pg_soup = BeautifulSoup(eps_pg, 'html.parser')

season_divs = eps_pg_soup.select('div.season-episodes')
# Loop through every season and get the links to the episodes
for season_div in season_divs:
    season_name = season_div.find('h3').get_text()
    season_num = extract_season_num(season_name)
    episodes[season_num] = {}
    ep_links = season_div.select('a.season-episode-title')
    print('Downloading Season {season_num}...'.format(season_num=season_num))

    # Loop through all the episodes per season and save the script text
    for ep_link in ep_links:
        ep_info = ep_link.get_text()
        ep_num, ep_title = extract_ep_info(ep_info)
        episodes[season_num][ep_num] = {}
        script_link = ep_link['href']
        # One request per episode: close each response once parsed instead of
        # leaving hundreds of connections for the garbage collector.
        with urlopen(script_base_url + '/' + script_link) as script_pg:
            # Name the parser explicitly, as for the index page, so the
            # result does not depend on which parsers happen to be installed.
            script_pg_soup = BeautifulSoup(script_pg, 'html.parser')
        script_text_raw = script_pg_soup.select('div.scrolling-script-container')[0].get_text()
        script_text_clean = clean_script(script_text_raw)
        episodes[season_num][ep_num]['script'] = script_text_clean
        episodes[season_num][ep_num]['title'] = ep_title

elapsed_time = time() - script_dl_start
print('Time: ' + str(elapsed_time))

Downloading Season 1...
Downloading Season 2...
Downloading Season 3...
Downloading Season 4...
Downloading Season 5...
Downloading Season 6...
Downloading Season 7...
Downloading Season 8...
Downloading Season 9...
Downloading Season 10...
Downloading Season 11...
Downloading Season 12...
Downloading Season 13...
Downloading Season 14...
Downloading Season 15...
Downloading Season 16...
Downloading Season 17...
Downloading Season 18...
Downloading Season 19...
Downloading Season 20...
Downloading Season 21...
Downloading Season 22...
Downloading Season 23...
Downloading Season 24...
Downloading Season 25...
Downloading Season 26...
Downloading Season 27...
Downloading Season 28...
Downloading Season 29...
Downloading Season 30...
Time: 665.754105091095


In [30]:
# Import episode summaries from pogdesign TV Calendar
summary_dl_task_start = time()
ep_summary_url_format = 'https://www.pogdesign.co.uk/cat/The-Simpsons/Season-{season_num}/Episode-{ep_num}'

for season_num in episodes:
    print('Downloading summaries for Season {season_num}...'.format(season_num=season_num))
    for ep_num in episodes[season_num]:
        # Best effort: a failure for one episode is reported and the loop
        # moves on, leaving that episode without a 'summary' entry.
        try:
            summary_url = ep_summary_url_format.format(season_num=season_num, ep_num=ep_num)
            # Close each response as soon as it has been parsed.
            with urlopen(summary_url) as summary_pg:
                summary_pg_soup = BeautifulSoup(summary_pg, 'html.parser')
            summary_tag = summary_pg_soup.find('p', class_='sumtext')
            if summary_tag is None:
                # Report the real problem instead of an AttributeError on None.
                raise ValueError('no <p class="sumtext"> element found on the page')
            ep_summary = summary_tag.get_text()
            episodes[season_num][ep_num]['summary'] = clean_script(ep_summary)
        except Exception as e:
            print('\tError at Season {season_num} episode {ep_num}: {error}'.format(season_num=season_num, ep_num=ep_num, error=str(e)))

elapsed_time = time() - summary_dl_task_start
print('Time: {elapsed_time}'.format(elapsed_time=str(elapsed_time)))

Downloading summaries for Season 1...
Downloading summaries for Season 2...
Downloading summaries for Season 3...
Downloading summaries for Season 4...
Downloading summaries for Season 5...
Downloading summaries for Season 6...
Downloading summaries for Season 7...
Downloading summaries for Season 8...
Downloading summaries for Season 9...
Downloading summaries for Season 10...
Downloading summaries for Season 11...
Downloading summaries for Season 12...
Downloading summaries for Season 13...
Downloading summaries for Season 14...
Downloading summaries for Season 15...
Downloading summaries for Season 16...
Downloading summaries for Season 17...
Downloading summaries for Season 18...
Downloading summaries for Season 19...
Downloading summaries for Season 20...
Downloading summaries for Season 21...
Downloading summaries for Season 22...
Downloading summaries for Season 23...
Downloading summaries for Season 24...
Downloading summaries for Season 25...
Downloading summaries for Season 2

In [31]:
# Persist the collected episode dictionary to disk for later use

import pickle

# Binary mode is required for pickle output; the highest protocol available
# in the running interpreter is requested.
with open('simpsons_scripts.pickle', 'wb') as pickle_out:
    pickle.dump(episodes, pickle_out, pickle.HIGHEST_PROTOCOL)