In [53]:
import requests
import re
import os
import json
from bs4 import BeautifulSoup
from time import sleep
from random import normalvariate

Get the episode links

- Go to show's episodes page
    - Scrape the season dropdown for each individual season
    - Scrape the season for an array of links to each episode page
- Go to each episode page
    - Scrape the episode for the runtime
- Final array of arrays, each sub-array has:
    - season number
    - episode number
    - episode link
    - runtime (in minutes)

In [41]:
def politePause(sec):
    sleep(normalvariate(sec, sec/24))

In [146]:
def scrapeSeasons(episodes_url):
    # Scrape the given URL for an array of total seasons
    req = requests.get(episodes_url)
    soup = BeautifulSoup(req.text, 'html.parser')

    seasons = []
    raw_seasons = soup.find('select', {'id':'bySeason'}).find_all('option')
    for raw_season in raw_seasons:
        seasons.append(int(raw_season.get_text().strip()))
    
    return seasons

In [171]:
def scrapeEpisodeLinks(episodes_url, seasons, pause=2):
    # Scrape the given URL for an array of total episodes
    ep_list = []
    for season in seasons:
        # Scrape the season list
        season_url = episodes_url + "?season=" + str(season)
        req = requests.get(season_url)
        soup = BeautifulSoup(req.text, 'html.parser')

        # Find the list of links
        a_list = soup.find_all('a', {'itemprop': 'name'})

        # Build the output array
        for episode,a in enumerate(a_list):
            ep_dict = {
                "season": season, 
                "episode": episode+1,
                "link": a['href']
                }
            ep_list.append(ep_dict)
            print(ep_dict, flush=True)
            politePause(pause)
    
    return ep_list

In [177]:
def formatRuntime(raw_runtime):
    # Correct runtime from string format to int formatted minutes
    arr_runtime = raw_runtime.replace('Runtime', '').split(' ')
    if arr_runtime[1] == 'hour':
        runtime = int(arr_runtime[0])*60
    else:
        runtime = int(arr_runtime[0])
        
    return runtime

def scrapeRuntime(episode_url):
    # Scrape the provided episode URL for the runtime
    req = requests.get(episode_url)
    soup = BeautifulSoup(req.text, 'html.parser')

    # Get the runtime
    runtime = soup.find("li", {"data-testid": "title-techspec_runtime"}).get_text()

    return runtime

def scrapeRuntimes(ep_list, pause=2):
     for ep in ep_list:
        url = 'https://www.imdb.com' + ep['link']
        raw_runtime = scrapeRuntime(url)
        runtime = formatRuntime(raw_runtime)
        print(str(ep['season']) + 'x' + str(ep['episode']) + ": " + str(runtime), flush=True)
        ep['runtime'] = runtime
        


In [183]:
def main(url, filename):
    seasons = scrapeSeasons(url)
    ep_list = scrapeEpisodeLinks(url, seasons)
    scrapeRuntimes(ep_list)
    
    with open('./output/' + filename +'.json', 'w+') as outfile:
        json.dump(ep_list, outfile)

In [186]:
main('https://www.imdb.com/title/tt0238784/episodes', 'gilmoregirls')

{'season': 1, 'episode': 1, 'link': '/title/tt0931812/'}
{'season': 1, 'episode': 2, 'link': '/title/tt0588176/'}
{'season': 1, 'episode': 3, 'link': '/title/tt0588209/'}
{'season': 1, 'episode': 4, 'link': '/title/tt0588157/'}
{'season': 1, 'episode': 5, 'link': '/title/tt0588202/'}
{'season': 1, 'episode': 6, 'link': '/title/tt0588129/'}
{'season': 1, 'episode': 7, 'link': '/title/tt0588182/'}
{'season': 1, 'episode': 8, 'link': '/title/tt0588158/'}
{'season': 1, 'episode': 9, 'link': '/title/tt0588166/'}
{'season': 1, 'episode': 10, 'link': '/title/tt0588183/'}
{'season': 1, 'episode': 11, 'link': '/title/tt0588141/'}
{'season': 1, 'episode': 12, 'link': '/title/tt0588175/'}
{'season': 1, 'episode': 13, 'link': '/title/tt0588135/'}
{'season': 1, 'episode': 14, 'link': '/title/tt0588131/'}
{'season': 1, 'episode': 15, 'link': '/title/tt0588197/'}
{'season': 1, 'episode': 16, 'link': '/title/tt0588128/'}
{'season': 1, 'episode': 17, 'link': '/title/tt0588192/'}
{'season': 1, 'episode'

{'season': 7, 'episode': 12, 'link': '/title/tt0921218/'}
{'season': 7, 'episode': 13, 'link': '/title/tt0957950/'}
{'season': 7, 'episode': 14, 'link': '/title/tt0957949/'}
{'season': 7, 'episode': 15, 'link': '/title/tt0949011/'}
{'season': 7, 'episode': 16, 'link': '/title/tt0959778/'}
{'season': 7, 'episode': 17, 'link': '/title/tt0976505/'}
{'season': 7, 'episode': 18, 'link': '/title/tt0962360/'}
{'season': 7, 'episode': 19, 'link': '/title/tt0993249/'}
{'season': 7, 'episode': 20, 'link': '/title/tt1001863/'}
{'season': 7, 'episode': 21, 'link': '/title/tt1009614/'}
{'season': 7, 'episode': 22, 'link': '/title/tt1001862/'}
1x1: 44
1x2: 45
1x3: 44
1x4: 43
1x5: 44
1x6: 42
1x7: 45
1x8: 45
1x9: 45
1x10: 45
1x11: 43
1x12: 45
1x13: 42
1x14: 40
1x15: 45
1x16: 43
1x17: 45
1x18: 45
1x19: 45
1x20: 43
1x21: 44
1x22: 44
2x1: 42
2x2: 44
2x3: 44
2x4: 44
2x5: 45
2x6: 45
2x7: 45
2x8: 45
2x9: 44
2x10: 45
2x11: 44
2x12: 45
2x13: 43
2x14: 42
2x15: 42
2x16: 45
2x17: 45
2x18: 44
2x19: 45
2x20: 44
2x

In [184]:
main('https://www.imdb.com/title/tt2006848/episodes', 'bunheads')

{'season': 1, 'episode': 1, 'link': '/title/tt2342525/'}
{'season': 1, 'episode': 2, 'link': '/title/tt2166870/'}
{'season': 1, 'episode': 3, 'link': '/title/tt2166872/'}
{'season': 1, 'episode': 4, 'link': '/title/tt2162290/'}
{'season': 1, 'episode': 5, 'link': '/title/tt2177886/'}
{'season': 1, 'episode': 6, 'link': '/title/tt2228328/'}
{'season': 1, 'episode': 7, 'link': '/title/tt2209310/'}
{'season': 1, 'episode': 8, 'link': '/title/tt2257678/'}
{'season': 1, 'episode': 9, 'link': '/title/tt2297452/'}
{'season': 1, 'episode': 10, 'link': '/title/tt2310468/'}
{'season': 1, 'episode': 11, 'link': '/title/tt2482916/'}
{'season': 1, 'episode': 12, 'link': '/title/tt2614940/'}
{'season': 1, 'episode': 13, 'link': '/title/tt2626494/'}
{'season': 1, 'episode': 14, 'link': '/title/tt2633406/'}
{'season': 1, 'episode': 15, 'link': '/title/tt2573908/'}
{'season': 1, 'episode': 16, 'link': '/title/tt2633112/'}
{'season': 1, 'episode': 17, 'link': '/title/tt2633128/'}
{'season': 1, 'episode'

In [185]:
main('https://www.imdb.com/title/tt5788792/episodes', 'maisel')

{'season': 1, 'episode': 1, 'link': '/title/tt6975526/'}
{'season': 1, 'episode': 2, 'link': '/title/tt7113570/'}
{'season': 1, 'episode': 3, 'link': '/title/tt6976082/'}
{'season': 1, 'episode': 4, 'link': '/title/tt6997106/'}
{'season': 1, 'episode': 5, 'link': '/title/tt7152106/'}
{'season': 1, 'episode': 6, 'link': '/title/tt7086346/'}
{'season': 1, 'episode': 7, 'link': '/title/tt7212460/'}
{'season': 1, 'episode': 8, 'link': '/title/tt7267456/'}
{'season': 2, 'episode': 1, 'link': '/title/tt7667344/'}
{'season': 2, 'episode': 2, 'link': '/title/tt7690008/'}
{'season': 2, 'episode': 3, 'link': '/title/tt7772296/'}
{'season': 2, 'episode': 4, 'link': '/title/tt7772302/'}
{'season': 2, 'episode': 5, 'link': '/title/tt7775670/'}
{'season': 2, 'episode': 6, 'link': '/title/tt7775674/'}
{'season': 2, 'episode': 7, 'link': '/title/tt7775676/'}
{'season': 2, 'episode': 8, 'link': '/title/tt7667348/'}
{'season': 2, 'episode': 9, 'link': '/title/tt7978458/'}
{'season': 2, 'episode': 10, 'l

In [None]:
## Manual run

In [169]:
episodes_url = 'https://www.imdb.com/title/tt0238784/episodes'

In [170]:
seasons = scrapeSeasons(episodes_url)
print(seasons)

[1, 2, 3, 4, 5, 6, 7]


In [172]:
ep_list = scrapeEpisodeLinks(episodes_url, seasons)

{'season': 1, 'episode': 1, 'link': '/title/tt0931812/'}
{'season': 1, 'episode': 2, 'link': '/title/tt0588176/'}
{'season': 1, 'episode': 3, 'link': '/title/tt0588209/'}
{'season': 1, 'episode': 4, 'link': '/title/tt0588157/'}
{'season': 1, 'episode': 5, 'link': '/title/tt0588202/'}
{'season': 1, 'episode': 6, 'link': '/title/tt0588129/'}
{'season': 1, 'episode': 7, 'link': '/title/tt0588182/'}
{'season': 1, 'episode': 8, 'link': '/title/tt0588158/'}
{'season': 1, 'episode': 9, 'link': '/title/tt0588166/'}
{'season': 1, 'episode': 10, 'link': '/title/tt0588183/'}
{'season': 1, 'episode': 11, 'link': '/title/tt0588141/'}
{'season': 1, 'episode': 12, 'link': '/title/tt0588175/'}
{'season': 1, 'episode': 13, 'link': '/title/tt0588135/'}
{'season': 1, 'episode': 14, 'link': '/title/tt0588131/'}
{'season': 1, 'episode': 15, 'link': '/title/tt0588197/'}
{'season': 1, 'episode': 16, 'link': '/title/tt0588128/'}
{'season': 1, 'episode': 17, 'link': '/title/tt0588192/'}
{'season': 1, 'episode'

{'season': 7, 'episode': 12, 'link': '/title/tt0921218/'}
{'season': 7, 'episode': 13, 'link': '/title/tt0957950/'}
{'season': 7, 'episode': 14, 'link': '/title/tt0957949/'}
{'season': 7, 'episode': 15, 'link': '/title/tt0949011/'}
{'season': 7, 'episode': 16, 'link': '/title/tt0959778/'}
{'season': 7, 'episode': 17, 'link': '/title/tt0976505/'}
{'season': 7, 'episode': 18, 'link': '/title/tt0962360/'}
{'season': 7, 'episode': 19, 'link': '/title/tt0993249/'}
{'season': 7, 'episode': 20, 'link': '/title/tt1001863/'}
{'season': 7, 'episode': 21, 'link': '/title/tt1009614/'}
{'season': 7, 'episode': 22, 'link': '/title/tt1001862/'}


In [173]:
## Adjust EP list - There's an unaired pilot that needs to be removed from season 1
# Adjust episode numbers for season 1
for ep in ep_list:
    if ep['season']==1:
        ep['episode']=ep['episode']-1
# Remove unaired pilot
ep_list.pop(0)

{'season': 1, 'episode': 0, 'link': '/title/tt0931812/'}

In [176]:
scrapeRuntimes(ep_list)

7x22: 60

In [178]:
ep_list

[{'season': 1, 'episode': 1, 'link': '/title/tt0588176/', 'runtime': 45},
 {'season': 1, 'episode': 2, 'link': '/title/tt0588209/', 'runtime': 44},
 {'season': 1, 'episode': 3, 'link': '/title/tt0588157/', 'runtime': 43},
 {'season': 1, 'episode': 4, 'link': '/title/tt0588202/', 'runtime': 44},
 {'season': 1, 'episode': 5, 'link': '/title/tt0588129/', 'runtime': 42},
 {'season': 1, 'episode': 6, 'link': '/title/tt0588182/', 'runtime': 45},
 {'season': 1, 'episode': 7, 'link': '/title/tt0588158/', 'runtime': 45},
 {'season': 1, 'episode': 8, 'link': '/title/tt0588166/', 'runtime': 45},
 {'season': 1, 'episode': 9, 'link': '/title/tt0588183/', 'runtime': 45},
 {'season': 1, 'episode': 10, 'link': '/title/tt0588141/', 'runtime': 43},
 {'season': 1, 'episode': 11, 'link': '/title/tt0588175/', 'runtime': 45},
 {'season': 1, 'episode': 12, 'link': '/title/tt0588135/', 'runtime': 42},
 {'season': 1, 'episode': 13, 'link': '/title/tt0588131/', 'runtime': 40},
 {'season': 1, 'episode': 14, 'lin

In [182]:
with open('./output/gg_runtimes.json', 'w+') as outfile:
        json.dump(ep_list, outfile)