In [1]:
import pickle
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address to fetch.
        timeout: seconds to wait for the server before giving up; passed
            straight to `requests.get`. Without it a stalled server would
            block the calling loop indefinitely.

    Returns:
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned when
        the request raises a `RequestException` (which includes timeouts);
        in that case the error is reported through `log_error`.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # reading the body fails part-way through.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response that
    carries no Content-Type header is reported as not HTML rather than
    raising KeyError.
    """
    # .get() instead of [] so a missing header yields None, not an exception.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The current implementation simply writes the error's string form to
    standard output; swap the body for real logging if needed.
    """
    message = str(e)
    print(message)

In [2]:
# Collect the relative URL of every show linked from the site's index pages.
show_urls = []
# One index page per lowercase letter, plus a '19' page
# (presumably titles starting with a digit -- confirm against the site).
letters = list(string.ascii_lowercase)
letters.append('19')
for letter in letters:
    letter_html = simple_get('https://www.allmusicals.com/' + letter + '.htm')
    if letter_html is None:
        # simple_get returns None for failed or non-HTML responses; there is
        # nothing to parse, so report it and move on to the next index page.
        log_error('No HTML returned for index page {0}'.format(letter))
        continue
    letter_soup = BeautifulSoup(letter_html, 'html.parser')
    section_marker = "/" + letter + "/"
    for a in letter_soup.find_all('a'):
        show_link = a.get('href')
        # Anchors without an href give None, which cannot be searched with `in`.
        if show_link and section_marker in show_link:
            show_urls.append(show_link)

['/a/adayinhollywoodanightintheukraine.htm',
 '/a/ace.htm',
 '/a/acrosstheuniverse.htm',
 '/a/actthe.htm',
 '/a/addamsfamilythe.htm',
 '/a/addingmachine.htm',
 '/a/adriftinmacao.htm',
 '/a/adventuresoftomsawyerthe.htm',
 '/a/aida.htm',
 '/a/aintmisbehavin.htm',
 '/a/ainttooproud.htm',
 '/a/aladdin.htm',
 '/a/alasalackzorrosback.htm',
 '/a/allshookup.htm',
 '/a/allegro.htm',
 '/a/altarboyz.htm',
 '/a/americanidiot.htm',
 '/a/americaninparisan.htm',
 '/a/americanmall.htm',
 '/a/anastasia.htm',
 '/a/andrewlloydwebberdivas.htm',
 '/a/annakarenina.htm',
 '/a/annie.htm',
 '/a/anniegetyourgun.htm',
 '/a/anyonecanwhistle.htm',
 '/a/anythinggoes.htm',
 '/a/applause.htm',
 '/a/appletreethe.htm',
 '/a/arkthe.htm',
 '/a/asthousandscheer.htm',
 '/a/aspectsoflove.htm',
 '/a/assassins.htm',
 '/a/avenueq.htm',
 '/a/aladdin.htm',
 '/b/babesinarms.htm',
 '/b/baby.htm',
 '/b/badgirls.htm',
 '/b/bajour.htm',
 '/b/bakerswifethe.htm',
 '/b/bandsvisit.htm',
 '/b/bandstand.htm',
 '/b/bareapopopera.htm',
 '/b/

In [3]:
# Map each show's URL slug (file name without directory or extension) to the
# display title taken from the show page's <title> element.
word_titles = {}
for show_url in show_urls:
    page_html = simple_get('https://www.allmusicals.com' + show_url)
    if page_html is None:
        # Failed or non-HTML response: skip this show instead of aborting the crawl.
        log_error('No HTML returned for show page {0}'.format(show_url))
        continue
    page_soup = BeautifulSoup(page_html, 'html.parser')
    if page_soup.title is None:
        log_error('No <title> element on show page {0}'.format(show_url))
        continue
    title_text = page_soup.title.text
    # The show name is whatever precedes the word "Lyrics" in the page title.
    marker_pos = title_text.find("Lyrics")
    if marker_pos < 1:
        # Marker missing, or nothing in front of it: no show name to extract.
        log_error('Unexpected page title on {0}: {1}'.format(show_url, title_text))
        continue
    title_url = show_url[show_url.rfind('/')+1:show_url.rfind('.')]
    # marker_pos - 1 also drops the single character that separates the show
    # name from "Lyrics".
    word_titles[title_url] = title_text[:marker_pos - 1]

In [5]:
# Inspect the scraped mapping of URL slug -> show title.
word_titles

{'110intheshade': '110 in the Shade',
 '13': '13',
 '1776': '1776',
 '25thannualputnamcountyspellingbee': '25th Annual Putnam County Spelling Bee',
 '42ndstreet': '42nd Street',
 '70girls70': '70, Girls, 70',
 '9to5': '9 to 5',
 'ace': 'Ace',
 'acrosstheuniverse': 'Across the Universe',
 'actthe': 'Act, The',
 'adayinhollywoodanightintheukraine': 'A Day in Hollywood / A Night in the Ukraine',
 'addamsfamilythe': 'Addams Family, The',
 'addingmachine': 'Adding Machine',
 'adriftinmacao': 'Adrift In Macao',
 'adventuresoftomsawyerthe': 'Adventures of Tom Sawyer, The',
 'aida': 'Aida',
 'aintmisbehavin': "Ain't Misbehavin'",
 'ainttooproud': "Ain't Too Proud",
 'aladdin': 'Aladdin',
 'alasalackzorrosback': "Alas! Alack! Zorro's Back!",
 'allegro': 'Allegro',
 'allshookup': 'All Shook Up',
 'altarboyz': 'Altar Boyz',
 'americanidiot': 'American Idiot',
 'americaninparisan': 'American in Paris, An',
 'americanmall': 'American Mall',
 'anastasia': 'Anastasia',
 'andrewlloydwebberdivas': 'And

In [None]:
# Scrape each show's review page and collect its text fragments, keyed by the
# show's display title.
descriptions = {}
for show, title in word_titles.items():
    # Every title gets an entry; an empty list means no description was found.
    descriptions[title] = []
    review_html = simple_get('https://www.allmusicals.com/lyrics/' + show + '/review.htm')
    if review_html is None:
        continue
    review_soup = BeautifulSoup(review_html, 'html.parser')
    # The review text sits inside the first <div id="page">.
    page_div = review_soup.find('div', attrs={'id': 'page'})
    if page_div is None:
        # Page layout differs from what is expected: keep the empty entry
        # rather than failing on a missing element.
        continue
    descriptions[title].extend(page_div.stripped_strings)

In [17]:
# Number of entries (one per distinct show title) in `descriptions`.
len(descriptions)

473

In [18]:
# List the titles whose description is an empty list.
empty_titles = [title for title, fragments in descriptions.items() if fragments == []]
for title in empty_titles:
    print(title)

I Sing
Prom Night
Civil War: The Complete Work, The
Ark, The


In [28]:
# Drop the first and the last fragment of every non-empty description.
for title, fragments in descriptions.items():
    if fragments != []:
        descriptions[title] = fragments[1:-1]

In [37]:
# Collapse every multi-fragment description into one space-separated string.
# Lists holding zero or one fragment are left as they are.
for key in descriptions:
    fragments = descriptions[key]
    # Only join values that are still lists: if this cell is re-run, joining an
    # already-joined string would insert a space between every character.
    if isinstance(fragments, list) and len(fragments) > 1:
        descriptions[key] = " ".join(fragments)

In [39]:
# Persist the descriptions to disk. The with-block closes the file even if
# pickling raises part-way through.
with open("show_descriptions.pkl", "wb") as f:
    pickle.dump(descriptions, f)