In [3]:
import pickle
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string

def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    Parameters:
        url: address to request.
        timeout: seconds to wait for the server before giving up, passed
            straight to `requests.get`. Without it a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The response body as returned by `resp.content` when
        `is_good_response` accepts the response, otherwise None.
        None is also returned (after logging through `log_error`) when the
        request raises a `RequestException`, which includes timeouts.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # reading the body fails part-way through.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains "html" (case-insensitive). A response without a
    Content-Type header is treated as not HTML instead of raising KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers a None value.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    The current implementation only prints `e` to standard output; replace
    the body to route errors elsewhere (a file, a logger, ...).
    """
    message = str(e)
    print(message)

In [4]:
# Collect the relative URL of every show linked from the site's index pages:
# one page per lowercase letter plus the '19' page (presumably the index for
# titles starting with a digit -- confirm against the site).
# NOTE: a link can be listed more than once on an index page, so show_urls
# may contain duplicates; the order and duplicates are kept as scraped.
show_urls = []
letters = list(string.ascii_lowercase)
letters.append('19')
for letter in letters:
    letter_html = simple_get('https://www.allmusicals.com/' + letter + '.htm')
    if letter_html is None:
        # simple_get returns None on a request error or a non-HTML response;
        # BeautifulSoup(None) would raise, so skip this index page instead.
        log_error('No HTML returned for index page {0}'.format(letter))
        continue
    letter_soup = BeautifulSoup(letter_html, 'html.parser')
    # href=True skips anchors without an href attribute, for which
    # a.get('href') would be None and the `in` test below would raise.
    for a in letter_soup.find_all('a', href=True):
        show_link = a['href']
        if "/" + letter + "/" in show_link:
            show_urls.append(show_link)

In [5]:
# Display the collected relative show URLs.
show_urls

['/a/adayinhollywoodanightintheukraine.htm',
 '/a/ace.htm',
 '/a/acrosstheuniverse.htm',
 '/a/actthe.htm',
 '/a/addamsfamilythe.htm',
 '/a/addingmachine.htm',
 '/a/adriftinmacao.htm',
 '/a/adventuresoftomsawyerthe.htm',
 '/a/aida.htm',
 '/a/aintmisbehavin.htm',
 '/a/ainttooproud.htm',
 '/a/aladdin.htm',
 '/a/alasalackzorrosback.htm',
 '/a/allshookup.htm',
 '/a/allegro.htm',
 '/a/altarboyz.htm',
 '/a/americanidiot.htm',
 '/a/americaninparisan.htm',
 '/a/americanmall.htm',
 '/a/anastasia.htm',
 '/a/andrewlloydwebberdivas.htm',
 '/a/annakarenina.htm',
 '/a/annie.htm',
 '/a/anniegetyourgun.htm',
 '/a/anyonecanwhistle.htm',
 '/a/anythinggoes.htm',
 '/a/applause.htm',
 '/a/appletreethe.htm',
 '/a/arkthe.htm',
 '/a/asthousandscheer.htm',
 '/a/aspectsoflove.htm',
 '/a/assassins.htm',
 '/a/avenueq.htm',
 '/a/aladdin.htm',
 '/b/babesinarms.htm',
 '/b/baby.htm',
 '/b/badgirls.htm',
 '/b/bajour.htm',
 '/b/bakerswifethe.htm',
 '/b/bandsvisit.htm',
 '/b/bandstand.htm',
 '/b/bareapopopera.htm',
 '/b/

In [6]:
# Map each show's URL slug (file name without directory and extension) to the
# show title, taken from the part of the page <title> that precedes "Lyrics".
word_titles = {}
for show_url in show_urls:
    page_html = simple_get('https://www.allmusicals.com' + show_url)
    if page_html is None:
        # simple_get returns None on a request error or a non-HTML response.
        log_error('No HTML returned for {0}'.format(show_url))
        continue
    page_soup = BeautifulSoup(page_html, 'html.parser')
    if page_soup.title is None:
        log_error('No <title> found for {0}'.format(show_url))
        continue
    title_text = page_soup.title.text
    lyrics_at = title_text.find("Lyrics")
    if lyrics_at < 1:
        # "Lyrics" missing (or at the very start): no usable show name.
        log_error('Unexpected page title for {0}: {1}'.format(show_url, title_text))
        continue
    title_url = show_url[show_url.rfind('/')+1:show_url.rfind('.')]
    # Drop the single character immediately before "Lyrics".
    word_titles[title_url] = title_text[:lyrics_at - 1]

In [7]:
# Display the URL-slug -> show-title mapping.
word_titles

{'110intheshade': '110 in the Shade',
 '13': '13',
 '1776': '1776',
 '25thannualputnamcountyspellingbee': '25th Annual Putnam County Spelling Bee',
 '42ndstreet': '42nd Street',
 '70girls70': '70, Girls, 70',
 '9to5': '9 to 5',
 'ace': 'Ace',
 'acrosstheuniverse': 'Across the Universe',
 'actthe': 'Act, The',
 'adayinhollywoodanightintheukraine': 'A Day in Hollywood / A Night in the Ukraine',
 'addamsfamilythe': 'Addams Family, The',
 'addingmachine': 'Adding Machine',
 'adriftinmacao': 'Adrift In Macao',
 'adventuresoftomsawyerthe': 'Adventures of Tom Sawyer, The',
 'aida': 'Aida',
 'aintmisbehavin': "Ain't Misbehavin'",
 'ainttooproud': "Ain't Too Proud",
 'aladdin': 'Aladdin',
 'alasalackzorrosback': "Alas! Alack! Zorro's Back!",
 'allegro': 'Allegro',
 'allshookup': 'All Shook Up',
 'altarboyz': 'Altar Boyz',
 'americanidiot': 'American Idiot',
 'americaninparisan': 'American in Paris, An',
 'americanmall': 'American Mall',
 'anastasia': 'Anastasia',
 'andrewlloydwebberdivas': 'And

In [None]:
# For every show, collect the text fragments of its review page, keyed by the
# show title. Shows whose review page cannot be fetched, or that has no
# <div id="page">, keep an empty list.
descriptions = {}
for show, title in word_titles.items():
    descriptions[title] = []
    review_html = simple_get('https://www.allmusicals.com/lyrics/' + show + '/review.htm')
    if review_html is None:
        continue
    review_soup = BeautifulSoup(review_html, 'html.parser')
    # find() returns the first matching div or None, avoiding the IndexError
    # that indexing an empty find_all() result would raise.
    page_div = review_soup.find('div', attrs={'id': 'page'})
    if page_div is None:
        continue
    descriptions[title].extend(page_div.stripped_strings)

In [17]:
# Number of titles with a description entry (entries may still be empty).
len(descriptions)

473

In [18]:
# List the titles for which no review text was collected.
for title, parts in descriptions.items():
    if parts == []:
        print(title)

I Sing
Prom Night
Civil War: The Complete Work, The
Ark, The


In [28]:
# Drop the first and the last fragment of every non-empty description
# (presumably page boilerplate around the review text -- confirm).
# NOTE: this rewrites `descriptions` in place, so running the cell again
# trims one more fragment from each end.
for title, parts in descriptions.items():
    if parts != []:
        descriptions[title] = parts[1:-1]

In [37]:
# Collapse each multi-fragment description into one space-separated string.
for key, value in descriptions.items():
    # Only join values that are still lists: on a re-run the value is already
    # a string, and " ".join(<str>) would insert a space between every
    # character.
    # NOTE(review): empty and single-fragment lists are left as lists, so the
    # dict ends up holding both str and list values -- confirm consumers cope.
    if isinstance(value, list) and len(value) > 1:
        descriptions[key] = " ".join(value)

In [39]:
# Persist the scraped descriptions. The context manager closes the file even
# if pickle.dump raises.
with open("show_descriptions.pkl", "wb") as f:
    pickle.dump(descriptions, f)

In [None]:
#image scraping

In [7]:
import urllib.request as urllib2

import re

import os

from os.path import basename

from urllib.parse import urlsplit

from urllib.parse import urlparse

from posixpath import basename,dirname

In [8]:
# Create the output directory for downloaded images. exist_ok=True keeps the
# cell re-runnable: os.mkdir raises FileExistsError once the directory exists.
os.makedirs('images', exist_ok=True)

In [41]:
# Download one .jpg per show page into images/<slug>.jpg.
# Every show is visited, and shows whose image file already exists are
# skipped. An interrupted run can therefore be resumed by re-running the cell,
# with no hand-edited start index.
for show_url in show_urls:
    filname = 'images/' + show_url[show_url.rfind("/")+1:show_url.rfind(".htm")] + ".jpg"
    if os.path.exists(filname):
        continue

    url = "https://www.allmusicals.com" + show_url
    show_html = simple_get(url)
    if show_html is None:
        # simple_get returns None on a request error or a non-HTML response.
        log_error('No HTML returned for {0}'.format(url))
        continue
    show_soup = BeautifulSoup(show_html, 'html.parser')

    # First <img> whose src contains a literal ".jpg" (dot escaped so that
    # e.g. "xjpg" does not match).
    img_tag = show_soup.find('img', {'src': re.compile(r'\.jpg')})
    if img_tag is None:
        log_error('No .jpg image found on {0}'.format(url))
        continue

    # Read the src attribute directly and cut it right after ".jpg".
    src = img_tag['src']
    img = src[:src.find(".jpg") + 4]
    # Assumes src is a site-relative path -- TODO confirm.
    imgurl = "https://www.allmusicals.com" + img
    try:
        with urllib2.urlopen(imgurl) as response:
            imgdata = response.read()
    except OSError as e:
        # urllib's URLError/HTTPError derive from OSError; one failed image
        # should not abort the remaining downloads.
        log_error('Error downloading {0} : {1}'.format(imgurl, str(e)))
        continue

    with open(filname, 'wb') as output:
        output.write(imgdata)

In [None]:
#Dictionary combining

In [1]:
# NOTE(review): the recorded output of this cell is a NameError; its execution
# count (1) suggests it was run in a fresh kernel before word_titles had been
# built -- confirm, and re-run only after the cell that defines word_titles.
word_titles

NameError: name 'word_titles' is not defined

In [19]:
# Invert word_titles: whitespace-stripped show title -> URL slug.
title_to_url = {}
for slug, title in word_titles.items():
    title_to_url[title.strip()] = slug

In [20]:
# Display the show-title -> URL-slug mapping.
title_to_url

{'110 in the Shade': '110intheshade',
 '13': '13',
 '1776': '1776',
 '25th Annual Putnam County Spelling Bee': '25thannualputnamcountyspellingbee',
 '42nd Street': '42ndstreet',
 '70, Girls, 70': '70girls70',
 '9 to 5': '9to5',
 'A Day in Hollywood / A Night in the Ukraine': 'adayinhollywoodanightintheukraine',
 'Ace': 'ace',
 'Across the Universe': 'acrosstheuniverse',
 'Act, The': 'actthe',
 'Addams Family, The': 'addamsfamilythe',
 'Adding Machine': 'addingmachine',
 'Adrift In Macao': 'adriftinmacao',
 'Adventures of Tom Sawyer, The': 'adventuresoftomsawyerthe',
 'Aida': 'aida',
 "Ain't Misbehavin'": 'aintmisbehavin',
 "Ain't Too Proud": 'ainttooproud',
 'Aladdin': 'aladdin',
 "Alas! Alack! Zorro's Back!": 'alasalackzorrosback',
 'All Shook Up': 'allshookup',
 'Allegro': 'allegro',
 'Altar Boyz': 'altarboyz',
 'American Idiot': 'americanidiot',
 'American Mall': 'americanmall',
 'American in Paris, An': 'americaninparisan',
 'Anastasia': 'anastasia',
 'Andrew Lloyd Webber Divas': '

In [10]:
# Load the lyrics data and the scraped descriptions.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced locally or come from a trusted source.
# The context managers close the files, which the bare open() calls never did.
with open('broadway_lyrics_v4.pkl', 'rb') as f:
    bd = pickle.load(f)
with open('show_descriptions.pkl', 'rb') as f:
    descs = pickle.load(f)

In [25]:
# Re-key the descriptions by whitespace-stripped title.
descs2 = {title.strip(): text for title, text in descs.items()}

In [29]:
# Attach the description and the image path to every show record in bd.
# A show missing from descs2 or title_to_url raises KeyError.
for title, record in bd.items():
    record['description'] = descs2[title]
    record['img_name'] = 'app/static/images/' + title_to_url[title] + '.jpg'

In [32]:
# Spot-check a single merged record.
bd['Dear Evan Hansen']

{'composer': 'Benj Pasek, Justin Paul',
 'currently_playing': True,
 'description': 'Watching “Dear Evan Hansen,” now playing at Off Broadway’s Second Stage Theater, is like falling down a rabbit hole that leads back to high school. Steven Levenson’s book for this bittersweet musical captures both the humor and the pathos of a hopeless misfit (played to perfection by Ben Platt) who achieves popularity through no fault of his own. And not since “Next to Normal” has a score (by Benj Pask and Justin Paul) tapped so deeply into the psyches of its troubled characters. Credit director Michael Greif, who also directed the original production at Arena Stage, for the sensitive tone of comic gravitas. Although he’s made it to senior year, Evan Hansen (Platt) suffers from the kind of debilitating anxiety that demands regular shrink visits and many pills. Platt (who appears as Benji in both “Pitch Perfect” movies) gives a carefully choreographed physical performance that makes the kid’s discomfort

In [33]:
# Persist the merged records. The context manager closes the file even if
# pickle.dump raises.
with open("broadway_lyrics_v5.pkl", "wb") as f:
    pickle.dump(bd, f)

In [43]:
# Build display names: a trailing ", The" / ", An" / ", A" is moved to the
# front of the title (e.g. 'Act, The' -> 'The Act'); other titles are kept.
printable_names = {}
for title in bd:
    shown = title
    for suffix, prefix in ((', The', 'The '), (', An', 'An '), (', A', 'A ')):
        if title.endswith(suffix):
            # Everything before the last comma is the title proper.
            shown = prefix + title[:title.rfind(',')]
            break
    printable_names[title] = shown

In [48]:
# Display the stored-title -> display-title mapping.
printable_names

{'110 in the Shade': '110 in the Shade',
 '13': '13',
 '1776': '1776',
 '25th Annual Putnam County Spelling Bee': '25th Annual Putnam County Spelling Bee',
 '42nd Street': '42nd Street',
 '70, Girls, 70': '70, Girls, 70',
 '9 to 5': '9 to 5',
 'A Day in Hollywood / A Night in the Ukraine': 'A Day in Hollywood / A Night in the Ukraine',
 'Ace': 'Ace',
 'Across the Universe': 'Across the Universe',
 'Act, The': 'The Act',
 'Addams Family, The': 'The Addams Family',
 'Adding Machine': 'Adding Machine',
 'Adrift In Macao': 'Adrift In Macao',
 'Adventures of Tom Sawyer, The': 'The Adventures of Tom Sawyer',
 'Aida': 'Aida',
 "Ain't Misbehavin'": "Ain't Misbehavin'",
 "Ain't Too Proud": "Ain't Too Proud",
 'Aladdin': 'Aladdin',
 "Alas! Alack! Zorro's Back!": "Alas! Alack! Zorro's Back!",
 'All Shook Up': 'All Shook Up',
 'Allegro': 'Allegro',
 'Altar Boyz': 'Altar Boyz',
 'American Idiot': 'American Idiot',
 'American Mall': 'American Mall',
 'American in Paris, An': 'An American in Paris',


In [50]:
# backend_to_proper is another name for printable_names (same dict object);
# proper_to_backend is its inverse. If two stored titles share a display
# title, the one iterated last wins.
backend_to_proper = printable_names
proper_to_backend = {}
for backend, proper in backend_to_proper.items():
    proper_to_backend[proper] = backend

In [53]:
import json

In [54]:
# Write both title mappings to JSON files in the working directory.
for json_path, mapping in (
    ('backend_to_proper.json', backend_to_proper),
    ('proper_to_backend.json', proper_to_backend),
):
    with open(json_path, 'w') as fp:
        json.dump(mapping, fp)