In [1]:
from bs4 import BeautifulSoup
import requests
import calendar
import datetime
import contextlib
import os
from IPython.display import clear_output
import numpy as np

In [2]:
# Helper: displays progress bar

def jupyter_progress(percentage):
    clear_output(wait=True)
    bar_length = 20
    block = int(round(percentage * bar_length))
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), percentage * 100)
    print(text)

In [3]:
# Helper: Given a filename and a function, only runs the function if the file does not exist and then
# writes the return value to that file. Afterwards, returns the content of that file.

def load_maybe_file(generator, filename, binary=False):
    binflag = 'b' if binary else ''
    
    if not os.path.exists(filename):
        data = generator()
        if data is not None:
            with open(filename, 'w'+binflag) as f:
                f.write(data)
    with open(filename, 'r'+binflag) as f:
        return f.read()

In [4]:
# Helper: processes a HTTP request
def http_request(url):
    try:
        with contextlib.closing(requests.get(url, stream=True)) as resp:
            content_type = resp.headers['Content-Type'].lower()
            is_good_response = (resp.status_code == 200 and content_type is not None)
            if is_good_response:
                return resp.content
            else:
                return None
    except requests.RequestException as e:
        print("ERROR!", str(e))
        return None

In [5]:
# For a given day (as datetime.date object), finds top 48 celebrities (by id name) with that birthday.

def get_top_day_html(date):
    day_string = calendar.month_name[date.month].lower() + str(date.day)
    url = 'https://www.famousbirthdays.com/{}.html'.format(day_string)
    return http_request(url)
    
def get_top_day_celebrities(date):
    html = get_day_html(date)
    bs = BeautifulSoup(html, 'html.parser')

    for a in bs.select('a'):
        if 'class' in a.attrs and a['class'] == ['face', 'person-item']:
            yield a['href'].replace('https://www.famousbirthdays.com/people/', '').replace('.html', '')

In [6]:
def generate_celeb_ids():
    # Collect all days in a year.
    dates = []
    for mon in range(1, 13):
        for day in range(1, 32):
            try:
                dates.append(datetime.date(datetime.MINYEAR, mon, day))
            except ValueError:
                pass

    # Collect all top celebrities from all days.
    celebs = []
    i = 0.0
    for date in dates:
        i += 1
        jupyter_progress(i / len(dates))
        celebs += list(get_top_day_celebrities(date))
        
    "\n".join(celebs)

In [7]:
celeb_ids = load_maybe_file(generate_celeb_ids, 'data/celeb_ids.txt').split("\n")

len(celeb_ids)

17520

In [8]:
# given a celebrity ID (e.g. chase-rice), scrapes the page 
# (https://www.famousbirthdays.com/people/chase-rice.html) for all photos and returns their URLs.

def get_person_page_html(id):
    url = 'https://www.famousbirthdays.com/people/{}.html'.format(id)
    return http_request(url)

def scrape_photo_urls(id):
    html = get_person_page_html(id)
    bs = BeautifulSoup(html, 'html.parser')
    image_tags = bs.find_all('img')
    for img in image_tags:
        if 'src' in img.attrs:
            src = img['src']
            if 'faces' in src or 'headshots' in src:
                yield src

In [9]:
def generate_celeb_photos():
    result = []
    i = 0.0
    for celeb in celeb_ids:
        i += 1
        jupyter_progress(i / len(celeb_ids))
        print(celeb)
        result.append("\n".join(scrape_photo_urls(celeb)))
    return "\n\n".join(result)

In [10]:
celeb_photos = load_maybe_file(generate_celeb_photos, 'data/celeb_photos.txt')

In [11]:
# Load photo URLs

bad_images = set(['https://www.famousbirthdays.com/faces/large-default.jpg'])

photo_urls_faces = [] # main picture
photo_urls_headshots = [] # secondary pictures
for person in celeb_photos.split("\n\n"):
    for photo in person.split("\n"):
        if photo not in bad_images:
            if 'https://www.famousbirthdays.com/faces/' in photo:
                photo_urls_faces.append(photo)
            else:
                photo_urls_headshots.append(photo)
   
            
            
print(len(photo_urls_faces))
print(len(photo_urls_headshots))


15339
67503


In [12]:
photo_urls_faces[0]

'https://www.famousbirthdays.com/faces/poppy-that-image.jpg'

In [13]:
def download_photo(url):
    photo_name = url.split('/')[-1]
    with open('photos/' + photo_name, 'wb') as f:
        f.write(http_request(url))

try:
    os.mkdir("photos")
except FileExistsError:
    pass

try:
    os.mkdir("photos/faces")
except FileExistsError:
    pass

try:
    os.mkdir("photos/headshots")
except FileExistsError:
    pass

In [14]:
# Download all photos as images to ./photo/
i = 0.0
for p in photo_urls_faces:
    i += 1
    jupyter_progress(i / len(photo_urls_faces))
    try:
        load_maybe_file(lambda: http_request(p), 'photos/faces/' + p.split('/')[-1], binary=True)
    except FileNotFoundError:
        pass

Progress: [####################] 100.0%


In [None]:
i = 0.0
for p in photo_urls_headshots:
    i += 1
    jupyter_progress(i / len(photo_urls_headshots))
    load_maybe_file(lambda: http_request(p), 'photos/headshots/' + p.split('/')[-1], binary=True)