Project 4 "Fletcher" Notebook 1:  
**Scraping data and loading into MongoDB**  
  
Adam Flugel  
Metis Boot Camp, Chicago  
Winter 2018

In [6]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import pickle
import re
import os
import string
from pymongo import MongoClient

# Individual monologue page scraper

In [122]:
def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup object.

    Args:
        url: Address of the page to download (string).
        timeout: Seconds to wait on the server before giving up; handed
            straight to ``requests.get`` so a stalled connection cannot hang
            the whole scrape.

    Returns:
        BeautifulSoup object built from the response text with the "lxml"
        parser.

    Raises:
        AssertionError: If the response status code is anything other
            than 200.
    """
    response = requests.get(url, timeout=timeout)

    # Raised explicitly instead of through an ``assert`` statement so the
    # check still runs when Python is started with -O. The exception type and
    # message are deliberately the ones an assert would have produced.
    if response.status_code != 200:
        raise AssertionError(
            "Problem with url request! %s throws %s" % (url, response.status_code)
        )

    return BeautifulSoup(response.text, "lxml")


def get_monologue(soup):
    """Extract the spoken text from an individual monologue page.

    Args:
        soup: BeautifulSoup object of an individual monologue page.

    Returns:
        The text of the element with class "monologue", with bracketed stage
        directions and the leading character name removed. Whitespace is
        otherwise left exactly as extracted.
    """
    monologue_box = soup.find(class_='monologue')
    raw_text = monologue_box.get_text(' ')

    # Stage directions are written in square brackets. Remove them first so
    # that a direction placed in front of the speaker label cannot hide the
    # label from the anchored pattern below. Anything up to the closing
    # bracket is matched, including parentheses.
    clean_text = re.sub(r'\[[^\]]*\]', '', raw_text)

    # The speaker label is an upper-case name (possibly several words, e.g.
    # "LADY MACBETH") followed by ": " at the very start of the text.
    # Anchoring the pattern and requiring at least one letter leaves ordinary
    # colons inside the speech untouched; leading whitespace is kept as-is.
    clean_text = re.sub(r"^(\s*)[A-Z][A-Z .'\-]*: ", r'\1', clean_text, count=1)

    return clean_text


def get_title(soup):
    """Return the title of the work a monologue was taken from.

    Args:
        soup: BeautifulSoup object of an individual monologue page.

    Returns:
        Text of the h2 inside the element with class "row body", with its
        capitalisation normalised by ``string.capwords``.
    """
    heading = soup.find(class_="row body").h2
    return string.capwords(heading.text)
    
    
def get_author(soup):
    """Return the author named on an individual monologue page.

    Args:
        soup: BeautifulSoup object of an individual monologue page.

    Returns:
        'Anonymous' when the h4 header reads exactly 'An anonymous monologue';
        otherwise the header text with everything up to and including the
        last " by " removed.
    """
    byline = soup.find(class_="row body").h4.text

    # Anonymous pieces use a different header wording that has no " by " part
    if byline != 'An anonymous monologue':
        return re.sub(r'.* by ','', byline)

    return 'Anonymous'
    
    
def parse_monologue_page(url):
    """Scrape one monologue page into a dictionary.

    Args:
        url: String url of an individual monologue page.

    Returns:
        dict with three keys: 'text' (the monologue itself), 'title' (the
        work it comes from) and 'author'.
    """
    soup = get_soup(url)

    # A plain dict is all that is needed here: no default values are ever
    # looked up, and callers only add further keys to the result.
    return {
        'text': get_monologue(soup),
        'title': get_title(soup),
        'author': get_author(soup),
    }

# Scrape a set of urls for all the monologues on MonologueArchive

In [107]:
def get_urls(url):
    """Takes the url for a list of monologues on monologuearchive.com
    and returns a set of urls for every individual monologue page in that list.

    input:
    url = url for a list of monologues on monologuearchive.com

    returns:
    set of absolute urls for each individual monologue linked from the list page"""
    # Imported here so the function carries its own dependency
    from urllib.parse import urljoin

    soup = get_soup(url)

    urls = set()

    # The monologue links sit inside the elements with class 'col-md-6'
    for table in soup.find_all(class_='col-md-6'):
        # href=True skips anchors that have no href attribute, which would
        # otherwise raise a KeyError on tag['href']
        for tag in table.find_all('a', href=True):
            # Resolve the (relative, '../'-style) link against the list page
            # itself rather than patching in a hard-coded host name
            urls.add(urljoin(url, tag['href']))

    return urls

In [81]:
# Build one set of monologue urls per category list page.
# url_master accumulates the union, so each monologue url is held once even
# if it is listed under more than one category.

url_master = set()

comic_men = get_urls('http://www.monologuearchive.com/comic_men.html')
url_master |= comic_men

drama_men = get_urls('http://www.monologuearchive.com/dramatic_men.html')
url_master |= drama_men

classic_men = get_urls('http://www.monologuearchive.com/classical_men.html')
url_master |= classic_men

seniors = get_urls('http://www.monologuearchive.com/seniors.html')
url_master |= seniors

comic_women = get_urls('http://www.monologuearchive.com/comic_women.html')
url_master |= comic_women

drama_women = get_urls('http://www.monologuearchive.com/dramatic_women.html')
url_master |= drama_women

classic_women = get_urls('http://www.monologuearchive.com/classical_women.html')
url_master |= classic_women

children = get_urls('http://www.monologuearchive.com/children.html')
url_master |= children

# Scrape data for all monologues on MonologueArchive and add to MongoDB

In [124]:
# Connect to the MongoDB server on port 12345 and get handles on the
# database and the collection the monologues will be stored in
client = MongoClient(port=12345)
db = client['new_dank_database']

collection = db['monologues']

In [125]:
document_list = []

# (label, urls scraped from that category's list page) pairs, in the order
# the labels are to appear in each document's 'category' tuple
category_sets = (
    ('comic_men', comic_men),
    ('drama_men', drama_men),
    ('classic_men', classic_men),
    ('seniors', seniors),
    ('comic_women', comic_women),
    ('drama_women', drama_women),
    ('classic_women', classic_women),
    ('children', children),
)

for url in url_master:
    monologue = parse_monologue_page(url)

    # Tag the monologue with every category whose list page linked to it
    categories = [label for label, members in category_sets if url in members]
    monologue['category'] = tuple(categories)

    document_list.append(monologue)

In [126]:
# Keep a pickled copy of everything scraped so the site does not have to be
# scraped again
with open('all_docs.pk1', 'wb') as pickle_file:
    pickle.dump(document_list, pickle_file)

In [127]:
#insert all of the docs at once
#(kept as a bare expression so the notebook shows the InsertManyResult)
collection.insert_many(document_list)

<pymongo.results.InsertManyResult at 0x1150ffcf0>

In [128]:
# Spot check: open a cursor over the whole collection and show its first document
query = collection.find()
next(query)

{'_id': ObjectId('5a9db0bfa787026d63428f81'),
 'author': 'Walter Wykes',
 'category': ['comic_men'],
 'text': "Hey! Don't touch that! That's my orange! MINE!!!  Sorry. I'm sorry. I ... I don't mean to be stingy. I'm sure you're very hungry, but I can't allow you to eat this orange. It's just that ... well, it's ... it's the key to everything! I know that doesn't seem to make much sense. I don't understand it quite yet myself. But one has to have faith, you know, that ... well, that everything will come clear in the end.  It ... it must be nice to be a halfwit. A vagrant, I mean. A wanderer. You don't have to contemplate. If you're hungry, you eat. Everything's basic. Primitive. Nothing to confuse the issue. No one to push you around ... tell you what to do. Maybe ... maybe I should join you!  Hey ... maybe ... maybe I should! They'd never find me then! And if they did ... well, they wouldn't recognize me! I'll bet people don't even give you a second look, do they?! They probably cross 

In [129]:
# checking that all monologues made it in; the empty filter counts every
# document. count_documents() replaces Collection.count(), which PyMongo
# deprecated in 3.7 and removed in 4.0.
collection.count_documents({})

465