## 1. Get text from web

In [2]:
import urllib.request, time, re, random, hashlib, nltk
from bs4 import BeautifulSoup as bs
%matplotlib notebook

In [4]:
## Compassionate Caching
# This ensures that pages are cached as files in your working directory, which avoids unnecessary load on websites.
# When a page does have to be downloaded, the requests are staggered to resemble human browsing.
# time.time() of the most recent network request; None until one has been made.
last_fetched_at = None

def fetch(url):
    """Load the url compassionately.

    The page is looked up in a local cache file named after the SHA-1 hash of
    the url; a non-empty cache file is returned without touching the network.
    Otherwise the page is downloaded (after waiting so that consecutive
    requests are 3-10 seconds apart), decoded to text, written to the cache
    file and returned.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    str
        The decoded text of the page.

    Raises
    ------
    urllib.error.URLError
        If the page has to be downloaded and the request fails.

    Notes
    -----
    Cache files written by the previous version of this function contain the
    ``repr`` of the raw bytes (``"b'...'"``) and are still returned as-is;
    delete them to re-download properly decoded text.
    """

    global last_fetched_at

    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            result = f.read()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable cache file: treat it as a cache miss.
        result = ''
    if len(result) > 0:
        print("Retrieving from cache:", url)
        return result

    print("Loading:", url)
    # Random pause in seconds (drawn in milliseconds for finer granularity).
    # time.time() works in seconds, so both sides of the comparison must too.
    wait_interval = random.randint(3000, 10000) / 1000
    if last_fetched_at is not None:
        elapsed = time.time() - last_fetched_at
        if elapsed < wait_interval:
            time.sleep(wait_interval - elapsed)

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        # Use the charset announced by the server, falling back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'

    # Decode the bytes instead of str(bytes), which would yield "b'...'".
    try:
        result = raw.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset Python does not know.
        result = raw.decode('utf-8', errors='replace')

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(result)
    return result

In [5]:
# Try fetch() on a single article URL; the returned page text is shown as the cell output.
fetch('http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/trump-new-hampshire-probably-suits-me-better/79726718/')

Retrieving from cache: http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/trump-new-hampshire-probably-suits-me-better/79726718/


'b\'<!doctype html><!--[if IE 8]><html class="no-js lt-ie10 lt-ie9" lang="en"><![endif]--><!--[if IE 9]><html class="no-js lt-ie10" lang="en"><![endif]--><!--[if gt IE 10]><!--><html class="no-js" lang="en"  itemscope itemtype="http://schema.org/NewsArticle" ><!--<![endif]--><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><title>Trump: New Hampshire \\\'probably suits me better\\\'</title><meta content="text/html; charset=UTF-8" name="Content-Type" /><meta name="google" content="notranslate"><meta name="description" itemprop="description" content="Finishing second? It&#39;s not so bad, Donald Trump told a cheering rally in New Hampshire less than 24 hours after Texas Sen. Ted Cruz defeated him in the Iowa caucuses."><link rel="canonical" href="http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/trump-new-hampshire-probably-suits-me-better/79726718/"><meta property="fb:app_id" content="215046668549694"/><meta property="og:site_name" content="USA TODAY" /><met

In [11]:
## Search for articles with a certain topic
# First, get the page with search results
def get_search_results(entity):
    """Return an html with search results for given entity.

    Parameters
    ----------
    entity : str
        Free-text search query, e.g. ``"donald trump"``.

    Returns
    -------
    str
        Whatever ``fetch`` returns for the USA TODAY search URL built from
        ``entity``.
    """
    from urllib.parse import quote

    # Percent-encode the whole query, not just spaces: characters such as
    # '&', '#', '?' or '/' would otherwise change the meaning of the URL.
    # safe='' makes '/' get escaped too, since the query is one path segment.
    # A plain "a b" query still becomes "a%20b", so existing cache entries
    # keyed on the URL stay valid.
    query = quote(entity, safe='')

    return fetch('http://www.usatoday.com/search/' + query + '/')

In [13]:
# Load the search results page for the query "donald trump" and display the returned text.
results_html = get_search_results("donald trump")
results_html

Retrieving from cache: http://www.usatoday.com/search/donald%20trump/


'b\'<!doctype html><!--[if IE 8]><html class="no-js lt-ie10 lt-ie9" lang="en"><![endif]--><!--[if IE 9]><html class="no-js lt-ie10" lang="en"><![endif]--><!--[if gt IE 10]><!--><html class="no-js" lang="en" ><!--<![endif]--><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><title>USA TODAY</title><meta content="text/html; charset=UTF-8" name="Content-Type" /><meta name="google" content="notranslate"><meta property="fb:app_id" content="215046668549694"/><meta property="og:site_name" content="USA TODAY" /><meta name="ROBOTS" content="NOODP, NOYDIR" /><meta name="cXenseParse:pageclass" content="frontpage" /><meta name="cXenseParse:recs:category" content="search" /><meta name="cXenseParse:title" content="Search Results for &quot;donald trump&quot;" /><meta name="cXenseParse:recs:contenttype" content="search-results" /><meta name="cXenseParse:gci_asset_type" content="search-results" /><meta name="viewport" content="width=1070" /><link rel="shortcut icon" href="http://www.gannett

In [17]:
# Fetch the articles linked from the search result page (returns their HTML, skipping video links)
def get_articles(results_html, n=3):
    """Return a list of article htmls for given search results html.

    Parameters
    ----------
    results_html : str
        HTML of a search results page, as returned by ``get_search_results``.
    n : int, optional
        Maximum number of articles to fetch (default 3). For ``n <= 0``
        nothing is fetched and an empty list is returned.

    Returns
    -------
    list of str
        The fetched pages of the first ``n`` result links that are not
        videos, in page order. May be shorter than ``n`` if the page holds
        fewer usable links.
    """
    from urllib.parse import urljoin

    articles = []
    if n <= 0:
        # Avoid downloading an article only to throw it away.
        return articles

    soup = bs(results_html, 'lxml')
    for a in soup.find_all('a', class_='search-result-item-link'):
        sub_link = a.get('href')
        # Skip anchors without an href, and exclude the links for videos.
        if not sub_link or sub_link.startswith('/videos/'):
            continue
        # urljoin leaves absolute links untouched and resolves relative ones
        # (with or without a leading slash) against the site root.
        articles.append(fetch(urljoin('http://www.usatoday.com/', sub_link)))
        if len(articles) >= n:
            break

    return articles

In [18]:
# Parse the search results page with the lxml parser and print an indented
# rendering of the parse tree to inspect its structure.
soup = bs(results_html, 'lxml')
print(soup.prettify())

<html>
 <body>
  <p>
   b'
   <!DOCTYPE html>
   <!--[if IE 8]><html class="no-js lt-ie10 lt-ie9" lang="en"><![endif]-->
   <!--[if IE 9]><html class="no-js lt-ie10" lang="en"><![endif]-->
   <!--[if gt IE 10]><!-->
   <!--<![endif]-->
  </p>
  <title>
   USA TODAY
  </title>
  <meta content="text/html; charset=UTF-8" name="Content-Type"/>
  <meta content="notranslate" name="google"/>
  <meta content="215046668549694" property="fb:app_id"/>
  <meta content="USA TODAY" property="og:site_name"/>
  <meta content="NOODP, NOYDIR" name="ROBOTS"/>
  <meta content="frontpage" name="cXenseParse:pageclass"/>
  <meta content="search" name="cXenseParse:recs:category"/>
  <meta content='Search Results for "donald trump"' name="cXenseParse:title"/>
  <meta content="search-results" name="cXenseParse:recs:contenttype"/>
  <meta content="search-results" name="cXenseParse:gci_asset_type"/>
  <meta content="width=1070" name="viewport"/>
  <link href="http://www.gannett-cdn.com/sites/usatoday/images/favic

In [19]:
# Collect every <a> tag that carries the 'search-result-item-link' class and display the matches.
tag_set = soup.find_all('a', class_='search-result-item-link')
tag_set

[<a class="search-result-item-link" data-ht="search_results_video-playlist_1" href="/videos/news/politics/elections/2016/2016/02/03/donald-trump-on-the-campaign-trail/71659642/"><div class="front"><figure class="search-result-figure"><img class="search-result-image" src="http://www.gannett-cdn.com/-mm-/dc6bab708a212ecd9f3b6aca35c50697de6bd32a/r=x328&amp;c=440x325/http/videos.usatoday.net/Brightcove2/29906170001/2016/02/29906170001_4736978855001_4736962853001-vs.jpg"/></figure><p class="meta"><span class="date-created meta-info-text">Feb 3, 2016</span></p><h3 class="search-result-title search-results-headline">Donald Trump on the campaign trail</h3><p class="text">Donald Trump on the campaign trail</p><div class="clearleft"></div></div><div class="back"><h3 class="search-result-title-back search-results-headline">Donald Trump on the campaign trail</h3><p class="text">Donald Trump on the campaign trail</p></div></a>,
 <a class="search-result-item-link" data-ht="search_results_text_2" hre

In [20]:
# Inspect the href attribute of the first matched link.
tag_set[0].get('href')

'/videos/news/politics/elections/2016/2016/02/03/donald-trump-on-the-campaign-trail/71659642/'

In [24]:
# Fetch up to five non-video articles from the search results and display the first one.
articles = get_articles(results_html, n=5)
articles[0]

Retrieving from cache: http://www.usatoday.com/story/news/politics/onpolitics/2016/02/03/donald-trump-ted-cruz-iowa-caucuses/79748926/
Retrieving from cache: http://www.usatoday.com/story/news/politics/elections/2016/2016/02/03/meet-people-who-give-money-billionaire/79710764/
Retrieving from cache: http://www.usatoday.com/story/money/columnist/rieder/2016/02/03/rieder-what-next-trump-media-circus/79751140/
Retrieving from cache: http://www.usatoday.com/story/money/columnist/rieder/2016/01/27/rieder-trump-meets-his-match-kelly/79400776/
Retrieving from cache: http://www.usatoday.com/story/news/politics/elections/2016/01/30/iowa-poll-des-moines-register-bloomberg-republicans-cruz-trump/79514706/


'b\'<!doctype html><!--[if IE 8]><html class="no-js lt-ie10 lt-ie9" lang="en"><![endif]--><!--[if IE 9]><html class="no-js lt-ie10" lang="en"><![endif]--><!--[if gt IE 10]><!--><html class="no-js" lang="en"  itemscope itemtype="http://schema.org/NewsArticle" ><!--<![endif]--><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><title>Donald Trump says Ted Cruz \\\'stole\\\' Iowa</title><meta content="text/html; charset=UTF-8" name="Content-Type" /><meta name="google" content="notranslate"><meta name="description" itemprop="description" content="Suffice it to say, he&#39;s not so congratulatory anymore."><link rel="canonical" href="http://www.usatoday.com/story/news/politics/onpolitics/2016/02/03/donald-trump-ted-cruz-iowa-caucuses/79748926/"><meta property="fb:app_id" content="215046668549694"/><meta property="og:site_name" content="USA TODAY" /><meta name="ROBOTS" content="NOODP, NOYDIR" /><meta property="og:image" content="http://www.gannett-cdn.com/-mm-/4ba952a39bfa318478a9

In [None]:
from nltk import word_tokenize

def get_words(a):
    """Return the lower-cased tokens found in the paragraphs of an article.

    Parameters
    ----------
    a : str
        HTML of an article page.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` for each selected ``<p>`` tag,
        concatenated in document order.
    """
    parsed = bs(a, 'lxml')
    # NOTE(review): class_=None presumably limits the match to <p> tags
    # without a class attribute - confirm against the BeautifulSoup docs.
    return [
        token
        for paragraph in parsed.find_all('p', class_=None)
        for token in word_tokenize(paragraph.get_text().lower())
    ]

In [26]:
# Load a single article page to experiment with paragraph extraction, and display its text.
article = fetch('http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/trump-new-hampshire-probably-suits-me-better/79726718/')
article

Retrieving from cache: http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/trump-new-hampshire-probably-suits-me-better/79726718/


'b\'<!doctype html><!--[if IE 8]><html class="no-js lt-ie10 lt-ie9" lang="en"><![endif]--><!--[if IE 9]><html class="no-js lt-ie10" lang="en"><![endif]--><!--[if gt IE 10]><!--><html class="no-js" lang="en"  itemscope itemtype="http://schema.org/NewsArticle" ><!--<![endif]--><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><title>Trump: New Hampshire \\\'probably suits me better\\\'</title><meta content="text/html; charset=UTF-8" name="Content-Type" /><meta name="google" content="notranslate"><meta name="description" itemprop="description" content="Finishing second? It&#39;s not so bad, Donald Trump told a cheering rally in New Hampshire less than 24 hours after Texas Sen. Ted Cruz defeated him in the Iowa caucuses."><link rel="canonical" href="http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/trump-new-hampshire-probably-suits-me-better/79726718/"><meta property="fb:app_id" content="215046668549694"/><meta property="og:site_name" content="USA TODAY" /><met

In [27]:
# Parse the article, select its <p> tags with class_=None, and keep only their text content.
soup = bs(article, 'lxml')
p_set = soup.find_all('p', class_=None)
paragraphs = [p.get_text() for p in p_set]
paragraphs

["b'",
 "MILFORD, N.H. \\xe2\\x80\\x94\\xc2\\xa0Finishing second? It\\'s not so bad, Donald Trump told a cheering rally in New Hampshire less than 24 hours after Texas Sen. Ted Cruz defeated \\xc2\\xa0him in the Iowa caucuses.",
 '"I think we did really well, we did really well," Trump declared Tuesday night in his first public appearance since he failed to meet expectations that he would win the opening Iowa caucuses. He said his decision to skip the final Iowa debate in a dispute with Fox News might have cost him some support, but he cast it as an act of philanthropy because he hosted instead a fundraiser to help U.S. military vets.',
 '"I\\\'ll take the $6 million for the vets" over a first-place finish, he declared.',
 'Donald Trump speaks with members of the media during a news conference on Feb. 2, 2016, in Milford, N.H.\xa0(Photo: Matt Rourke, AP)',
 'Trump delivered an energetic rambling address to an enthusiastic crowd of thousands of people in a sports facility on a cold wint

In [28]:
# Tokenize each paragraph in turn and gather all tokens in one flat list.
words = []
for para in paragraphs:
    para_tokens = nltk.word_tokenize(para)
    words.extend(para_tokens)
words

['b',
 "'",
 'MILFORD',
 ',',
 'N.H.',
 '\\xe2\\x80\\x94\\xc2\\xa0Finishing',
 'second',
 '?',
 'It\\',
 "'s",
 'not',
 'so',
 'bad',
 ',',
 'Donald',
 'Trump',
 'told',
 'a',
 'cheering',
 'rally',
 'in',
 'New',
 'Hampshire',
 'less',
 'than',
 '24',
 'hours',
 'after',
 'Texas',
 'Sen.',
 'Ted',
 'Cruz',
 'defeated',
 '\\xc2\\xa0him',
 'in',
 'the',
 'Iowa',
 'caucuses',
 '.',
 '``',
 'I',
 'think',
 'we',
 'did',
 'really',
 'well',
 ',',
 'we',
 'did',
 'really',
 'well',
 ',',
 "''",
 'Trump',
 'declared',
 'Tuesday',
 'night',
 'in',
 'his',
 'first',
 'public',
 'appearance',
 'since',
 'he',
 'failed',
 'to',
 'meet',
 'expectations',
 'that',
 'he',
 'would',
 'win',
 'the',
 'opening',
 'Iowa',
 'caucuses',
 '.',
 'He',
 'said',
 'his',
 'decision',
 'to',
 'skip',
 'the',
 'final',
 'Iowa',
 'debate',
 'in',
 'a',
 'dispute',
 'with',
 'Fox',
 'News',
 'might',
 'have',
 'cost',
 'him',
 'some',
 'support',
 ',',
 'but',
 'he',
 'cast',
 'it',
 'as',
 'an',
 'act',
 'of',
 

## 2. Get text from PDF

To install, run the following command in your terminal:
####  *sudo pip install PyPDF2*

In [29]:
from PyPDF2 import PdfFileWriter, PdfFileReader
# Open the PDF in binary mode and hand the file object to the PyPDF2 reader.
# NOTE(review): the file object is never closed explicitly - presumably the
# reader needs it open while pages are read; confirm, and close it when done.
input_file = PdfFileReader(open("document.pdf", "rb"))

In [30]:
# Number of pages in the PDF.
input_file.getNumPages()

12

In [31]:
# Extract the text of the page at index 0.
input_file.getPage(0).extractText()

'Proceedings of the 2010 Winter Simulation Conference \nB. Johansson, S. Jain, J. Montoya-Torres, J. Hugan, and E. Yücesan, eds. \n   TO AGENT-BASED SIMULATION FROM SYSTEM DYNAMICS \n Charles M. Macal \n Argonne National Laboratory \nCenter for Complex Adaptive Agent Systems Simulation (CAS\n2) \n9700 S. Cass Ave. \nArgonne, IL 60439, USA \n ABSTRACT \nAgent-based simulation (ABS) is a recent modeling technique that is being widely used in modeling \ncomplex social systems. Forrester™s System Dynamics (SD) is another longstanding technique for model-\ning social systems. Several classical models of systems, such as the Kermack-McKendrick model of epi-\n\ndemiology, the Lotka-Volterra equations for modeling predator-prey relationships, and the Bass model for \ninnovation diffusion are formulated as systems of differential equations and have corresponding System \nDynamics representations as difference equations. The ABS and SD modeling approaches take funda-\nmentally different perspect

In [32]:
# Extract the text of every page and concatenate it in page order.
# str.join builds the result in one pass instead of re-copying the growing
# string on every `+=`.
text = ''.join(
    input_file.getPage(page_id).extractText()
    for page_id in range(input_file.getNumPages())
)
text

'Proceedings of the 2010 Winter Simulation Conference \nB. Johansson, S. Jain, J. Montoya-Torres, J. Hugan, and E. Yücesan, eds. \n   TO AGENT-BASED SIMULATION FROM SYSTEM DYNAMICS \n Charles M. Macal \n Argonne National Laboratory \nCenter for Complex Adaptive Agent Systems Simulation (CAS\n2) \n9700 S. Cass Ave. \nArgonne, IL 60439, USA \n ABSTRACT \nAgent-based simulation (ABS) is a recent modeling technique that is being widely used in modeling \ncomplex social systems. Forrester™s System Dynamics (SD) is another longstanding technique for model-\ning social systems. Several classical models of systems, such as the Kermack-McKendrick model of epi-\n\ndemiology, the Lotka-Volterra equations for modeling predator-prey relationships, and the Bass model for \ninnovation diffusion are formulated as systems of differential equations and have corresponding System \nDynamics representations as difference equations. The ABS and SD modeling approaches take funda-\nmentally different perspect

In [33]:
# Split the extracted PDF text into tokens with NLTK.
nltk.word_tokenize(text)

['Proceedings',
 'of',
 'the',
 '2010',
 'Winter',
 'Simulation',
 'Conference',
 'B.',
 'Johansson',
 ',',
 'S.',
 'Jain',
 ',',
 'J.',
 'Montoya-Torres',
 ',',
 'J.',
 'Hugan',
 ',',
 'and',
 'E.',
 'Yücesan',
 ',',
 'eds',
 '.',
 'TO',
 'AGENT-BASED',
 'SIMULATION',
 'FROM',
 'SYSTEM',
 'DYNAMICS',
 'Charles',
 'M.',
 'Macal',
 'Argonne',
 'National',
 'Laboratory',
 'Center',
 'for',
 'Complex',
 'Adaptive',
 'Agent',
 'Systems',
 'Simulation',
 '(',
 'CAS',
 '2',
 ')',
 '9700',
 'S.',
 'Cass',
 'Ave.',
 'Argonne',
 ',',
 'IL',
 '60439',
 ',',
 'USA',
 'ABSTRACT',
 'Agent-based',
 'simulation',
 '(',
 'ABS',
 ')',
 'is',
 'a',
 'recent',
 'modeling',
 'technique',
 'that',
 'is',
 'being',
 'widely',
 'used',
 'in',
 'modeling',
 'complex',
 'social',
 'systems',
 '.',
 'Forrester™s',
 'System',
 'Dynamics',
 '(',
 'SD',
 ')',
 'is',
 'another',
 'longstanding',
 'technique',
 'for',
 'model-',
 'ing',
 'social',
 'systems',
 '.',
 'Several',
 'classical',
 'models',
 'of',
 'syste

## 3. Get text from social media

To install, run the following command in your terminal (the code below uses the `twitter` package, Python Twitter Tools, not `python-twitter`):
####  *sudo pip install twitter*

In [34]:
import twitter

import os

# Read the credentials from environment variables so that real keys never get
# saved inside the notebook. The '<...>' placeholders remain as fallbacks and
# must be replaced (or the variables set) before the API can authenticate.
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '<...>')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '<...>')
OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', '<...>')
OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', '<...>')

# Note the argument order: token and token secret first, then consumer key and secret.
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

twitter_api = twitter.Twitter(auth=auth)

In [36]:
# Search for tweets matching the hashtag, asking for up to 100 results,
# and keep the list stored under the 'statuses' key of the response.
q = '#humanservices'
search_results = twitter_api.search.tweets(q=q, count=100)
statuses = search_results['statuses']

In [38]:
# Text of the first returned status.
statuses[0]['text']

'HSHS intro video 12 13 12 BC https://t.co/rNxMEAXNtI via @YouTube\n#Pathways #Academies #HumanServices #HealthSciences #CareerPath'

In [39]:
# Gather the 'text' field of every returned status into a list and display it.
status_texts = [ status['text'] for status in statuses ]
status_texts

['HSHS intro video 12 13 12 BC https://t.co/rNxMEAXNtI via @YouTube\n#Pathways #Academies #HumanServices #HealthSciences #CareerPath',
 'The @MassCouncil analysis of the H2 Budget. #HumanServices #mapoli https://t.co/Sqp7iHSAYT',
 'Want to help your #community? Discover #degrees in #HumanServices today! https://t.co/7w3F0yMt0M https://t.co/c651Fl26Ax',
 'How can #HumanServices agencies move from #childsupport collections to collaboration? https://t.co/QeSoGJbY24 https://t.co/weIYxhpl5a',
 'What exactly IS the National #HumanServices #Interoperability Architecture? (NHSIA) https://t.co/fQYznS4ipN',
 'Why’s failure the key to success in #HumanServices? Read more https://t.co/4jjNrDxzT8 https://t.co/Ew8nzKJVkQ',
 'Explore the journey towards a generative business model in #HumanServices  https://t.co/54qpBdVZk1',
 'Shout out:\nLongstanding exemplary work @santamonicacity \n#housing  #humanservices \n@SantaMonicaPD \nto #endhomelessness \n@RANDCorporation',
 'RT @AccenturePubSvc: #HumanSer