# Using requests - low level, by sending HTTP requests directly (vs. using a higher-level library)

In [4]:
import requests

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/Nobel_Prize')
dir(response)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [4]:
response.status_code

200

In [5]:
response.headers

{'Via': '1.1 varnish-v4, 1.1 varnish-v4', 'Age': '26376', 'Vary': 'Accept-Encoding,Cookie,Authorization', 'Backend-Timing': 'D=1246853 t=1482261890469998', 'X-Varnish': '650829615 656834659, 349523744 78702209', 'Server': 'mw1264.eqiad.wmnet', 'X-Powered-By': 'HHVM/3.12.7', 'Date': 'Wed, 21 Dec 2016 02:44:27 GMT', 'Cache-Control': 'private, s-maxage=0, max-age=0, must-revalidate', 'Accept-Ranges': 'bytes', 'X-Content-Type-Options': 'nosniff', 'Content-Encoding': 'gzip', 'Set-Cookie': 'WMF-Last-Access=21-Dec-2016;Path=/;HttpOnly;secure;Expires=Sun, 22 Jan 2017 00:00:00 GMT, GeoIP=US:ID:Ketchum:43.78:-114.63:v4; Path=/; secure; Domain=.wikipedia.org', 'P3P': 'CP="This is not a P3P policy! See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'X-Analytics': 'ns=0;page_id=21201;https=1;nocookies=1', 'X-Cache': 'cp1066 hit/4, cp1053 hit/12', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'X-Client-IP': '184.183.121.100', 'Last-Modifie

In [6]:
response.text

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Nobel Prize - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Nobel_Prize","wgTitle":"Nobel Prize","wgCurRevisionId":755131091,"wgRevisionId":755131091,"wgArticleId":21201,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Swedish-language sources (sv)","EngvarB from November 2016","Use dmy dates from August 2012","Articles containing potentially dated statements from 2015","All articles containing potentially dated statements","Articles containing Norwegian-language text","Articles containing potentially dated statements from 2012","Artic

In [7]:
response = requests.get('https://cdph.data.ca.gov/api/views/6tej-5zx7/rows.json?accessType=DOWNLOAD')
response.status_code

403

In [1]:
OECD_ROOT_URL = 'http://stats.oecd.org/sdmx-json/data'

In [5]:
def make_OECD_request(dsname, dimensions, params=None, root_dir=OECD_ROOT_URL):
    if not params:
        params = {}
        
    dim_args = ['+'.join(d) for d in dimensions]
    dim_str = '.'.join(dim_args)
    
    url = root_dir + '/' + dsname + '/' + dim_str + '/all'
    print('Requesting URL: ' + url)
    return requests.get(url, params=params)

In [9]:
response = make_OECD_request('QNA',
                            (('USA','AUS'),('GDP','B1_GE'),
                             ('CUR','VOBARSA'), ('Q')),
                             {'startTime': '2009-Q1',
                              'endTime': '2010-Q1'})
response.status_code

Requesting URL: http://stats.oecd.org/sdmx-json/data/QNA/USA+AUS.GDP+B1_GE.CUR+VOBARSA.Q/all


200

In [13]:
if response.status_code == 200:
    json = response.json()
    print(json.keys())

dict_keys(['header', 'structure', 'dataSets'])


In [12]:
json.keys()

dict_keys(['header', 'structure', 'dataSets'])

## restcountries.eu

In [14]:
REST_EU_ROOT_URL = 'http://restcountries.eu/rest/v1'

In [29]:
def REST_country_request(field='all', name=None, params=None):
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    if not params:
        params = {}
        
    if field == 'all':
        return requests.get(REST_EU_ROOT_URL + '/all')
    
    url = '%s/%s/%s'%(REST_EU_ROOT_URL, field, name)
    print('Requesting URL: ' + url)
    response = requests.get(url, params=params, headers=headers)
    
    if not response.status_code == 200:
        raise Exception('Request failed with status code' + str(response.status_code))
        
    return response

In [16]:
# all countries using the US dollar
response = REST_country_request('currency', 'usd')
response.json()

Requesting URL: http://restcountries.eu/rest/v1/currency/usd


[{'alpha2Code': 'AS',
  'alpha3Code': 'ASM',
  'altSpellings': ['AS', 'Amerika Sāmoa', 'Amelika Sāmoa', 'Sāmoa Amelika'],
  'area': 199.0,
  'borders': [],
  'callingCodes': ['1684'],
  'capital': 'Pago Pago',
  'currencies': ['USD'],
  'demonym': 'American Samoan',
  'gini': None,
  'languages': ['en', 'sm'],
  'latlng': [-14.33333333, -170.0],
  'name': 'American Samoa',
  'nativeName': 'American Samoa',
  'numericCode': '016',
  'population': 55519,
  'region': 'Oceania',
  'relevance': '0.5',
  'subregion': 'Polynesia',
  'timezones': ['UTC-11:00'],
  'topLevelDomain': ['.as'],
  'translations': {'de': 'Amerikanisch-Samoa',
   'es': 'Samoa Americana',
   'fr': 'Samoa américaines',
   'it': 'Samoa Americane',
   'ja': 'アメリカ領サモア'}},
 {'alpha2Code': 'BQ',
  'alpha3Code': 'BES',
  'altSpellings': ['BQ', 'Boneiru'],
  'area': 294.0,
  'borders': [],
  'callingCodes': ['5997'],
  'capital': 'Kralendijk',
  'currencies': ['USD'],
  'demonym': 'Dutch',
  'gini': None,
  'languages': ['nl']

And, since the data set is small, store locally in Mongo.

In [19]:
DB_NOBEL_PRIZE = 'nobel_prize'
COLL_WINNERS = 'winners'
COLL_COUNTRIES = 'country_data'

In [21]:
import pymongo

In [22]:
def get_mongo_database(db_name, host='localhost', 
                       port=27017, username=None, password=None):
    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s/%s'%(username, password, host, db_name)
        conn = pymongo.MongoClient(mongo_uri)
    else:
        conn = pymongo.MongoClient(host, port)
        
    return conn[db_name]

In [23]:
db_nobel = get_mongo_database(DB_NOBEL_PRIZE)
col = db_nobel[COLL_COUNTRIES]

In [24]:
# get all country data (by passing no params)
response = REST_country_request()

In [27]:
# and save the documents directly to the database
col.insert_many(response.json())

<pymongo.results.InsertManyResult at 0x107677438>

And now we can query the database to get all countries using the US dollar (same results as above, should be).

In [28]:
res = col.find({'currencies':{'$in':['USD']}})
list(res)

[{'_id': ObjectId('585b0f9e5c0a210295c08345'),
  'alpha2Code': 'AS',
  'alpha3Code': 'ASM',
  'altSpellings': ['AS', 'Amerika Sāmoa', 'Amelika Sāmoa', 'Sāmoa Amelika'],
  'area': 199.0,
  'borders': [],
  'callingCodes': ['1684'],
  'capital': 'Pago Pago',
  'currencies': ['USD'],
  'demonym': 'American Samoan',
  'gini': None,
  'languages': ['en', 'sm'],
  'latlng': [-14.33333333, -170.0],
  'name': 'American Samoa',
  'nativeName': 'American Samoa',
  'numericCode': '016',
  'population': 55519,
  'region': 'Oceania',
  'relevance': '0.5',
  'subregion': 'Polynesia',
  'timezones': ['UTC-11:00'],
  'topLevelDomain': ['.as'],
  'translations': {'de': 'Amerikanisch-Samoa',
   'es': 'Samoa Americana',
   'fr': 'Samoa américaines',
   'it': 'Samoa Americane',
   'ja': 'アメリカ領サモア'}},
 {'_id': ObjectId('585b0f9e5c0a210295c0835c'),
  'alpha2Code': 'BQ',
  'alpha3Code': 'BES',
  'altSpellings': ['BQ', 'Boneiru'],
  'area': 294.0,
  'borders': [],
  'callingCodes': ['5997'],
  'capital': 'Kra

# Tweepy and Twitter

In [32]:
import json

In [37]:
with open('keys.txt') as key_file:
    keys = json.load(key_file)

In [38]:
import tweepy

In [39]:
auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
auth.set_access_token(keys['access_token'], keys['access_toket_secret'])
api = tweepy.API(auth)

In [40]:
public_tweets = api.home_timeline()
for tweet in public_tweets:
    print(tweet.text)

We found this charming spot to eat noodles. @ Gontaro Of Kyoto https://t.co/zsX2HDcQgx
RT @ClickHole: You’re A Computer. Can You Pass The Turing Test? https://t.co/5Gs7Dfyt6Y, https://t.co/TxcABlxIGC
The Golden Pavillion. @ Kinkaku-ji https://t.co/5s1jIhSatp
RT @margotcodes: Love seeing how companies give back to their communities and to organizations that matter. Thanks @StackOverflow: https://…
RT @psaffo: 2.9 million votes: that's larger than the populations of 15 states incl Nebraska, N/S Dakota, Montana, Maine, New Hampshire, Wy…
This handheld GBA/GB emulator is fantastic. Works perfectly, great controls and screen https://t.co/3C7HjWujWW
RT @oldirtyfuckaroo: he matches his drink and shirt everyday https://t.co/Rnh9BtfKvC
RT @washingtonpost: After legalization, teen marijuana use drops sharply in Colorado
https://t.co/kucNSOvuU1
The 2017 PAX South Tabletop Indie Showcase is live! They're all rad titles and we're VERY proud to have them at PAX! https://t.co/p8YqYevPVd
Why does Flas

In [41]:
from tweepy.streaming import StreamListener

In [42]:
class MyStreamListener(StreamListener):
    
    def __init__(self, api, **kw):
        self.api = api
        super(tweepy.StreamListener, self).__init__()
        self.col = get_mongo_database('tweets', **kw)['tweets']
        
    def on_data(self, tweet):
        self.col.insert(json.loads(tweet))
        
    def on_error(self, status):
        return True  # keep stream open

This will run until you break/interrupt it - it'll continually get new tweets and write them to the Mongo db called tweets. You can see the data easily in the database using mongo shell (or Python). With Mongo shell, run 'mongo', then 'use tweets' to switch to the database, then 'db.tweets.find()' to show all the tweets/documents in the database.

In [47]:
# uncomment to fill db w/ data from the stream listener
#stream = tweepy.Stream(auth, MyStreamListener(api))
#stream.filter(track=['python', 'javascript', 'dataviz'])

# BeautifulSoup

In [48]:
from bs4 import BeautifulSoup

In [93]:
BASE_URL = 'http://en.wikipedia.org'
HEADERS = {'User-Agent': 'Mozilla/5.0'} # otherwise Wikipedia rejects the request

def get_response_from_url(url_suffix):
    response = requests.get(BASE_URL + url_suffix, headers=HEADERS)
    return response

def get_Nobel_soup():
    """Return a parsed tag tree from the Nobel Prize page."""
    response = get_response_from_url('/wiki/List_of_Nobel_laureates')
    
    return BeautifulSoup(response.content, 'lxml')

In [94]:
soup = get_Nobel_soup()

This uses classes, but isn't robust because it relies even on things like the order of classes as they're used in the HTML.

In [95]:
soup.find('table', {'class':'wikitable sortable'})

<table class="wikitable sortable">
<tr>
<th>Year</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>
or Medicine</a></th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>
<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>
<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>
</tr>
<tr>
<td align="center">1901</td>
<td><span class="sortkey">Röntgen, Wilhelm</span><spa

Somewhat more robust is to use the CSS selectors (that you get when you specify the lxml parser). For example, order doesn't matter, as the following two snippets show.

In [56]:
soup.select('table.sortable.wikitable')

[<table class="wikitable sortable">
 <tr>
 <th>Year</th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>
 or Medicine</a></th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>
 </tr>
 <tr>
 <td align="center">1901</td>
 <td><span class="sortkey">Röntgen, Wilh

In [57]:
soup.select('table.wikitable.sortable')

[<table class="wikitable sortable">
 <tr>
 <th>Year</th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>
 or Medicine</a></th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>
 </tr>
 <tr>
 <td align="center">1901</td>
 <td><span class="sortkey">Röntgen, Wilh

In [58]:
table = soup.select_one('table.sortable.wikitable')
table.select('th')

[<th>Year</th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>
 or Medicine</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>,
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>,
 <th>Year</th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in 

In [59]:
# equivalent to table.select('th')
table('th')

[<th>Year</th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>
 or Medicine</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>,
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>,
 <th>Year</th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in 

In [60]:
def get_column_titles(table):
    cols = []
    
    for th in table.select_one('tr').select('th')[1:]:
        link = th.select_one('a')
        if link:
            cols.append({'name':link.text,
                         'href':link.attrs['href']})
        else:
            cols.append({'name':th.text, 'href':None})
            
    return cols            

In [61]:
get_column_titles(table)

[{'href': '/wiki/List_of_Nobel_laureates_in_Physics', 'name': 'Physics'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Chemistry', 'name': 'Chemistry'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine',
  'name': 'Physiology\nor Medicine'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Literature', 'name': 'Literature'},
 {'href': '/wiki/List_of_Nobel_Peace_Prize_laureates', 'name': 'Peace'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Economics', 'name': 'Economics'}]

In [72]:
def get_Nobel_winners(table):
    cols = get_column_titles(table)
    winners = []
    
    # one row per year, with columns for each winner
    for row in table.select('tr')[1:-1]:
        year = int(row.select_one('td').text[:4]) # gets first <td>
        # then cycle through all columns creating a Winner for each
        for i, td in enumerate(row.select('td')[1:]):
            for winner in td.select('a'):
                href = winner.attrs['href']
                if not href.startswith('#endnote'):
                    winners.append({
                        'year':year,
                        'category':cols[i]['name'],
                        'name':winner.text,
                        'link':winner.attrs['href']
                    })
    
    return winners

In [82]:
winners = get_Nobel_winners(table)
winners

[{'category': 'Physics',
  'link': '/wiki/Wilhelm_R%C3%B6ntgen',
  'name': 'Wilhelm Röntgen',
  'year': 1901},
 {'category': 'Chemistry',
  'link': '/wiki/Jacobus_Henricus_van_%27t_Hoff',
  'name': "Jacobus Henricus van 't Hoff",
  'year': 1901},
 {'category': 'Physiology\nor Medicine',
  'link': '/wiki/Emil_Adolf_von_Behring',
  'name': 'Emil Adolf von Behring',
  'year': 1901},
 {'category': 'Literature',
  'link': '/wiki/Sully_Prudhomme',
  'name': 'Sully Prudhomme',
  'year': 1901},
 {'category': 'Peace',
  'link': '/wiki/Henry_Dunant',
  'name': 'Henry Dunant',
  'year': 1901},
 {'category': 'Peace',
  'link': '/wiki/Fr%C3%A9d%C3%A9ric_Passy',
  'name': 'Frédéric Passy',
  'year': 1901},
 {'category': 'Physics',
  'link': '/wiki/Hendrik_Lorentz',
  'name': 'Hendrik Lorentz',
  'year': 1902},
 {'category': 'Physics',
  'link': '/wiki/Pieter_Zeeman',
  'name': 'Pieter Zeeman',
  'year': 1902},
 {'category': 'Chemistry',
  'link': '/wiki/Hermann_Emil_Fischer',
  'name': 'Hermann Emil

## Retrieve people-specific pages (just once), and cache them

In [99]:
import requests_cache
requests_cache.install_cache('nobel_pages',
                             backend='sqlite', 
                             expire_after=7200)  # 7200s = 2hrs 
# now we just use requests as normal, and the results are cached
# (and, I think, our use of requests.get will use the cached data
# automatically, which is faster for ex when you're working w/ many 
# pages, and so nicer for us too)
# the official docs also have info about request throttling, made easy

In [96]:
# use the person-specific wikipedia page to get their nationality
def get_winner_nationality(w):
    data = get_response_from_url(w['link']).content
    soup = BeautifulSoup(data, 'lxml')
    person_data = {'name': w['name']}
    attr_rows = soup.select('table.infobox tr')
    for tr in attr_rows:
        try:
            attribute = tr.select_one('th').text
            if attribute == 'Nationality':
                person_data[attribute] = tr.select_one('td').text
        except AttributeError:
            pass
        
    return person_data

In [100]:
# how many pages of the first 50 won't work w/ the above attempt?
wdata = []
for w in winners[:50]:
    wdata.append(get_winner_nationality(w))
missing_nationality = []
for w in wdata:
    if not w.get('Nationality'):
        missing_nationality.append(w)
        
missing_nationality

[{'name': 'Élie Ducommun'},
 {'name': 'Charles Albert Gobat'},
 {'name': 'Marie Curie'},
 {'name': 'Niels Ryberg Finsen'},
 {'name': 'Institut de Droit International'},
 {'name': 'Bertha von Suttner'},
 {'name': 'Santiago Ramón y Cajal'},
 {'name': 'Theodore Roosevelt'},
 {'name': 'Ernesto Teodoro Moneta'},
 {'name': 'Louis Renault'},
 {'name': 'Paul Ehrlich'},
 {'name': 'Rudolf Christoph Eucken'},
 {'name': 'Klas Pontus Arnoldson'}]