# 5 Getting Data off the Web with Python

## Using Python to Consume Data from a Web API

### Using a RESTful Web API with requests

In [3]:
# 5.1
import requests

OECD_ROOT_URL = 'http://stats.oecd.org/sdmx-json/data'

def make_OECD_request(dsname, dimensions, params=None,
                      root_dir=OECD_ROOT_URL):
    """ Make a URL for the OECD API and return a response """
    
    if not params:
        params = {}
    
    dim_args = ['+'.join(d) for d in dimensions]
    dim_str = '.'.join(dim_args)
    
    url = root_dir + '/' + dsname + '/' + dim_str + '/all'
    print('Requestion URL: ' + url)
    return requests.get(url, params=params)

In [4]:
response = make_OECD_request('QNA',
    (('USA', 'AUS'), ('GDP', 'B1_GE'), ('CUR', 'VOBARSA'), ('Q')),
    {'startTime': '2009-Q1', 'endTime': '2009-Q1'})

Requestion URL: http://stats.oecd.org/sdmx-json/data/QNA/USA+AUS.GDP+B1_GE.CUR+VOBARSA.Q/all


In [8]:
if response.status_code == 200:
    json = response.json()
    print(json.keys())

dict_keys(['header', 'dataSets', 'structure'])


### Getting Country Data for the Nobel Dataviz

In [10]:
REST_EU_ROOT_URL = 'http://restcountries.eu/rest/v1'

def REST_country_request(field='all', name=None, params=None):
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    if not params:
        params = {}
    
    if field == 'all':
        return requests.get(REST_EU_ROOT_URL + '/all')
    
    url = f'{REST_EU_ROOT_URL}/{field}/{name}'
    print('Requesting URL: ' + url)
    response = requests.get(url, params=params, headers=headers)
    
    if response.status_code != 200:
        raise Exception('Request failed with status code '
                        + str(response.status_code))
        
    return response

In [11]:
response = REST_country_request('currency', 'usd')
response.json()

Requesting URL: http://restcountries.eu/rest/v1/currency/usd


[{'name': 'American Samoa',
  'topLevelDomain': ['.as'],
  'alpha2Code': 'AS',
  'alpha3Code': 'ASM',
  'callingCodes': ['1684'],
  'capital': 'Pago Pago',
  'altSpellings': ['AS', 'Amerika Sāmoa', 'Amelika Sāmoa', 'Sāmoa Amelika'],
  'region': 'Oceania',
  'subregion': 'Polynesia',
  'population': 55519,
  'latlng': [-14.33333333, -170.0],
  'demonym': 'American Samoan',
  'area': 199.0,
  'gini': None,
  'timezones': ['UTC-11:00'],
  'borders': [],
  'nativeName': 'American Samoa',
  'numericCode': '016',
  'currencies': ['USD'],
  'languages': ['en', 'sm'],
  'translations': {'de': 'Amerikanisch-Samoa',
   'es': 'Samoa Americana',
   'fr': 'Samoa américaines',
   'ja': 'アメリカ領サモア',
   'it': 'Samoa Americane'},
  'relevance': '0.5'},
 {'name': 'Bonaire',
  'topLevelDomain': ['.an', '.nl'],
  'alpha2Code': 'BQ',
  'alpha3Code': 'BES',
  'callingCodes': ['5997'],
  'capital': 'Kralendijk',
  'altSpellings': ['BQ', 'Boneiru'],
  'region': 'Americas',
  'subregion': 'Caribbean',
  'popula

In [13]:
from pymongo import MongoClient

def get_mongo_database(db_name, host='pi.hole',
                       port=27017, username=None, password=None):
    """ Get named database from MongoDB with/out authentication """
    if username and password:
        mongo_url = f'mongodb://{username}:{password}@{host}/{db_name}'
        conn = MongoClient(mongo_url)
    else:
        conn = MongoClient(host, port)
        
    return conn[db_name]

In [14]:
db_nobel = get_mongo_database('nobel_prize')
col = db_nobel['country_data']

In [15]:
response = REST_country_request()
col.insert(response.json())

  


[ObjectId('5d960a199f8adcaf3b5033e6'),
 ObjectId('5d960a199f8adcaf3b5033e7'),
 ObjectId('5d960a199f8adcaf3b5033e8'),
 ObjectId('5d960a199f8adcaf3b5033e9'),
 ObjectId('5d960a199f8adcaf3b5033ea'),
 ObjectId('5d960a199f8adcaf3b5033eb'),
 ObjectId('5d960a199f8adcaf3b5033ec'),
 ObjectId('5d960a199f8adcaf3b5033ed'),
 ObjectId('5d960a199f8adcaf3b5033ee'),
 ObjectId('5d960a199f8adcaf3b5033ef'),
 ObjectId('5d960a199f8adcaf3b5033f0'),
 ObjectId('5d960a199f8adcaf3b5033f1'),
 ObjectId('5d960a199f8adcaf3b5033f2'),
 ObjectId('5d960a199f8adcaf3b5033f3'),
 ObjectId('5d960a199f8adcaf3b5033f4'),
 ObjectId('5d960a199f8adcaf3b5033f5'),
 ObjectId('5d960a199f8adcaf3b5033f6'),
 ObjectId('5d960a199f8adcaf3b5033f7'),
 ObjectId('5d960a199f8adcaf3b5033f8'),
 ObjectId('5d960a199f8adcaf3b5033f9'),
 ObjectId('5d960a199f8adcaf3b5033fa'),
 ObjectId('5d960a199f8adcaf3b5033fb'),
 ObjectId('5d960a199f8adcaf3b5033fc'),
 ObjectId('5d960a199f8adcaf3b5033fd'),
 ObjectId('5d960a199f8adcaf3b5033fe'),
 ObjectId('5d960a199f8adc

In [18]:
res = col.find({'currencies': {'$in': ['USD']}})
list(res)

[{'_id': ObjectId('5d960a199f8adcaf3b5033ea'),
  'name': 'American Samoa',
  'topLevelDomain': ['.as'],
  'alpha2Code': 'AS',
  'alpha3Code': 'ASM',
  'callingCodes': ['1684'],
  'capital': 'Pago Pago',
  'altSpellings': ['AS', 'Amerika Sāmoa', 'Amelika Sāmoa', 'Sāmoa Amelika'],
  'region': 'Oceania',
  'subregion': 'Polynesia',
  'population': 55519,
  'latlng': [-14.33333333, -170.0],
  'demonym': 'American Samoan',
  'area': 199.0,
  'gini': None,
  'timezones': ['UTC-11:00'],
  'borders': [],
  'nativeName': 'American Samoa',
  'numericCode': '016',
  'currencies': ['USD'],
  'languages': ['en', 'sm'],
  'translations': {'de': 'Amerikanisch-Samoa',
   'es': 'Samoa Americana',
   'fr': 'Samoa américaines',
   'ja': 'アメリカ領サモア',
   'it': 'Samoa Americane'},
  'relevance': '0.5'},
 {'_id': ObjectId('5d960a199f8adcaf3b503401'),
  'name': 'Bonaire',
  'topLevelDomain': ['.an', '.nl'],
  'alpha2Code': 'BQ',
  'alpha3Code': 'BES',
  'callingCodes': ['5997'],
  'capital': 'Kralendijk',
  'a

## Using Libraries to Access Web APIs

### Using Google Spreadsheets

In [28]:
!ls

 3.ipynb   5.ipynb  'drive api-c8dae2f28f6a.json'   sandpit
 4.1	   data     'drive api-fd68a7e801d0.json'


In [32]:
import json
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'drive api-fd68a7e801d0.json', scope)

gc = gspread.authorize(credentials)


In [38]:
ss = gc.open('Microbe-scope')

In [43]:
ss.worksheets()

[<Worksheet 'bugs new' id:555436055>,
 <Worksheet "what's R0?" id:133603387>,
 <Worksheet 'dropoff' id:720684346>,
 <Worksheet 'outrageous facts' id:430583748>,
 <Worksheet 'notes' id:1888549677>,
 <Worksheet 'original data' id:1562128395>,
 <Worksheet 'bugs for vz' id:1759019589>,
 <Worksheet 'physicians per 1,000' id:1268911119>,
 <Worksheet 'amends' id:1001992659>,
 <Worksheet 'bugs' id:0>]

In [42]:
ws = ss.worksheet('bugs')
ws.col_values(1)

['',
 '',
 'grey = not plotted',
 'Anthrax (untreated)',
 'Bird Flu (H5N1)',
 'Bubonic Plague (untreated)',
 'C.Difficile',
 'Campylobacter',
 'Chicken Pox',
 'Cholera',
 'Dengue Fever',
 'Diphtheria',
 'E.coli',
 'Ebola',
 'Hand, Foot and Mouth',
 'Hepatitis B',
 'HIV (treated)',
 'HIV (untreated)',
 'Influenza Pandemic 1918',
 'Lyme Disease',
 'Malaria (P. falciparum)',
 'Malaria (P. malariae)',
 'Measles',
 'MERS',
 'MRSA',
 'Mumps',
 'Norovirus',
 'Pertussis (Whooping Cough)',
 'Pneumonic Plague (untreated)',
 'Polio',
 'Rabies (treated)',
 'Rabies (untreated)',
 'Common Cold',
 'Rotavirus',
 'Rubella',
 'Salmonella',
 'SARS',
 'Scarlet Fever',
 'Seasonal Flu',
 'Smallpox',
 'Swine Flu (H1N1)',
 'Syphilis (untreated)',
 'Tuberculosis (untreated)',
 'Typhoid',
 'Hantavirus',
 '',
 'Hepatitis A',
 'Shigellosis',
 'West Nile Virus',
 'Pneumonia',
 'vCJD',
 'Marburg',
 'Yellow Fever',
 'Meningitis',
 'Leprosy',
 'Dracunculiasis',
 'Helminthiases',
 'Echinococcosis',
 'Schistosomiasis',

In [44]:
import pandas as pd

df = pd.DataFrame(ws.get_all_records())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 14 columns):
                                                                                                                                                                                                                                                                                 62 non-null object
https://www.google.co.uk/search?q=%22yellow+fever%22+R0+OR+%22basic+reproductive+rate%22+OR+%22basic+reproduction+number%22&oq=%22yellow+fever%22+R0+OR+%22basic+reproductive+rate%22+OR+%22basic+reproduction+number%22&aqs=chrome..69i57.18806j0j7&sourceid=chrome&ie=UTF-8    62 non-null object
TRANSMISSION                                                                                                                                                                                                                                                                     62 non-null object
SYMPTOMOLOGY         

### Using the Twitter API with Tweepy

In [45]:
consumer_key = 'LeHrRarqGNeAhITkDkLeQNpet'
consumer_secret = 'ItkOl2KNBJp65q7kNwK7wxPfc84K3xXLRQrUimiIlrYQDUBbzk'
access_token = '1171540892114542592-gsNkCMRrKTVXUJmnxRrJgpUfkA3F2N'
access_token_secret = 'FvroUqJxZjG2he385sOfG3UQunPYCknidZ5AXPpx7bojl'

In [49]:
import tweepy

auth = tweepy.OAuthHandler(consumer_key,
                           consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.home_timeline()
for tweet in public_tweets:
    print(tweet.text)

In [50]:
public_tweets

[]

In [52]:
my_follower_ids = api.followers_ids()

for id in my_follower_ids:
    followers = api.followers_ids(id)

In [53]:
my_follower_ids

[]

In [54]:
followers

NameError: name 'followers' is not defined

## Scraping Data

### Getting the Soup

In [60]:
from bs4 import BeautifulSoup
import requests

BASE_URL = 'http://en.wikipedia.org'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_Nobel_soup():
    response = requests.get(
        BASE_URL + '/wiki/List_of_Nobel_laureates',
        headers=HEADERS)
    return BeautifulSoup(response.content, 'lxml')

### Selecting Tags

In [61]:
soup = get_Nobel_soup()

In [62]:
soup.find('table', {'class': 'wikitable sortable'})

<table class="wikitable sortable">
<tbody><tr>
<th>Year
</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
</th>
<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
<p>(The Sveriges Riksbank Prize)<sup class="reference" id="cite_ref-11"><a href="#cite

In [63]:
soup.find('table', {'class': 'sortable wikitable'})

In [64]:
soup.select('table.sortable.wikitable')

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Year
 </th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
 </th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
 </th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
 </th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
 </th>
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
 </th>
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
 <p>(The Sveriges Riksbank Prize)<sup class="reference" id="cite_ref-11

In [65]:
table = soup.select_one('table.sortable.wikitable')

In [66]:
table.select('th')

[<th>Year
 </th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
 </th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
 </th>,
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
 <p>(The Sveriges Riksbank Prize)<sup class="reference" id="cite_ref-11"><a href="#cite_note-11">[11]</a></sup>
 <

In [67]:
table('th')

[<th>Year
 </th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
 </th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
 </th>,
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
 <p>(The Sveriges Riksbank Prize)<sup class="reference" id="cite_ref-11"><a href="#cite_note-11">[11]</a></sup>
 <

### Crafting Selection Patterns

In [68]:
def get_column_titles(table):
    """ Get the Nobel categories from the table header """
    cols = []
    for th in table.select_one('tr').select('th')[1:]:
        link = th.select_one('a')
        if link:
            cols.append({'name': link.text,
                         'href': link.attrs['href']})
        else:
            cols.append({'name': th.text, 'href': None})
    return cols

In [69]:
get_column_titles(table)

[{'name': 'Physics', 'href': '/wiki/List_of_Nobel_laureates_in_Physics'},
 {'name': 'Chemistry', 'href': '/wiki/List_of_Nobel_laureates_in_Chemistry'},
 {'name': 'Physiologyor Medicine',
  'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine'},
 {'name': 'Literature', 'href': '/wiki/List_of_Nobel_laureates_in_Literature'},
 {'name': 'Peace', 'href': '/wiki/List_of_Nobel_Peace_Prize_laureates'},
 {'name': 'Economics', 'href': '/wiki/List_of_Nobel_laureates_in_Economics'}]

In [70]:
def get_Nobel_winners(table):
    cols = get_column_titles(table)
    winners = []
    for row in table.select('tr')[1:-1]:
        year = int(row.select_one('td').text)
        for i, td in enumerate(row.select('td')[1:]):
            for winner in td.select('a'):
                href = winner.attrs['href']
                if not href.startswith('#endnote'):
                    winners.append({
                        'year': year,
                        'category': cols[i]['name'],
                        'name': winner.text,
                        'link': winner.attrs['href']
                    })
        return winners

In [77]:
winners = get_Nobel_winners(table)

### Caching the Web Pages

In [74]:
import requests
import requests_cache

requests_cache.install_cache()

### Scraping the Winners' Nationalities

In [79]:
# 5.3

def get_winner_nationality(w):
    """ scrape biographic data from the winner's wikipedia page """
    data = requests.get('http://en.wikipedia.org' + w['link']).text
    soup = BeautifulSoup(data)
    person_data = {'name': w['name']}
    attr_rows = soup.select('table.infobox tr')
    for tr in attr_rows:
        try:
            attribute = tr.select_one('th').text
            if attribute == 'Nationality':
                person_data[attribute] = tr.select_one('td').text
        except AttributeError:
            pass
        
    return person_data

In [80]:
# 5.4

wdata = []
for w in winners[:50]:
    wdata.append(get_winner_nationality(w))
missing_nationality = []
for w in wdata:
    if not w.get('Nationality'):
        missing_nationality.append(w)
missing_nationality

[]

[{'name': 'Wilhelm Röntgen', 'Nationality': 'German[1]'},
 {'name': "Jacobus Henricus van 't Hoff", 'Nationality': 'Dutch'},
 {'name': 'Emil Adolf von Behring', 'Nationality': 'German'},
 {'name': 'Sully Prudhomme', 'Nationality': 'French'},
 {'name': 'Henry Dunant', 'Nationality': 'Swiss'},
 {'name': 'Frédéric Passy', 'Nationality': 'French'}]