# Acquire data & data structures

In [None]:
import requests
import pandas as pd

## Internet

#### Check the response status code
 - status code 200: the request response cycle was successful
 - any other status code: it didn't work (e.g., 404 = page not found)
 - Decode the raw response content (bytes) into a string, using utf-8 unless another encoding is required

In [None]:
def connect(url, decode='utf-8'):
    """Request ``url`` and return its body decoded to a string.

    A status line is printed either way: a success message including the
    status code when the code is 200, otherwise 'connection failed'.
    The body is decoded and returned regardless of the status code.

    Args:
        url: Address passed to ``requests.get``.
        decode: Name of the encoding used to decode the raw body bytes.

    Returns:
        The response body as a ``str``.
    """
    reply = requests.get(url)
    status = reply.status_code
    message = (
        'successfully connected, response code: {}'.format(status)
        if status == 200
        else 'connection failed'
    )
    print(message)
    raw_body = reply.content
    return raw_body.decode(decode)

In [None]:
# Fetch a sample page; connect() prints the status and returns the decoded body.
url = 'http://www.lauthom.nl/search/tools'
content = connect(url)
# Display only the first 500 characters of the body.
content[:500]

### JSON

In [None]:
import json

### json.loads recursively decodes a string in JSON format into equivalent python objects
 - data_string's outermost element is converted into a python list
 - the first element of that list is converted into a dictionary
 - the keys of that dictionary are converted into strings
 - the values of that dictionary are converted into a list of two integers ("b"), a float ("c") and a string ("a")

In [None]:
# A JSON document held in a Python string: a one-element array wrapping an object.
data_string = '[{"b": [2, 4], "c": 3.0, "a": "A"}]'

# Decode the JSON text into nested Python objects.
python_data = json.loads(data_string)

# Print, one per line: both types, the decoded list, its first element,
# that element's "b" value, and the second item of "b".
print(
    '{}\n{}\n{}\n{}\n{}\n{}'.format(
        type(data_string),
        type(python_data),
        python_data,
        python_data[0],
        python_data[0]['b'],
        python_data[0]['b'][1],
    )
)

### json.dumps and json.loads

In [None]:
# A plain Python string; its text is not itself a valid JSON document.
JSON_string = "JSON throws exception when not in correct format"
print(JSON_string)

# Serialize the Python string to JSON text (json.dumps wraps it in double quotes).
JSON_stringified = json.dumps(JSON_string)
print(JSON_stringified)

# Decoding the serialized form works and gives back the original string.
json.loads(JSON_stringified)

# Left commented out on purpose: decoding the raw, unserialized string
# raises JSONDecodeError.
# json.loads(JSON_string)



### requests & JSON

In [None]:
# Build a geocoding request URL for the address and parse the reply as JSON.
# NOTE(review): the address is interpolated into the query string without
# explicit URL-encoding, and no API key is sent — confirm the endpoint accepts this.
address = 'Amsterdam, Netherlands'
url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
response = requests.get(url).json()
# Display the type of the parsed reply alongside the reply itself.
type(response), response

### Get JSON formatted content

In [None]:
def get_json(url, decode='utf-8'):
    """Fetch ``url`` and return its body parsed as JSON.

    Every failure path prints a short diagnostic and returns ``None``
    instead of falling through to an unbound local variable.

    Args:
        url: Address passed to ``requests.get``.
        decode: Unused; kept so existing callers that pass it keep working.

    Returns:
        The decoded JSON payload, or ``None`` when the request raised,
        the status code was not 200, or the body was not valid JSON.
    """
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts, malformed URLs, etc.
        print('something went wrong with requests.get')
        return None

    if response.status_code != 200:
        print('HTTP error, response code: {}'.format(response.status_code))
        return None

    try:
        return response.json()
    except ValueError:
        # The JSON decode errors raised by Response.json() derive from ValueError.
        print("response not in valid JSON format")
        return None

In [None]:
# Fetch the current `url` through get_json and display the parsed payload.
response_data = get_json(url)
response_data

### Get address, latitude, longitude

In [None]:
def get_lat_lng(url):
    """Return address and coordinates of the first geocoding result.

    Args:
        url: Geocoding request URL handed to ``get_json``.

    Returns:
        A ``(formatted_address, lat, lng)`` tuple read from the first entry
        of the payload's ``'results'`` list.
    """
    top_hit = get_json(url)['results'][0]
    address = top_hit['formatted_address']
    location = top_hit['geometry']['location']
    return address, location['lat'], location['lng']

In [None]:
# Display (formatted_address, lat, lng) for the current `url`.
get_lat_lng(url)

In [None]:
# Repeat the lookup for a different address; `address` and `url` are rebound.
address = 'London Business School'
url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
get_lat_lng(url)

### Get list of addresses with lat, lng

In [None]:
def get_lat_lng_list(url):
    """Return address and coordinates for every geocoding result.

    Args:
        url: Geocoding request URL handed to ``get_json``.

    Returns:
        A list of ``(formatted_address, lat, lng)`` tuples, one per entry
        in the payload's ``'results'`` list, in the same order.
    """
    payload = get_json(url)
    return [
        (
            hit['formatted_address'],
            hit['geometry']['location']['lat'],
            hit['geometry']['location']['lng'],
        )
        for hit in payload['results']
    ]

In [None]:
# Look up an address and list every result returned for it.
address = 'Baker Street'
url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
get_lat_lng_list(url)

## XML
 - library lxml - deals with converting an XML-string to python objects and vice versa

In [None]:
from lxml import etree

In [None]:
# Sample XML document: a Bookstore root holding two Book elements.
# Only the first Book carries a Weight attribute; the second has a Remark
# element and two Author entries.
data_string = """
<Bookstore>
   <Book ISBN="ISBN-13:978-1599620787" Price="15.23" Weight="1.5">
      <Title>New York Deco</Title>
      <Authors>
         <Author Residence="New York City">
            <First_Name>Richard</First_Name>
            <Last_Name>Berenholtz</Last_Name>
         </Author>
      </Authors>
   </Book>
   <Book ISBN="ISBN-13:978-1579128562" Price="15.80">
      <Remark>
      Five Hundred Buildings of New York and over one million other books are available for Amazon Kindle.
      </Remark>
      <Title>Five Hundred Buildings of New York</Title>
      <Authors>
         <Author Residence="Beijing">
            <First_Name>Bill</First_Name>
            <Last_Name>Harris</Last_Name>
         </Author>
         <Author Residence="New York City">
            <First_Name>Jorg</First_Name>
            <Last_Name>Brockmann</Last_Name>
         </Author>
      </Authors>
   </Book>
</Bookstore>
"""

In [None]:
# Parse the XML string; `root` is the outermost element of the document.
root = etree.XML(data_string)
# Display the root's tag name together with the tag's Python type.
root.tag, type(root.tag)

In [None]:
# Serialize the tree with indentation; the result is decoded from utf-8 bytes before printing.
print(etree.tostring(root, pretty_print=True).decode("utf-8"))

#### Iterating over complete XML tree

In [None]:
# Walk the whole tree (root and all of its descendants) and print each element object.
for element in root.iter():
    print(element)

#### Iterate over children in subtree, accessing tags

In [None]:
# Iterating an element directly yields only its immediate children.
for child in root:
    print(child, child.tag)

#### Iterate to get specific tags and data
1. Author tags are accessed with root.iter('Author')
2. For each Author tag, the .find function accesses the First_Name and Last_Name tags
3. Given a bare tag name, the .find function only looks at the direct children, not other descendants, so be careful!
4. The .text attribute holds the text of a leaf node

In [None]:
# Visit every Author element anywhere in the tree and print its name parts.
# .find returns None when no matching child exists, so a missing
# First_Name or Last_Name would make the .text access fail.
for element in root.iter('Author'):
    print(element.find('First_Name').text, element.find('Last_Name').text)

#### Filter values of attributes
e.g. find the first name of the author of a book that weighs 1.5 oz

In [None]:
# Path expression: the Book child whose Weight attribute equals "1.5", then
# down through Authors/Author to First_Name; display that element's text.
root.find('Book[@Weight="1.5"]/Authors/Author/First_Name').text

## Exchange rates from XE.com

In [None]:
# Converter page URL; the query string asks for Amount=1, From=USD, To=EUR.
url = 'https://www.xe.com/currencyconverter/convert/?Amount=1&From=USD&To=EUR'

### BeautifulSoup

In [None]:
from bs4 import BeautifulSoup

In [None]:
def result_page(url, keywords=''):
    """Download ``url + keywords`` and parse the body as HTML.

    Args:
        url: Base address of the page.
        keywords: Text appended verbatim to ``url`` before the request.

    Returns:
        A ``BeautifulSoup`` tree built with the 'lxml' parser, or ``None``
        when the response status code is not 200.
    """
    reply = requests.get(url + keywords)
    return BeautifulSoup(reply.content, 'lxml') if reply.status_code == 200 else None

In [None]:
def get_data(url, keywords='', selector=''):
    """Scrape (currency, rate) pairs from the rate cells of a page.

    Every ``<td class="rateCell">`` is inspected. A cell contributes a pair
    only when it contains an ``<a>`` whose ``rel`` attribute has a first
    entry; the currency is the first 7 characters of that entry and the
    rate is the cell's text. Cells without such a link are skipped.

    Args:
        url: Base address of the page.
        keywords: Text appended to ``url`` by ``result_page``.
        selector: Unused; kept so existing callers that pass it keep working.

    Returns:
        A list of ``(currency, rate_text)`` tuples, or ``None`` when the
        page could not be fetched or parsing failed.
    """
    try:
        results_page = result_page(url, keywords)
        if results_page is None:
            # Non-200 response: report failure explicitly rather than via
            # an AttributeError on None.
            return None

        rate_list = []
        for rate in results_page.find_all('td', class_='rateCell'):
            rate_ = rate.get_text()
            try:
                currency = rate.find('a').get('rel')[0][:7]
            except (AttributeError, TypeError, IndexError):
                # No <a>, no rel attribute, or an empty rel list: skip the cell.
                continue
            rate_list.append((currency, rate_))
        return rate_list
    except Exception:
        # Best-effort scraper: any fetch/parse error yields None, but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None

In [None]:
# Tabulate the scraped (currency, rate) tuples; get_data returns None on failure.
pd.DataFrame(get_data(url))