# Webmining

In [1]:
!pip install requests beautifulsoup4



## Using the IMF API

API docs: https://www.imf.org/external/datamapper/api/help

In [24]:
import requests
import csv
import json
import time
from bs4 import BeautifulSoup

In [3]:
# First step: getting the list of indicators.
# The timeout keeps the cell from waiting forever if the server never answers.
response = requests.get('https://www.imf.org/external/datamapper/api/v1/indicators', timeout=30)
response.status_code

200

In [4]:
# Decode the JSON body of the indicators response into Python objects.
indicators_data = response.json()

In [5]:
# Collect a (key, label) pair for every indicator, ignoring entries whose key is empty.
INDICATORS = [
    (key, metadata['label'])
    for key, metadata in indicators_data['indicators'].items()
    if key
]

len(INDICATORS)

127

In [6]:
# Peek at the first (key, label) pair.
INDICATORS[0]

('NGDP_RPCH', 'Real GDP growth')

In [7]:
BASE_URL = 'https://www.imf.org/external/datamapper/api/v1'

def retrieve_indicator_for_country(indicator, country, timeout=30):
    """Fetch one indicator's values for one country from the IMF DataMapper API.

    Args:
        indicator: Indicator key, e.g. 'NGDP_RPCH'.
        country: Country code as used by the API, e.g. 'ECU'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Whatever the JSON response stores under
        data['values'][indicator][country].

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the response holds no values for this indicator/country.
    """
    url = '{base_url}/{indicator}/{country}'.format(base_url=BASE_URL, indicator=indicator, country=country)
    # Without a timeout, requests would wait indefinitely on a stalled connection.
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON/KeyError below.
    r.raise_for_status()
    data = r.json()
    return data['values'][indicator][country]

In [8]:
# Try the helper on a single indicator/country pair.
retrieve_indicator_for_country('NGDP_RPCH', 'ECU')

{'1980': 4.9,
 '1981': 3.9,
 '1982': 1.2,
 '1983': -2.8,
 '1984': 4.2,
 '1985': 4.4,
 '1986': 3.1,
 '1987': -6,
 '1988': 10.5,
 '1989': 0.3,
 '1990': 3,
 '1991': 5.1,
 '1992': 3.6,
 '1993': 2,
 '1994': 4.3,
 '1995': 2.3,
 '1996': 1.7,
 '1997': 4.3,
 '1998': 3.3,
 '1999': -4.7,
 '2000': 1.1,
 '2001': 4,
 '2002': 4.1,
 '2003': 2.7,
 '2004': 8.2,
 '2005': 5.3,
 '2006': 4.4,
 '2007': 2.2,
 '2008': 6.4,
 '2009': 0.6,
 '2010': 3.5,
 '2011': 7.9,
 '2012': 5.6,
 '2013': 4.9,
 '2014': 3.8,
 '2015': 0.1,
 '2016': -1.2,
 '2017': 2.4,
 '2018': 1.3,
 '2019': 0,
 '2020': -7.8,
 '2021': 4.2,
 '2022': 3,
 '2023': 2.9,
 '2024': 2.8,
 '2025': 2.8,
 '2026': 2.8,
 '2027': 2.8,
 '2028': 2.8}

In [9]:
# Dump the first five indicators for one country into a long-format CSV
# (one row per indicator/year).
COUNTRY = 'ECU'

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' to '\r\n' end up with a blank row after each record.
with open('imf.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['country', 'indicator', 'year', 'value'])
    writer.writeheader()

    # Only the indicator key is written to the file; the label is unused here.
    for indicator_name, indicator_label in INDICATORS[:5]:
        print('Retrieving', indicator_name)

        data = retrieve_indicator_for_country(indicator_name, COUNTRY)

        for year, value in data.items():
            writer.writerow({'year': year, 'value': value, 'country': COUNTRY, 'indicator': indicator_name})

        # Pause two seconds between requests (presumably to go easy on the API).
        time.sleep(2)

Retrieving NGDP_RPCH
Retrieving NGDPD
Retrieving NGDPDPC
Retrieving PPPGDP
Retrieving PPPPC


## Scraping EchoJS

In [11]:
# Fetch the EchoJS front page; the timeout prevents an indefinite wait.
response = requests.get('https://www.echojs.com/', timeout=30)

In [13]:
# Name the parser explicitly: otherwise bs4 picks whichever one is installed
# (and warns), so the parse tree could differ between machines.
soup = BeautifulSoup(response.text, 'html.parser')

In [14]:
# Confirm what kind of object the parse returned.
type(soup)

bs4.BeautifulSoup

In [17]:
# Select every <article> nested under the element with id "newslist".
articles = soup.select('#newslist article')
len(articles)

30

In [18]:
# Inspect the markup of the first article.
articles[0]

<article data-news-id="41680"><a class="uparrow" href="#up">▲</a> <h2><a href="https://github.com/Exact-Realty/routemate" rel="nofollow">Routemate: Simple Multi-Runtime JS Router</a></h2> <address>at github.com</address><a class="downarrow" href="#down">▼</a><p><span class="upvotes">1</span> up and <span class="downvotes">0</span> down, posted by <username><a href="/user/tinnyste">tinnyste</a></username> 8 hours ago <a href="/news/41680">discuss</a></p></article>

In [22]:
# Turn each article element into a {'title', 'url'} record.
scraped_data = []

for article in articles:
    link = article.select_one('h2 > a')

    # select_one returns None when nothing matches; skip such articles
    # instead of crashing on link.get_text().
    if link is None:
        continue

    title = link.get_text().strip()
    url = link.get('href')

    # NOTE: the dict literal below is the same thing as
    #   data = {}
    #   data['title'] = title
    #   data['url'] = url
    data = {'title': title, 'url': url}

    scraped_data.append(data)

len(scraped_data)

30

In [23]:
# Check the first scraped record.
scraped_data[0]

{'title': 'Routemate: Simple Multi-Runtime JS Router',
 'url': 'https://github.com/Exact-Realty/routemate'}

In [26]:
# Serialise the scraped records as indented JSON and save them to scraped.json.
with open('scraped.json', mode='w', encoding='utf-8') as f:
    f.write(json.dumps(scraped_data, indent=2))

## Scraping Angular 2 HN?

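`li.post` seems to be a good selector when inspecting the page in a browser. The cells below do not scrape that HTML, though: they fetch the news list from the `node-hnapi` JSON API instead and save the raw response.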

[https://angular2-hn.firebaseapp.com/news/1](https://angular2-hn.firebaseapp.com/news/1)

In [27]:
# Query the JSON API directly; the timeout prevents an indefinite wait.
response = requests.get('https://node-hnapi.herokuapp.com/news?page=1', timeout=30)

In [29]:
# Save the raw response body to data.json without parsing it.
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(response.text)