# Scraping abstracts for central bank research

In [1]:
# import sys
# !{sys.executable} -m pip install Scrapy
# !{sys.executable} -m pip install wordcloud
from bs4 import BeautifulSoup
import requests
from bs4 import NavigableString
import queue
from concurrent.futures import ThreadPoolExecutor
import itertools
import threading
import csv
import pandas as pd 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import numpy as np
import re

### Function (1)
**Return a list with a link to each paper**
* NB: This is for a single page

In [2]:
def get_links(soup):
    """Return the absolute URL of every paper linked from one listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single listing page.

    Returns
    -------
    list of str
        ``"https://ideas.repec.org"`` followed by the ``href`` of each ``<a>``
        tag whose ``href`` contains ``/p/``, in the order the tags are found.
    """
    # `find_all` is the supported spelling; `findAll` is a deprecated alias.
    paper_anchors = soup.find_all('a', attrs={'href': re.compile(r"/p/")})

    return ["https://ideas.repec.org" + anchor.get('href') for anchor in paper_anchors]

### Function (2)
**Soup the necessary URL and call Function (1) to return the list of links**

In [3]:
def parse_links(START_URL):
    """Download one listing page and return the paper links found on it.

    Parameters
    ----------
    START_URL : str
        URL of the listing page to fetch.

    Returns
    -------
    list of str
        Whatever ``get_links`` extracts from the parsed page.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded, including when the timeout expires.
    """
    # Without a timeout `requests.get` may block indefinitely on a stalled
    # connection; 30 seconds bounds the wait.
    page = requests.get(START_URL, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    return get_links(soup)

### Function (3)  
**For a given central bank, go through each page and collect the links by calling Function (2) for each page**

1. use Function (2) to get all the links from the first page for the first central bank
2. access that URL's .html **again**
3. get the links that direct to the next pages i.e. [1],[2]... for that central bank
4. if the last item in that list is not empty get the link to the next page
5. return to 1. above using the link saved in 4.
6. if there are no more pages, quit the loop

In [4]:
def parse_all_links(START_URL):
    """Collect the paper links from every page of one series listing.

    Starting at ``START_URL``, each page is downloaded once. Its paper links
    are gathered with ``get_links`` and the ``href`` inside the last
    ``<li class="page-item">`` element is followed to reach the next page.
    The walk stops when that element is missing, has no ``<a>``, has no
    ``href``, or points at a page that was already visited.

    Parameters
    ----------
    START_URL : str
        URL of the first listing page of the series.

    Returns
    -------
    list of str
        All collected links in page order, with the first two entries removed.

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be downloaded, including when the timeout expires.
    """
    items_list = []
    current_link = START_URL   # Setting up the starting link
    visited = set()            # Guards against pagination that loops back on itself

    while current_link not in visited:
        visited.add(current_link)

        # Fetch and parse the page once: the same tree provides both the
        # paper links and the pagination controls.
        response = requests.get(current_link, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        items_list.extend(get_links(soup))

        pagination = soup.find_all('li', class_="page-item")
        try:
            next_href = pagination[-1].find('a')['href']
        except (IndexError, TypeError, KeyError):
            # IndexError: no pagination items at all.
            # TypeError:  the last item contains no <a> (find returned None).
            # KeyError:   the <a> carries no href.
            break

        # The href is appended to the directory part of the current URL.
        current_link = current_link.rsplit('/', 1)[0] + '/' + next_href

    # NOTE(review): the first two collected links are discarded, as before;
    # presumably they are not paper pages -- confirm against the site.
    # Slice deletion does not fail when fewer than two links were found.
    del items_list[:2]
    return items_list

#### Call function (3)

In [5]:
# Walk every central-bank series in turn and pool the paper links into one list.
links = []
for series_url in (
    "https://ideas.repec.org/s/rba/rbardp.html",   # RBA
    "https://ideas.repec.org/s/fip/fedgfe.html",   # Fed
    "https://ideas.repec.org/s/ecb/ecbwps.html",   # ECB
    "https://ideas.repec.org/s/boj/bojwps.html",   # BoJ
    "https://ideas.repec.org/s/bca/bocawp.html",   # Bank of Canada
    "https://ideas.repec.org/s/nzb/nzbdps.html",   # RBNZ
    "https://ideas.repec.org/s/boe/boeewp.html",   # BoE
    "https://ideas.repec.org/s/hhs/rbnkwp.html",   # Riksbank
    "https://ideas.repec.org/s/bno/worpap.html",   # Norges Bank
):
    links.extend(parse_all_links(series_url))

### Function (4) 
**For a given abstract page, pull the text of that abstract**

NB: We need the `try`/`except` fallback:
* before 1996 for the Fed (here: https://ideas.repec.org/s/fip/fedgfe9.html) the results can no longer be interpreted by the program

In [6]:
def get_abstracts(URL):
    """Download one paper page and extract its abstract and citation details.

    Parameters
    ----------
    URL : str
        URL of the paper's abstract page.

    Returns
    -------
    list of dict
        A one-element list holding a dict with the keys ``'Author/s'``,
        ``'Year'``, ``'Title'``, ``'ID'`` and ``'Abstract'``, or an empty list
        when the page does not contain the expected elements.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded, including when the timeout expires.
    """
    # Without a timeout a stalled connection would block its worker forever.
    page = requests.get(URL, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    abstract_div = soup.find('div', id='abstract-body')
    citation_item = soup.find("li", class_="list-group-item downfree")
    if abstract_div is None or citation_item is None:
        # Page layout not recognised: skip it rather than abort the whole run.
        return []

    abstract = abstract_div.get_text()

    # Exact type check on purpose: subclasses of NavigableString (such as
    # Comment) must not be counted, or the positions used below would shift.
    info = [desc.strip() for desc in citation_item.descendants
            if type(desc) is NavigableString]

    try:
        record = {
            'Author/s': info[0],
            # Second-to-last whitespace-separated token of the first fragment.
            'Year': info[0].split()[-2],
            'Title': info[1],
            'ID': info[4],
            'Abstract': abstract,
        }
    except IndexError:
        # Fewer text fragments than expected: treat the page as unparseable.
        return []

    return [record]

#### Call function (4) using concurrency via threading!
* Give each thread a different task (URL)
* In this case, we will handle all the tasks at once
* This is great for the purpose at hand, as I have a list of pre-defined URLs to process

#### NB: Change the number of threads if the IDEAS page throws a request error

In [None]:
# Scrape every paper page on 72 worker threads; `map` yields the results in
# the same order as `links`, and they are collected into a list right away.
with ThreadPoolExecutor(max_workers=72) as pool:
    abstracts = list(pool.map(get_abstracts, links))

In [None]:
# Each entry of `abstracts` is a (possibly empty) list of records; flatten
# them so that empty results are simply skipped.
rows = [row for result in abstracts for row in result]

# Take the column names from the first record that exists. Indexing
# abstracts[0][0] directly fails when the first URL produced no record.
# The fallback lists the keys produced by get_abstracts.
fieldnames = list(rows[0].keys()) if rows else ['Author/s', 'Year', 'Title', 'ID', 'Abstract']

# Explicit UTF-8 so non-ASCII characters in the scraped text do not depend on
# the platform's default encoding.
with open('_data_.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)