# Academic abstract collection

In [1]:
# import sys
# !{sys.executable} -m pip install Scrapy
# !{sys.executable} -m pip install wordcloud
from bs4 import BeautifulSoup
import requests
from bs4 import NavigableString
import queue
from concurrent.futures import ThreadPoolExecutor
import itertools
import threading
import csv
import pandas as pd 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import numpy as np
import re

### Function (1)
**Return a list with a link to each paper**
* NB: This is for a single page

In [2]:
def get_links(soup):
    """Collect the absolute URL of every paper listed on one parsed page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single listing page.

    Returns
    -------
    list of str
        One URL per ``<li class="list-group-item downgate">`` element, built
        by prefixing the item's first link target with
        "https://ideas.repec.org". Items that carry no anchor, or whose
        anchor has no ``href``, are skipped instead of raising.
    """
    papers = []
    for paper in soup.find_all("li", class_="list-group-item downgate"):
        anchor = paper.a
        # `paper.a` is None when the list item contains no <a> tag, and
        # `.get('href')` is None when the tag has no target; neither can be
        # turned into a usable link.
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        papers.append("https://ideas.repec.org" + href)

    return papers

### Function (2)
**Download and parse the given listing URL, then call Function (1) to return the list of links**

In [3]:
def parse_links(START_URL, timeout=30):
    """Download one listing page and return the paper links found on it.

    Parameters
    ----------
    START_URL : str
        Address of the listing page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    list of str
        Whatever ``get_links`` extracts from the parsed page.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    """
    page = requests.get(START_URL, timeout=timeout)
    # An error page would otherwise be parsed as if it were a listing and
    # silently contribute zero links.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    return get_links(soup)

In [4]:
# Journal of Macroeconomics: gather the paper links from each listing page in order.
links = []
for listing_url in (
    "https://ideas.repec.org/s/eee/jmacro.html",
    "https://ideas.repec.org/s/eee/jmacro2.html",
    "https://ideas.repec.org/s/eee/jmacro3.html",
    "https://ideas.repec.org/s/eee/jmacro4.html",
    "https://ideas.repec.org/s/eee/jmacro5.html",
    "https://ideas.repec.org/s/eee/jmacro6.html",
    "https://ideas.repec.org/s/eee/jmacro7.html",
):
    links.extend(parse_links(listing_url))

In [5]:
# Number of links collected from the Journal of Macroeconomics listing pages.
# Recorded now so that links appended to `links` afterwards can be counted
# separately by subtraction.
JME = len(links)

In [6]:
# The B.E. Journal of Macroeconomics: append the links from each listing page in order.
for listing_url in (
    "https://ideas.repec.org/s/bpj/bejmac.html",
    "https://ideas.repec.org/s/bpj/bejmac2.html",
    "https://ideas.repec.org/s/bpj/bejmac3.html",
):
    links.extend(parse_links(listing_url))

In [7]:
# Links added since the JME count was recorded are attributed to BEJME.
total_links = len(links)
BEJME = total_links - JME

# The first print ends with two newlines to leave a blank line before the total.
print("BEJME:", BEJME, "JME:", JME, end="\n\n")
print("Total:", total_links)

BEJME: 581 JME: 1400

Total: 1981


In [8]:
def get_abstracts(URL, timeout=30):
    """Scrape the abstract and citation details from one paper page.

    Parameters
    ----------
    URL : str
        Address of the paper page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot block a worker thread forever.

    Returns
    -------
    list of dict
        A one-element list whose dict has the keys 'Author/s', 'Year',
        'Title', 'ID' and 'Abstract', or an empty list when the page does
        not contain the expected abstract or citation markup.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or timeouts; HTTP status is not checked.
    """
    page = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    # A page without an abstract block is skipped rather than allowed to
    # raise AttributeError and abort the whole batch.
    abstract_div = soup.find('div', id='abstract-body')
    if abstract_div is None:
        return []
    abstract = abstract_div.get_text()

    citation = soup.find("li", class_="list-group-item downgate")
    if citation is None:
        return []

    # Keep only plain text nodes (exact type match, so subclasses of
    # NavigableString are excluded), stripped of surrounding whitespace.
    info = [desc.strip() for desc in citation.descendants
            if type(desc) is NavigableString]

    try:
        # NOTE(review): assumes the first text node ends with "<year> <token>"
        # so the year is the second-to-last word, and that the fourth text
        # node is the identifier -- confirm against a live page.
        record = {'Author/s': info[0], 'Year': info[0].split()[-2],
                  'Title': info[1], 'ID': info[3], 'Abstract': abstract}
    except IndexError:
        # Fewer text nodes (or words) than expected: treat as unparseable.
        return []

    return [record]

In [9]:
# Sanity check: how many paper URLs are about to be scraped.
len(links)

1981

In [12]:
# Download and parse every paper page on a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as pool:
    abstracts = pool.map(get_abstracts, links)

# Drain the lazy map iterator into a concrete list of per-paper results.
aca_abstracts = [result for result in abstracts]

In [13]:
# Every get_abstracts() result is a list holding zero or one record, so flatten
# first. Reading the header from aca_abstracts[0][0] raised IndexError whenever
# the first paper happened to yield no record.
rows = list(itertools.chain.from_iterable(aca_abstracts))
if not rows:
    # Checked before opening the file so an existing CSV is not truncated.
    raise ValueError("No abstracts were collected; macro_data.csv was not written")

# Explicit UTF-8 so non-ASCII characters in titles and abstracts are written
# the same way regardless of the platform's default encoding.
with open('macro_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    writer.writerows(rows)

In [None]:
# links.extend(parse_links("https://ideas.repec.org/s/ucp/jlabec.html")) # Journal of Labor Economics
# links.extend(parse_links("https://ideas.repec.org/s/ucp/jlabec2.html"))
# links.extend(parse_links("https://ideas.repec.org/s/ucp/jlabec3.html"))

In [None]:
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec.html")) # Journal of Financial Economics
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec1.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec2.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec3.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec4.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec5.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec6.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec7.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec8.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec9.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/jfinec10.html"))

In [None]:
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco.html")) # Journal of Monetary Economics
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco2.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco3.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco4.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco5.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco6.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco7.html"))
# links.extend(parse_links("https://ideas.repec.org/s/eee/moneco8.html"))