In [1]:
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
import re
import time

#the start of the scraper
start = time.time()

#seconds to wait for a response; without a timeout requests.get() can block
#forever on a stalled connection
REQUEST_TIMEOUT = 30

volume_issue_links = []
for page_number in range(1,2):

    #fetch one page of the issue archive and collect the link of every
    #anchor whose text mentions a volume ("Vol")
    html = requests.get('http://www.irrodl.org/index.php/irrodl/issue/archive?issuesPage={}#issues'.format(page_number),
                        timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(html.content,"lxml")
    #href=True skips anchors that have no href attribute (indexing them would
    #raise KeyError); a distinct name avoids shadowing the loop variable
    volume_issue_links += [anchor['href'] for anchor in soup('a', href=True) if 'Vol' in anchor.text]

    #we are only allowed to slow crawl IRRODL
    #pausing for 1 second between requests should be sufficient
    time.sleep(1)

article_links = []
#only the first issue is crawled; slicing instead of indexing [0] means an
#empty archive page results in no work rather than an IndexError
for issue in volume_issue_links[:1]:

    #collect the links to the HTML version of every article in this issue
    html = requests.get(issue, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(html.content,"lxml")
    article_links += [anchor['href'] for anchor in soup('a', href=True) if 'HTML' in anchor.text]

    #pause for 1 second in between requests to be nice to IRRODL's servers
    time.sleep(1)

In [8]:
#bs4 is already a dependency of this notebook; these node classes let the
#visibility filter recognise comments and doctypes by their type
from bs4.element import Comment, Doctype

#seconds to wait for a response; without a timeout requests.get() can block
#forever on a stalled connection
REQUEST_TIMEOUT = 30


def _meta_content(soup, name):
    """Return the ``content`` attribute of the first ``<meta name=...>`` tag.

    Returns '' when the tag, or its ``content`` attribute, is missing.
    """
    tag = soup.find("meta", {"name":name})
    if tag is None:
        return ''
    return tag.get('content', '')


def _meta_content_list(soup, name):
    """Return the ``content`` attribute of every ``<meta name=...>`` tag.

    The result is a list (empty when no tag matches). As in the original
    code, '' is returned if one of the matching tags has no ``content``.
    """
    try:
        return [tag['content'] for tag in soup.find_all("meta", {"name":name})]
    except KeyError:
        return ''


def _value_after_label(content, default=''):
    """Return everything after the first ': ' in `content`, else `default`.

    Presumably the site prefixes these fields with a label such as
    "Abstract: " -- TODO confirm. Splitting only once keeps values that
    themselves contain ': ' intact (the previous ``split(': ')[1]`` cut
    them off at the second ': ').
    """
    label, separator, value = content.partition(': ')
    return value if separator else default


def _collapse_spaces(text):
    """Replace every run of two or more spaces with a single space."""
    return re.sub(' {2,}', ' ', text)


def visible(element):
    """Return True if a text node of the parsed page should count as article text."""
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    #bs4 strips the <!-- --> markers from comments, so the regex below never
    #matches them; comments and embedded doctypes are detected by type instead
    if isinstance(element, (Comment, Doctype)):
        return False
    #kept so that text which literally looks like a comment is still dropped
    if re.match('<!--.*-->', str(unidecode(element))):
        return False
    if element == '\n' or element == u'\xa0' or element == ' ':
        return False
    return True


IRRODL = []

for link in article_links[0:5]:

    html = requests.get(link, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(html.content,"lxml")

    #get all of these things out of the webpage soup
    article_title = _meta_content(soup, "description")

    #keywords fall back to '' (not a list) when the field is missing, as before
    keywords_text = _value_after_label(_meta_content(soup, "keywords"), default=None)
    keywords = keywords_text.split('; ') if keywords_text is not None else ''

    authors_list = _meta_content_list(soup, "DC.Creator.PersonalName")
    affiliation_list = _meta_content_list(soup, "citation_author_institution")
    #list() so the pairs are stored as a real list on Python 3 as well
    #(zip already returns a list on Python 2)
    author_affiliation = list(zip(authors_list, affiliation_list))

    #remove unnecessary spaces in the abstract
    abstract = _value_after_label(_meta_content(soup, "DC.Description"))
    abstract = unidecode(_collapse_spaces(abstract))

    link_to_article = _meta_content(soup, "DC.Identifier.URI")
    article_type = _meta_content(soup, "DC.Type.articleType")

    #join every visible text node of the page into one string
    texts = soup.find_all(text=True)
    visible_texts = ' '.join(filter(visible, texts))

    article_text = unidecode(visible_texts).replace('\n','').replace('\t','').replace('\r','')
    article_text = _collapse_spaces(article_text)

    IRRODL.append({
            'article_title':article_title,
            'keywords':keywords,
            'authors_list':authors_list,
            'affiliation_list':affiliation_list,
            'author_affiliation':author_affiliation,
            'abstract':abstract,
            'link_to_article':link_to_article,
            'article_type':article_type,
            'article_text':article_text,
            'soup':soup
        })

    #wait for a second between requests to be nice to IRRODL's server
    time.sleep(1)

#the end of the scraper
end = time.time()

In [250]:
#time it took to scrape all of IRRODL
#print (end - start)/60.
#48.08125 minutes

In [243]:
# #code to convert IRRODL list of dictionaries into pandas dataframe
# import pandas as pd
# df = pd.DataFrame(IRRODL)

In [247]:
# export IRRODL to csv
# df.to_csv('C:\Users\\bodil\Projects\IRRODL Scraper\IRRODL_articles2.csv',index=False,encoding='utf-8')

In [18]:
import pandas as pd
#one row per scraped article; the keys of each record become the columns
df = pd.DataFrame(data=IRRODL)

In [1]:
import pandas as pd
from unidecode import unidecode
import re

#load the article table from disk (presumably an earlier export of the scrape -- TODO confirm)
df = pd.read_csv(filepath_or_buffer='IRRODL_articles.csv', encoding='utf-8')

In [5]:
#serialise the whole frame to a JSON string (no target path, so the text is returned)
json_string = df.to_json(path_or_buf=None)

In [1]:
import json
#decode the JSON text back into nested Python dictionaries
decoded_frame = json.loads(json_string)
json_object = decoded_frame

In [2]:
#keep only the rows typed as research articles, renumbered from 0, as an independent copy
is_research_article = df['article_type'] == 'Research Articles'
researchdf = df[is_research_article].reset_index(drop=True).copy()

In [3]:
#local import, in keeping with this notebook's per-cell imports: used to
#safely evaluate the list repr stored in the CSV
import ast


def _parse_authors(cell):
    """Turn one ``authors_list`` CSV cell into a list of author names.

    The cell is expected to hold the repr of a Python list (for example
    ``[u'First Author', u'Second Author']``). Evaluating it as a literal
    yields the real names: no leftover ``u'...'`` quoting, names containing
    ', ' stay whole, and ``[]`` gives no authors instead of one empty name.
    Missing cells give an empty list. Text that is not a valid list literal
    falls back to the original regex split.
    """
    if pd.isnull(cell):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    #raw string: "\[" in a normal string literal is an invalid escape sequence
    return re.split(r"\[|\]|, ",cell)[1:-1]


researchdf['authors_list2'] = researchdf['authors_list'].map(_parse_authors)
#transliterate to ASCII and drop periods so spelling variants of a name match
researchdf['authors_list2'] = researchdf['authors_list2'].map(lambda names: [unidecode(name).replace('.','') for name in names])

#flatten the per-article lists into one list of author names
author_list = []
for authors in researchdf.authors_list2:
    author_list.extend(authors)

In [4]:
#how often each author name occurs, most frequent first
author_names = pd.Series(author_list)
author_counts = author_names.value_counts()

In [155]:
#local import, in keeping with this notebook's per-cell imports: used to
#safely evaluate the list-of-tuples repr stored in the CSV
import ast


def affiliation_fun(cell):
    """Return the cleaned affiliation of every author/affiliation entry in `cell`.

    Each entry is either an ``(author, affiliation)`` pair or the legacy
    ``"author, affiliation"`` string. Affiliations are transliterated to
    ASCII and stripped of periods. A legacy string without a comma is kept
    whole (previously its first character was silently dropped).
    """
    affil_list = []
    for entry in cell:
        if isinstance(entry, (tuple, list)):
            affiliation = unidecode(entry[1]).replace('.','')
        else:
            text = unidecode(entry).replace('.','')
            comma = text.find(',')
            #skip the comma and the space that follows it
            affiliation = text if comma == -1 else text[comma+2:]
        affil_list.append(affiliation)
    #data-quality check kept from the original: show rows where an author
    #name ended up in the affiliation slot
    for y in affil_list:
        if y == 'David Wiley':
            #print(...) with one argument behaves the same on Python 2 and 3
            print(affil_list)
    return affil_list


def _parse_author_affiliation(cell):
    """Turn one ``author_affiliation`` CSV cell into a list of entries.

    The cell is expected to hold the repr of a list of
    ``(author, affiliation)`` tuples; evaluating it as a literal keeps names
    that contain commas intact and removes the ``u'...'`` quoting. Missing
    cells give an empty list. Text that is not a valid list literal falls
    back to the original regex split, which yields
    ``"author, affiliation"`` strings.
    """
    if pd.isnull(cell):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    #raw string: "\)" in a normal string literal is an invalid escape sequence
    return re.split(r"\), \(|\[\(|\)\]",cell)[1:-1]


researchdf['affiliation_list2'] = researchdf['author_affiliation'].map(_parse_author_affiliation)
researchdf['affiliation_list2'] = researchdf['affiliation_list2'].map(affiliation_fun)

#flatten the per-article lists into one list of affiliations
affiliation_list = []
for affiliations in researchdf.affiliation_list2:
    affiliation_list.extend(affiliations)

In [2]:
#how often each affiliation occurs, most frequent first
affiliation_names = pd.Series(affiliation_list)
university_counts = affiliation_names.value_counts()

In [220]:
#lowercase all of the words in the title and aggregate all words into the same list
#one flat list holding the space-separated tokens of every title, lowercased
title_word_list = [
    word.lower()
    for title in researchdf.article_title
    for word in title.split(' ')
]

In [234]:
#import stopwords to remove them from the title word counts
from nltk.corpus import stopwords
#a set gives fast membership tests when filtering the title words
english_stopwords = stopwords.words('english')
stop = set(english_stopwords)

In [3]:
#top title words in IRRODL all-time after removing stop words
#count the title words that are not stop words, most frequent first
title_words = pd.Series(title_word_list)
is_stopword = title_words.isin(stop)
top_title_words = title_words[~is_stopword].value_counts()

In [None]:
#code to look at a specific article and try out some new parsing strategies

In [179]:
import requests
from bs4 import BeautifulSoup
#download one specific article page and parse it with the lxml backend
article_url = 'http://www.irrodl.org/index.php/irrodl/article/view/5/338'
html = requests.get(article_url)
soup = BeautifulSoup(html.content, features='lxml')

In [192]:
#split the third paragraph of the page on its embedded line break
third_paragraph = soup.find_all('p')[2]
third_paragraph.get_text().split(' \r\n  ')

[u'Hilary Perraton',
 u'Director, International Research Foundation for Open Learning\r\n']

In [12]:
#URL recorded for the first article in the table
df['link_to_article'][0]

u'http://www.irrodl.org/index.php/irrodl/article/view/2844'

In [14]:
#word_cloud text
#full scraped page text of the first article in the table
df['article_text'][0]

u' ISSN: 1492-3831 IRRODL Co-Editors: Dianne Conrad Rory McGreal Journal Content Search Search Scope All Authors Title Abstract Index terms Full Text Browse By Issue By Author By Title User Username Password Remember me Article Tools Print this article Indexing metadata How to cite item Email this article (Login required) Email the author (Login required) Information For Readers For Authors For Reviewers For Librarians Add javascript required for font sizer Font Size SUBSCRIBE TO MAILING LIST About The Author Rory McGreal Athabasca University Journal Help Open Journal Systems Home About Register Current Archives Announcements What\'s New Resources Conferences Home > Vol 17, No 4 (2016) > McGreal html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" International Review of Research in Open and Distributed Learning Volume 17, Number 4 June - 2016 Editorial Rory McGreal Co-Editor, Athabasca University In this issue of IRRODL, we are catching up