In [1]:
import operator
import os
import pickle
import re
import string
import time
from collections import Counter
from random import randint
from time import sleep

import nltk
import pandas as pd 
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kosit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kosit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Get the HTML tags and content of a web page
Let’s define a class that can request and parse an HTML web page, since we will need this repeatedly throughout this tutorial:

In [2]:
class GoogleScholarScraper:
    """
    Crawler for a Google Scholar organisation listing.

    Starting from the listing page of an organisation, it follows the
    "next page" button to collect every listing page, gathers the profile
    URL of each member of Staff, and finally extracts the name, research
    areas and publication details from each Staff profile page.

    args:
      url: URL of the organisation's first listing page; its host name is
           reused to build the URL of every following listing page
      min_delay: lower bound (whole seconds) of the random pause between requests
      max_delay: upper bound (whole seconds) of the random pause between requests
      timeout: seconds to wait for a server response before giving up
    """

    # class attribute value of the "next page" button on a listing page
    _NEXT_BUTTON_CLASS = "gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx"

    def __init__(self, url, min_delay=2, max_delay=10, timeout=30):
        self.url = url
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.timeout = timeout

        # profile URLs that could not be parsed during the last crawl
        self.failedProfileURLs = []


    def _pause(self):
        """
        Controls the crawl rate by sleeping for a random whole number of
        seconds between min_delay and max_delay (both inclusive).
        """
        sleep(randint(self.min_delay, self.max_delay))


    # request and parse a HTML web page
    def getAndParseURL(self, url):
        """
        Requests a web page and parses its HTML.

        args:
          url: URL to a web page

        returns:
          A BeautifulSoup object holding the tags and content of the page

        raises:
          requests.exceptions.RequestException: if the request fails or
          does not answer within `timeout` seconds
        """
        # without a timeout a stalled connection would block the crawl forever
        result = requests.get(url, timeout=self.timeout)

        return BeautifulSoup(result.text, 'html.parser')


    # retrieve the Staff profile URLs on any page
    def getStaffProfileURLs(self, url):
        """
        Gets all the Staff Profile URLs for any given listing page

        args:
          url: URL to a listing page

        returns:
          A list of all Staff Profile URLs on the given listing page
        """
        soup = self.getAndParseURL(url)

        # remove the last path segment (the citation part) of the page URL;
        # the href of each Staff profile is appended in its place
        base_url = "/".join(url.split("/")[:-1])

        listOfStaffURLs = []
        for card in soup.find_all("div", class_="gsc_1usr"):
            inner = card.div
            anchor = inner.a if inner is not None else None
            href = anchor.get("href") if anchor is not None else None

            # a card without a link cannot be followed, so it is left out
            if href:
                listOfStaffURLs.append(base_url + href)

        return listOfStaffURLs


    def _nextPageURL(self, soup):
        """
        Builds the URL of the next listing page from the "next page" button.

        args:
          soup: parsed listing page

        returns:
          The URL of the next listing page, or None when the page has no
          usable "next page" button
        """
        button = soup.find("button", class_=self._NEXT_BUTTON_CLASS)
        if button is None:
            return None

        onclick = button.get("onclick")
        if not onclick:
            return None

        parts = onclick.split("/")
        if len(parts) < 2:
            return None

        # the target is stored with escaped "=" and "&" characters and a
        # trailing quote; decode it and drop the "oe=ASCII" parameter
        cite = parts[1].replace("\\x3d", "=").replace("\\x26", "&") \
                       .replace("&oe=ASCII&", "&").split("'")[0]

        host = self.url.split("//")[1].split("/")[0]

        return "https://" + host + "/" + cite


    def getAllProfilePagesURL(self, url):
        """
        Gets the URL of all profile pages of Coventry University on Google Scholar 

        args:
          url: the starting or main page

        returns:
          pages_urls: a list of URLs for all Coventry University Google Scholar 
                      Profile pages, starting with `url` itself
        """

        # store the URLs of all the pages
        pages_urls = [url]

        # get the html tags and contents of the main page
        soup = self.getAndParseURL(url)
        new_url = self._nextPageURL(soup)

        # stop when there is no next page, or when a page points back to one
        # that was already collected (which would otherwise loop forever)
        while new_url is not None and new_url not in pages_urls:
            # Controlling the crawl rate
            self._pause()

            # every page is requested once: the same response is used for
            # the status check and for finding the following page
            response = requests.get(new_url, timeout=self.timeout)
            if response.status_code != 200:
                break

            pages_urls.append(new_url)
            new_url = self._nextPageURL(BeautifulSoup(response.text, 'html.parser'))

        return pages_urls



    def getAllStaffProfileURLs(self, url):
        """
        Gets the profile url of every member of Staff on each Coventry University
        Google Scholar Profile page.

        args:
          url: the starting or main page

        returns:
          allStaffURLs: a list of URLs of all the profiles of Coventry University 
                        member of Staff on Google Scholar
        """
        pages_urls = self.getAllProfilePagesURL(url)
        allStaffURLs = []

        for page in pages_urls:
            # get all Staff profile URLs on each page
            allStaffURLs.extend(self.getStaffProfileURLs(page))

            # Controlling the crawl rate
            self._pause()

        return allStaffURLs


    def _parseStaffProfile(self, soup):
        """
        Extracts the details of one member of Staff from a parsed profile page.

        args:
          soup: parsed Staff profile page

        returns:
          A tuple (staffName, researchAreas, publication_titles,
          num_citations, year_publication), or None when the page holds no
          Staff name and therefore is not a readable profile
        """
        name_tag = soup.find("div", {"id": "gsc_prf_in"})
        if name_tag is None:
            return None

        # the research areas are the links of the last "gsc_prf_il" block;
        # a profile without such a block simply has no research areas
        info_blocks = soup.find_all("div", {"class": "gsc_prf_il"})
        area_tags = info_blocks[-1].find_all("a") if info_blocks else []
        researchAreas = [area.get_text() for area in area_tags]

        # an empty title keeps the titles aligned with citations and years
        publication_titles = []
        for publication in soup.find_all("td", class_="gsc_a_t"):
            link = publication.find("a")
            publication_titles.append(link.text if link is not None else "")

        num_citations = [cited.text for cited in soup.find_all("a", class_="gsc_a_ac gs_ibl")]

        year_publication = [year.text for year in
                            soup.find_all("span", class_="gsc_a_h gsc_a_hc gs_ibl")]

        return name_tag.text, researchAreas, publication_titles, num_citations, year_publication



    def getAllPublications(self, url):
        """
        Gets the publications of every member of Staff reachable from the
        given listing page.

        Profile pages that cannot be parsed are skipped and their URLs are
        stored in `self.failedProfileURLs`, so one bad page does not throw
        away everything crawled so far.

        args:
          url: the starting or main page

        returns:
          publications: a dictionary with the keys "staffName",
                        "researchArea", "title", "cited" and "year"; every
                        value is a list holding one entry per Staff
        """
        publications = {"staffName" : [],
                    "researchArea" : [],
                    "title" : [],
                    "cited" : [],
                    "year" : []}

        self.failedProfileURLs = []

        # get all the publications on each Staff profile page
        for staffProfileURL in self.getAllStaffProfileURLs(url):

            # get the tags and webpage content on each Staff profile page
            soup = self.getAndParseURL(staffProfileURL)
            profile = self._parseStaffProfile(soup)

            # Controlling the crawl rate
            self._pause()

            if profile is None:
                self.failedProfileURLs.append(staffProfileURL)
                continue

            staffName, researchAreas, publication_titles, num_citations, year_publication = profile

            publications["staffName"].append(staffName)
            publications["researchArea"].append(researchAreas)
            publications["title"].append(publication_titles)
            publications["cited"].append(num_citations)
            publications["year"].append(year_publication)


        return publications

## Function to save and load data

In [3]:
def saveAndLoadData(data, file_path = "file_path"):
    """
    Pickles `data` to `file_path` and reads it straight back.

    The directory of `file_path` is created when it does not exist yet, so
    the first run on a fresh checkout does not fail.

    args:
      data: any picklable object
      file_path: path of the file to write to and read from

    returns:
      load_data: the object unpickled from `file_path`, i.e. a copy of `data`
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(data, f)

    # NOTE: unpickling executes code stored in the file; only files written
    # by this notebook should ever be loaded this way
    with open(file_path, 'rb') as fp:
        load_data = pickle.load(fp)

    return load_data

In [4]:
# Read the previously scraped publications back from disk.
file_path = "./data/scraped_publications"
with open(file_path, mode='rb') as scraped_file:
    publications_dict = pickle.load(scraped_file)

# Final Solution

#### TODO: check whether the data already exists on disk before crawling

In [5]:
# Google Scholar page for Coventry University:
main_url = "https://scholar.google.co.uk/citations?view_op=view_org&hl=en&org=9117984065169182779"

# Instantiate the crawler
crawler = GoogleScholarScraper(main_url)

# location of the scraped data
file_path = "./data/scraped_publications"

try:
    # reuse the data of an earlier crawl when it is available; the file is
    # unpickled, so it must be one this notebook wrote itself
    with open(file_path, 'rb') as fp:
        publications_dict = pickle.load(fp)
    unprocessed_publications = publications_dict
except FileNotFoundError:
    # nothing cached yet: crawl every Staff profile (slow, the crawler
    # pauses between requests) and save the scraped data
    publications_dict = crawler.getAllPublications(main_url)
    unprocessed_publications = saveAndLoadData(publications_dict, file_path)

# stored unprocessed data in a dataframe
unprocessed_publications_df = pd.DataFrame.from_dict(unprocessed_publications)
unprocessed_publications_df.head()

Unnamed: 0,staffName,researchArea,title,cited,year
0,Timothy Mason,"[sonochemistry, ultrasound, chemistry, environ...","[Sonochemistry, Applied sonochemistry: the use...","[2068, 1503, 1178, 910, 798, 667, 475, 459, 42...","[1988, 2002, 1996, 1991, 1997, 2001, 2010, 199..."
1,Gurnam Singh,"[social work, race and racism, critical pedago...",[Mitochondrial DNA mutation associated with Le...,"[2629, 2554, 400, 364, 299, 249, 245, 214, 205...","[1988, 2005, 2000, 1981, 2006, 2007, 1960, 200..."
2,WD Li,[],[Evaluation of the role of phonological STM in...,"[1788, 1117, 588, 384, 314, 306, 230, 226, 213...","[1989, 1992, 2013, 2013, 2015, 2005, 2007, 200..."
3,Dr. Mohammad M Ali,"[Forecast Information Sharing, ARIMA Modelling...",[Aquatic toxicity from pulp and paper mill eff...,"[821, 604, 579, 182, 105, 104, 99, 95, 93, 91,...","[2001, 1995, 2004, 2013, 2013, 1995, 2000, 201..."
4,Petra Wark,"[m/eHealth, epidemiology, primary prevention, ...","[World malaria report 2015, Alcohol attributab...","[7563, 343, 279, 271, 235, 234, 153, 141, 132,...","[2016, 2011, 2012, 2013, 2014, 2013, 2013, 201..."


# Data Preprocessing
### Select name of Staff and publication titles, and unpack into a dataframe

Inspecting the dataset shows that some publications are missing their publication year or the number of times they have been cited. So we will only work with the Staff names, the publication titles and, where available, the publication years.

In [6]:
# get the staff names, publication titles and publication years
pub_Staff_Title = unprocessed_publications_df[["staffName", "title", "year"]]

# create lists to hold, for every single publication, the name of the
# Staff, the title and the year
name = []
title = []
publication_year = []

# loop through each row, then through its list of publication titles.
# Title and year are taken from the same position in one pass, so the three
# lists always stay aligned and have the same length.
rows = zip(pub_Staff_Title["staffName"], pub_Staff_Title["title"], pub_Staff_Title["year"])

for staff_name, staff_titles, staff_years in rows:
    for position, publication_title in enumerate(staff_titles):
        name.append(staff_name)
        title.append(publication_title)

        # a title without a matching entry in the year list has no year
        year = staff_years[position] if position < len(staff_years) else None

        # scraped years are strings, so a missing year shows up as an
        # empty string rather than as None
        if year is None or not str(year).strip():
            publication_year.append("missing")
        else:
            publication_year.append(year)

assert len(name) == len(title) == len(publication_year)


# create dictionary of Staff name and publications
processed_publications_dict = {"StaffName" : name,
                               "Titles" : title,
                              "Year" : publication_year}

# create dataframe of Staff name and publications
processed_publications = pd.DataFrame(processed_publications_dict)

# save processed data
file_path = "./data/processed_publications"
processed_publications = saveAndLoadData(processed_publications, file_path)

# display first 5 rows
processed_publications.head()

Unnamed: 0,StaffName,Titles,Year
0,Timothy Mason,Sonochemistry,1988
1,Timothy Mason,Applied sonochemistry: the uses of power ultra...,2002
2,Timothy Mason,The uses of ultrasound in food technology,1996
3,Timothy Mason,Practical sonochemistry,1991
4,Timothy Mason,Ultrasound in synthetic organic chemistry,1997


In [7]:
# (number of rows, number of columns) of the flattened publications table
processed_publications.shape

(10212, 3)

### Save the final data to file

In [8]:
# Pull the "Titles" column out of the dataframe as a plain Python list
titles_column = processed_publications['Titles']
publication_titles = titles_column.tolist()

# Write the list of titles to disk and keep the copy that is read back
file_path = "./data/publication_titles"
publication_titles = saveAndLoadData(publication_titles, file_path=file_path)

# Create the search engine

In [9]:
class SearchEngine:
    """
    Keyword search over a list of publication titles.

    Titles are tokenized into lower-case alphabetic words without
    stopwords, an inverted index maps every word to the titles containing
    it, and a query returns the titles that contain at least one query
    word, best match first.

    args:
      publications: list of documents (publication titles) to search in
      stopwords: collection of words to ignore
      query: the search query
    """

    def __init__(self, publications, stopwords, query):
        self.publications = publications
        self.stopwords = stopwords
        self.query = query


    def tokenize(self, publications):
        """
            Create tokens or distinct words from a string.

            args:
                publications: document or search query to tokenize; it is
                              converted to a string first

            returns:
                tokens_list: list of lower-case, purely alphabetic tokens
                             that are not stopwords, in order of appearance
        """
        # raw string: '\w' in a normal string literal is an invalid escape
        tokens = re.findall(r'\w+', str(publications).lower())
        tokens_list = [word for word in tokens if word not in self.stopwords and word.isalpha()]

        return tokens_list


    def create_tokens(self):
        """
        Tokenize every publication.

        returns:
          tokens: a list with one list of tokens per publication
        """
        return [self.tokenize(p) for p in self.publications]


    def create_inverted_index(self):
        """
        Create an inverted index of the publications. The index maps each
        unique word to a list of document ids, sorted in increasing order.

        returns:
          An inverted index. This is a dict where keys are words and values are
          lists of document indices, sorted in increasing order. A document
          index appears once for every occurrence of the word in that
          document."""

        inverted_index = {}

        for idx, document in enumerate(self.create_tokens()):
            for token in document:
                inverted_index.setdefault(token, []).append(idx)

        return inverted_index


    def sort_by_num_postings(self, inverted_index=None):
        """
        Sort the words of the query by the length of their postings list.

        Query words that do not occur in any publication are left out, so
        an unknown word yields no results instead of raising a KeyError.

        args:
          inverted_index: an index as returned by create_inverted_index();
                          it is built when omitted

        returns:
          A list of the distinct query words found in the index, the word
          with the fewest postings first.
        """
        if inverted_index is None:
            inverted_index = self.create_inverted_index()

        num_dict = {}
        for word in self.tokenize(self.query):
            if word in inverted_index:
                num_dict[word] = len(inverted_index[word])

        return sorted(num_dict, key=num_dict.get)


    def publication_search(self):
        """
        Search the publications for the query.

        returns:
          A dict keys view of the indexes of all publications that contain
          at least one word of the query, ordered by the number of matching
          word occurrences, highest first. It is empty when no word of the
          query occurs in any publication.
        """
        # build the index once and share it with the sorting step
        indexes = self.create_inverted_index()

        # query words, the one with the fewest postings first
        tokens = self.sort_by_num_postings(indexes)

        token_indexes = []
        for token in tokens:
            token_indexes.extend(indexes[token])

        # count how often each publication matched and rank by that count;
        # the sort is stable, so ties keep their order of first appearance
        cnt = dict(Counter(token_indexes))
        sorted_cnt = sorted(cnt.items(), key=operator.itemgetter(1), reverse=True)
        sorted_indexes = {k: v for k, v in sorted_cnt}

        return sorted_indexes.keys()

# Perform Search
### If the search term is not a word, return nothing

In [10]:
# enter the search words
query = input("ENTER QUERY: ")
print()

# Beginning time of execution (monotonic clock, made for measuring durations)
start = time.perf_counter()

# Instantiate the search engine class
search_engine = SearchEngine(publication_titles, stopwords, query)

# get the results of the search
publication_results = search_engine.publication_search()

# display returned results: each result is a row index of the dataframe
for r in publication_results:
    print(dict(processed_publications.loc[r]))
    print()

end = time.perf_counter()
print("About %2d results in %.2f seconds" % (len(publication_results), (end - start)))

ENTER QUERY: erosion

{'StaffName': 'Professor Damian Lawler', 'Titles': 'The measurement of river bank erosion and lateral channel change: a review', 'Year': '1993'}

{'StaffName': 'Professor Damian Lawler', 'Titles': 'Process dominance in bank erosion systems', 'Year': '1992'}

{'StaffName': 'Professor Damian Lawler', 'Titles': 'Downstream change in river bank erosion rates in the Swale–Ouse system, northern England', 'Year': '1999'}

{'StaffName': 'Professor Damian Lawler', 'Titles': 'Bank erosion and instability', 'Year': '1997'}

{'StaffName': 'Professor Damian Lawler', 'Titles': 'River bank erosion and the influence of frost: a statistical examination', 'Year': '1986'}

{'StaffName': 'Professor Damian Lawler', 'Titles': 'A new technique for the automatic monitoring of erosion and deposition rates', 'Year': '1991'}

{'StaffName': 'Professor Damian Lawler', 'Titles': 'Bank erosion events and processes in the Upper Severn basin', 'Year': '1997'}

{'StaffName': 'Professor Damian Lawl