In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import json
import re
import pandas as pd

### Collect Conventry Papers

In [13]:
# function to scrape coventry publications page

def scrapPapers(start_page = 1, page_limit = 1000):

    page = start_page
    url = f"https://pureportal.coventry.ac.uk/en/publications/?format=&page={page}"

    papers = []

    while page < page_limit:

        try:
            pageSource = requests.get(url).text

            soup = BeautifulSoup(pageSource, "html.parser")
            
            paperLists = soup.select(".list-result-item")

            if len(paperLists) == 0:
                break

            for paper in paperLists:
                paperInfo = {}

                paperInfo['link'] = paper.select_one('h3.title a')['href']
                paperInfo['title'] = paper.select_one('h3.title a').text

                journal = paper.select_one('a', attrs = {'rel' : 'Journal'})
                paperInfo['journal'] = journal.text
                paperInfo['journalLink'] = journal['href']

                cols = ['date', 'volume', 'pages', 'numberofpages', 'type_classification']

                for x in cols:
                    try:
                        paperInfo[x] = paper.select_one(f'span.{x}').text

                        if x == 'numberofpages':
                            paperInfo[x] = int(paperInfo[x][:-2])
                        elif x == 'pages':
                            paperInfo[x] = paperInfo[x][3:]
                        elif x == 'volume':
                            paperInfo[x] = int(paperInfo[x])

                    except:
                        pass
                
                papers.append(paperInfo)

            print(f"Finished {page} ")
            
            page += 1
            url = f"https://pureportal.coventry.ac.uk/en/publications/?format=&page={page}"

        except: 
            break

    return papers

def getAuthorsAndOtherDocumentInformation(paperInfo):
    
    source = requests.get(paperInfo['link']).text
    paperSoup = BeautifulSoup(source, "html.parser")
    
    if paperSoup.select_one("div.doi a") is not None:
        paperInfo['doi'] = paperSoup.select_one("div.doi a")['href']
    
    persons = paperSoup.select_one("p.relations.persons")
    
    if persons is not None:
        paperInfo['authors'] = list(map(
                lambda x : x.strip(), 
                persons.text.split(','))
            )
    
    paperInfo['tags'] = [span.text for span in 
            paperSoup.select("li.userdefined-keyword")]

    paperInfo['coventryAuthors'] = [a['href'] for a in 
            persons.select('a', attrs = { 'rel' : 'Person'})]
    
    abstract = paperSoup.select_one(".rendering_researchoutput_abstractportal")

    paperInfo['abstract'] = None

    if abstract:
        paperInfo['abstract'] = abstract.text
    
    # return paperInfo

In [7]:
papers = scrapPapers(start_page = 1)
len(papers)

Finished 1 
Finished 2 


100

In [32]:
# with open("papers.json", "w") as f:
#     f.write(json.dumps(papers))

In [16]:
with open("papers2.json", "r") as f:
    papers = json.loads(f.read())
print(f"Loaded {len(papers)} papers")

Loaded 34164 papers


In [18]:
getAuthorsAndOtherDocumentInformation(papers[0])
papers[0]

{'link': 'https://pureportal.coventry.ac.uk/en/publications/associations-between-sleep-related-heart-rate-variability-and-bot',
 'title': 'Associations between sleep-related heart rate variability and both sleep and symptoms of depression and anxiety: A systematic review',
 'journal': 'Associations between sleep-related heart rate variability and both sleep and symptoms of depression and anxiety: A systematic review',
 'journalLink': 'https://pureportal.coventry.ac.uk/en/publications/associations-between-sleep-related-heart-rate-variability-and-bot',
 'date': 'Jan 2023',
 'volume': 101,
 'pages': '106-117',
 'numberofpages': 12,
 'type_classification': 'Review article',
 'doi': 'https://doi.org/10.1016/j.sleep.2022.10.018',
 'authors': ['Arron Taylor Lund Correia',
  'Gosia Lipinska',
  'H.G. Laurie Rauche',
  'Philippa Forshaw',
  'Laura Roden',
  'Dale E Rae'],
 'tags': ['Anxiety',
  'Autonomic nervous system',
  'Depression',
  'Heart rate variability (HRV)',
  'Insomnia',
  'PTSD',

In [19]:
import threading, subprocess, time
 
num_threads = 20
lock = threading.Lock()
last = time.time()

def scrapePapersParallel(papers, start, end):
    '''
        in the range [start, end)
    '''

    last = time.time()

    for i in range(start, end):
        try:
            getAuthorsAndOtherDocumentInformation(papers[i])

            if i % 100 == 0:
                print(f"Finished {i}")
        except:
            print(f"Failed Index {i}")

        try:
            now = time.time()

            # write to file here synchronize against threads
            if threading.current_thread().name == "Thread-0":
                if (now - last) >= 60:
                    lock.acquire()
                    last = now

                    with open("./papers1.json", "w") as f:
                        f.write(json.dumps(papers))
                    
                    print(f"Wrote to file")
                    lock.release()
        except:
            print("Error writing to file")
            
    print(f"Finished thread {threading.current_thread().name}")


In [20]:
blockSizes = len(papers) // num_threads

startInds = [i * blockSizes for i in range(0, num_threads)]
startInds.append(len(papers))
print(startInds)

threads = []

for ind in range(num_threads):
    new = threading.Thread(
        target=scrapePapersParallel,
        args=(papers, startInds[ind], startInds[ind + 1]),
        name=f"Thread-{ind}"
    )

    threads.append(new)
    new.start()

over_threads = iter(threads)
curr_th = next(over_threads)

while True:
    curr_th.join()
    if curr_th.is_alive():
        continue
    try:
        curr_th = next(over_threads)
    except StopIteration:
        break

[0, 1708, 3416, 5124, 6832, 8540, 10248, 11956, 13664, 15372, 17080, 18788, 20496, 22204, 23912, 25620, 27328, 29036, 30744, 32452, 34164]
Finished 0
Finished 20500
Finished 18800
Wrote to file
Finished 17100
Finished 15400
Wrote to file
Finished 13700
Wrote to file
Finished 12000
Finished 32500
Finished 10300
Wrote to file
Finished 30800
Finished 8600
Wrote to file
Finished 29100
Finished 27400
Finished 6900
Wrote to file
Finished 5200
Finished 25700
Finished 3500
Wrote to file
Finished 24000
Finished 22300
Finished 1800
Wrote to file
Finished 20600
Finished 100
Finished 18900
Finished 17200
Wrote to file
Wrote to file
Finished 15500
Finished 13800
Finished 12100
Wrote to file
Finished 32600
Finished 10400
Finished 30900
Finished 29200
Finished 27500
Finished 8700
Wrote to file
Finished 7000
Finished 25800
Finished 5300
Wrote to file
Finished 24100
Finished 3600
Finished 22400
Finished 1900
Wrote to file
Finished 20700
Finished 19000
Finished 200
Finished 17300
Wrote to file
Finished 

In [16]:
# with open("papers.json", "w") as f:
#     f.write(json.dumps(papers))

In [2]:
with open("papers1.json", "r") as f:
    papers = json.loads(f.read())

papers[100]

{'link': 'https://pureportal.coventry.ac.uk/en/publications/dimensions-of-cybersecurity-performance-and-crisis-response-in-cr',
 'title': 'Dimensions of cybersecurity performance and crisis response in critical infrastructure organisations: an intellectual capital perspective',
 'journal': 'Dimensions of cybersecurity performance and crisis response in critical infrastructure organisations: an intellectual capital perspective',
 'journalLink': 'https://pureportal.coventry.ac.uk/en/publications/dimensions-of-cybersecurity-performance-and-crisis-response-in-cr',
 'date': '21 Mar 2023',
 'volume': 24,
 'pages': '465-486',
 'numberofpages': 22,
 'type_classification': 'Article',
 'doi': 'https://doi.org/10.1108/JIC-06-2021-0166',
 'authors': ['Alexeis Garcia-Perez', 'Mark Sallos', 'Pattanapong Tiwasing'],
 'tags': ['COVID-19',
  'Cyber crisis response',
  'Cybersecurity capabilities',
  'Digital resilience',
  'cybersecurity performance'],
 'coventryAuthors': ['https://pureportal.coventry.

In [5]:
papersDf = pd.read_json("./scrapedData/papers.json")
# papersDf

# export as csv
papersDf.to_csv("./scrapedData/papers.csv", index=False)

### Collect Author Profiles

In [9]:
def getProfileURLorNone(url):

    if "no-content" in url:
        return None
    
    pattern = r"^(\/[^?]+)"
    path_match = re.match(pattern, url)
    path = None

    if path_match:
        path = path_match.group(1)

    return path    

def scrapeAuthors(start_page = 1, page_limit = 1000):

    page = start_page
    url = f"https://pureportal.coventry.ac.uk/en/persons/?format=&page={page}"

    authors = []

    while page < page_limit:

        try:
            pageSource = requests.get(url).text

            soup = BeautifulSoup(pageSource, "html.parser")
            
            authorList = soup.select("li.grid-result-item div.result-container")

            if len(authorList) == 0:
                break

            for author in authorList:

                try:
                    authorInfo = {}

                    authorInfo['picUrl'] = getProfileURLorNone(
                            author.select_one("img")['src']
                    )

                    if authorInfo['picUrl'] is not None:
                        authorInfo['picUrl'] = 'https://pureportal.coventry.ac.uk/' + authorInfo['picUrl'] 

                    name = author.select_one("a", attrs = { 'rel' : 'Person'})

                    authorInfo['name'] = name.text
                    authorInfo['profileLink'] = name['href']

                    dept = author.select_one(".relations.organisations a", 
                            attrs = { 'rel' : 'Organisation'})

                    authorInfo['department'] = dept.text
                    authorInfo['deptLink'] = dept['href']
                    
                    authors.append(authorInfo)
                except:
                    pass

            print(f"Finished {page} ")
            
            page += 1
            url = f"https://pureportal.coventry.ac.uk/en/persons/?format=&page={page}"
        except: 
            break

    return authors

In [10]:
authors = scrapeAuthors()

print(f"Scraped {len(authors)} authors")

Finished 1 
Finished 2 
Finished 3 
Finished 4 
Finished 5 
Finished 6 
Finished 7 
Finished 8 
Finished 9 
Finished 10 
Finished 11 
Finished 12 
Finished 13 
Finished 14 
Finished 15 
Finished 16 
Finished 17 
Finished 18 
Finished 19 
Finished 20 
Finished 21 
Finished 22 
Finished 23 
Finished 24 
Finished 25 
Finished 26 
Finished 27 
Finished 28 
Finished 29 
Finished 30 
Finished 31 
Finished 32 
Finished 33 
Finished 34 
Finished 35 
Finished 36 
Finished 37 
Finished 38 
Finished 39 
Finished 40 
Finished 41 
Finished 42 
Finished 43 
Finished 44 
Finished 45 
Finished 46 
Scraped 2021 authors


In [12]:
with open("authors.json", "w") as f:
    f.write(json.dumps(authors))

In [6]:
authorsDf = pd.read_json("./scrapedData/authors.json")
# papersDf

# export as csv
authorsDf.to_csv("./scrapedData/authors.csv", index=False)

## Insert into database
