In [9]:
import openpyxl as xl
import pprint as pp
import urllib3
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
import datetime

In [10]:
# Read the input workbook and pick out the two worksheets used by the notebook.
wb = xl.load_workbook(filename='input.xlsx')
journals, urls = (wb[sheet_name] for sheet_name in ('journal_sheet', 'url_sheet'))
# Number of rows in the journals worksheet, counted by walking its row iterator.
num_rows = sum(1 for _ in journals.rows)

#journal_sheet column map
#Col 0/A = journal name
#Col 1/B = Publisher
#Col 2/C = ISSN
#Col 3/D = EISSN
#Col 4/E = Country
#Col 5/F = Language
#Col 6/G = Category
#Col 7/H = Submission Guidelines URL
#Col 8/I = Homepage URL (or an "ERROR: ..." note) written by the ResearchGate search loop

In [11]:
#Set up firefox and chrome drivers

def connectChrome():
    """Start a headless Chrome WebDriver session and return it.

    The chromedriver executable is looked up at the relative path
    "chromedriver.exe", so it must sit in the notebook's working directory.

    Returns:
        The newly created selenium Chrome driver.
    """
    options = ChromeOptions()
    options.add_argument("--headless")
    chromeDriverPath = "chromedriver.exe"
    # `chrome_options=` is the deprecated spelling of this keyword; `options=`
    # is the supported one.
    # NOTE(review): the newest Selenium releases may no longer accept the
    # driver path as a positional argument -- confirm against the installed version.
    driver = webdriver.Chrome(chromeDriverPath, options=options)
    print("Chrome Headless Browser Invoked")
    return driver

def connectFirefox():
    """Start a headless Firefox WebDriver session and return it.

    Returns:
        The newly created selenium Firefox driver.
    """
    options = FirefoxOptions()
    options.add_argument("--headless")
    # `firefox_options=` is the deprecated spelling of this keyword; `options=`
    # is the supported one.
    driver = webdriver.Firefox(options=options)
    print("Firefox Headless Browser Invoked")
    return driver

In [12]:
def get_url(row, source):
    """Fill the placeholders of a URL template with values from a journal row.

    Supported placeholders, substituted in this order:
        J_U_NAME  lowercase journal name, spaces replaced by underscores
        JNAME     lowercase journal name, spaces replaced by hyphens
        EISSN     EISSN with hyphens removed
        E_H_SSN   EISSN as stored (hyphens kept)
        ISSN      ISSN with hyphens removed
        I_H_SSN   ISSN as stored (hyphens kept)

    Args:
        row: sequence of cell-like objects exposing ``.value``; index 0 is the
            journal name, index 2 the ISSN and index 3 the EISSN.
        source: URL template string.

    Returns:
        The URL string with every placeholder replaced. A template without
        placeholders is returned unchanged.

    Raises:
        ValueError: if a placeholder present in the template refers to an
            empty cell.
    """
    journal, issn, eissn = row[0], row[2], row[3]

    def _text(cell, label):
        # Cell values may be None (empty cell) or non-strings (numeric cells).
        value = cell.value
        if value is None:
            raise ValueError("Cannot build URL: " + label + " is empty for this row")
        return str(value)

    # Builders are lazy so a cell is only read when its placeholder is used.
    # Order matters: "EISSN" contains "ISSN", so it must be substituted first.
    substitutions = (
        ("J_U_NAME", lambda: _text(journal, "journal name").lower().replace(" ", "_")),
        ("JNAME", lambda: _text(journal, "journal name").lower().replace(" ", "-")),
        ("EISSN", lambda: _text(eissn, "EISSN").replace("-", "")),
        ("E_H_SSN", lambda: _text(eissn, "EISSN")),
        ("ISSN", lambda: _text(issn, "ISSN").replace("-", "")),
        ("I_H_SSN", lambda: _text(issn, "ISSN")),
    )

    target = source
    for placeholder, build_value in substitutions:
        if placeholder in target:
            target = target.replace(placeholder, build_value())
    return target

In [13]:
#Iterate over all rows, find URL
#TAKES VERY VERY LONG
home_page_url = "https://www.researchgate.net/journal/I_H_SSN_J_U_NAME"

# Three candidate delays, each drawn uniformly from [2, 3) seconds
delays = ((np.random.rand(3))+2)
check_point = 2  # counter tracking the row being processed; used in log output and file names
switch = 1       # 1 -> the next driver started is Firefox, 0 -> Chrome

n_searches = 0   # number of pages actually requested so far
for row in journals.iter_rows(max_col=9, min_row=2, max_row=205):
    print("Starting Search for journal row item:" + str(check_point))

    #Check if submission page already found
    if row[7].value is not None:
        #if found, skip
        print("Skipping, already have submission page")
        check_point +=1
        continue

    #Check if homepage is already found
    if row[8].value is not None:
        #if found skip
        print("Skipping, previously searched homepage")
        check_point +=1
        continue

    #Save every 25 searches and swap drivers
    if ((n_searches % 25) == 0):
        date = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
        wb.save('./logs/homepage/output_'+date+'checkpoint_'+str(check_point)+'.xlsx') #Save every 25 searches
        #Swap and Init new Driver
        # `except Exception` (not a bare except) so Ctrl-C still interrupts the run.
        # The fallback covers the first pass, where `driver` does not exist yet,
        # and a driver whose session can no longer be closed.
        if switch == 0:
            try:
                driver.quit()
                driver = connectChrome()
            except Exception:
                driver = connectChrome()
            switch = 1
        elif switch == 1:
            try:
                driver.quit()
                driver = connectFirefox()
            except Exception:
                driver = connectFirefox()
            switch = 0

    #Delay by random amount of time
    delay = np.random.choice(delays)
    time.sleep(delay)

    #Open and Parse url
    driver.get(get_url(row, home_page_url))
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    n_searches +=1

    #Check if Page is on Website
    # soup.find returns None when the page has no <h1>; treat that as "no heading".
    heading = soup.find('h1')
    heading_text = heading.get_text() if heading is not None else ''
    if heading_text[0:17] == 'Directory results': #if true, we have been redirected to Researchgate directory
        #This means the journal is not on researchgate
        error_message = "ERROR: Journal Not on Research Gate"
        row[8].value = error_message
        print(error_message)

    else:
        # Look up the 5th header cell of the journal info table, tolerating
        # pages where the table, its body or that cell is missing.
        info_table = soup.find("table", {"class":"table journal-full-info__table"})
        table_body = info_table.tbody if info_table is not None else None
        header_cells = table_body.find_all("th") if table_body is not None else []
        website_div = None
        if len(header_cells) > 4 and header_cells[4].contents:
            website_div = header_cells[4].contents[0]

        if website_div == 'Website':
            try:
                url = soup.find("a", {"class":"nova-e-link nova-e-link--color-blue nova-e-link--theme-bare"}).contents[0]
                row[8].value = url
                print(url)
            except Exception:
                error_message = "ERROR: No URL On Researchgate"
                row[8].value = error_message
                print(error_message)
        else:
            # Leave the cell empty so this row is retried on the next run.
            print("WARNING: No Website entry found on page, homepage left empty")

    check_point +=1
            

Starting Search for journal row item:2
Skipping, previously searched homepage
Starting Search for journal row item:3
Skipping, previously searched homepage
Starting Search for journal row item:4
Skipping, previously searched homepage
Starting Search for journal row item:5
Skipping, previously searched homepage
Starting Search for journal row item:6
Skipping, previously searched homepage
Starting Search for journal row item:7
Skipping, previously searched homepage
Starting Search for journal row item:8
Skipping, previously searched homepage
Starting Search for journal row item:9
Skipping, previously searched homepage
Starting Search for journal row item:10
Skipping, previously searched homepage
Starting Search for journal row item:11
Skipping, previously searched homepage
Starting Search for journal row item:12
Skipping, previously searched homepage
Starting Search for journal row item:13
Skipping, previously searched homepage
Starting Search for journal row item:14
Skipping, previously

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [8]:
# Close the browser session (if any) and save the workbook to the homepage logs.
try:
    driver.quit()
except Exception:
    # Either no driver was ever started (NameError) or its session is already
    # gone. There is nothing left to close, and starting a new browser here
    # would only leave an orphaned process behind.
    pass
date = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
wb.save('./logs/homepage/output_'+date+'checkpoint_'+str(check_point)+'.xlsx')#save to homepage logs

##### 