In [19]:
import openpyxl as xl
import pprint as pp
import urllib3
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
import datetime

In [20]:
#Load xlsx file into Python
wb = xl.load_workbook('input.xlsx') #Open Input workbook
journals = wb['journal_sheet'] #Open journals worksheet
urls = wb["url_sheet"]
num_rows = len(tuple(journals.rows))

#journal_sheet column map
#Col 0/A = journal name
#Col 1/B = Publisher
#Col 2/C = ISSN
#Col 3/D = EISSN
#Col 4/E = Country
#Col 5/F = Language
#Col 6/G = Category
#Col 7/H = Submission Guidelines URL

In [21]:
#Set up firefox and chrome drivers

def connectChrome():
    options = ChromeOptions()
    options.add_argument("--headless")
    chromeDriverPath = "chromedriver.exe"
    driver = webdriver.Chrome(chromeDriverPath, chrome_options=options)
    print("Chrome Headless Browser Invoked")
    return driver

def connectFirefox():
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(firefox_options=options)
    print("Firefox Headless Browser Invoked")
    return driver

In [22]:
def get_url(row, source):
    #Alias varibles for readability
    
    journal = row[0]
    publisher = row[1]
    issn = row[2]
    eissn = row[3]
    target = row[8]
    
    #Check for each attribute in URL and replace with appropriate variable
    
    if "J_U_NAME" in source:
        #replace J_U_NAME with lowercase journal name and replace space with an underscore
        temp_name = journal.value.lower().replace(" ","_")
        target = source.replace("J_U_NAME", temp_name)
        source = target
    if "JNAME" in source:
        #Replace JNAME with lowercase journal name and replace space with hyphen
        temp_name = journal.value.lower().replace(" ", "-") 
        target = source.replace("JNAME",temp_name)
        source = target
    if "EISSN" in source:
        #Replace EISSN with actual EISSN, replacing space with hyphen
        temp_name = eissn.value.replace("-","")
        target = source.replace("EISSN", temp_name)
        source = target
    if "E_H_SSN" in source:
        #Replace EISSN with actual EISSN, keeping hyphens
        temp_name = eissn.value
        target = source.replace("E_H_SSN", temp_name)
        source = target
    if "ISSN" in source:
        #Replace ISSN with actual ISSN, replacing space with hyphen
        temp_name = issn.value.replace("-","")
        target = source.replace("ISSN", temp_name)
        source = target
    if "I_H_SSN" in source:
        #Replace ISSN with actual ISSN, keeping hyphens
        temp_name = issn.value
        target = source.replace("I_H_SSN", temp_name)
        source = target
    return target

In [23]:
#Iterate over all rows, find URL
#TAKES VERY VERY LONG
home_page_url = "https://www.researchgate.net/journal/I_H_SSN_J_U_NAME"

delays = ((np.random.rand(5))) # create random delay times of 0 to 3 secs
check_point = 2
switch = 1

n_searches = 0
for row in journals.iter_rows(max_col=9, min_row=4018, max_row=num_rows):
    print("Starting Search for journal row item:" + str(check_point))
    
    #Check if submission page already found
    if(row[7].value != None):
        #if found, skip
        print("Skipping, already have submission page")
        check_point +=1
        continue
    
    #Check if homepage is already found
    if(row[8].value != None):
        #if found skip
        print("Skipping, previously searched homepage")
        check_point +=1
        continue
        
    #Save every 25 iterations and swap drivers
    if ((n_searches % 100) == 0):
        date = str(datetime.datetime.now()).replace(' ','')[0:18].replace(':','')
        wb.save('./logs/homepage/output_'+date+'checkpoint_'+str(check_point)+'.xlsx') #Save every 25 iterations
        #Swap and Init new Driver
        if switch == 0:
            try:
                driver.quit()
                driver = connectChrome()
            except:
                driver = connectChrome()
            switch = 1
        elif switch == 1:
            try:
                driver.quit()
                driver = connectFirefox()
            except:
                driver = connectFirefox()
            switch = 0
        
    #Delay by random amount of time
    delay = np.random.choice(delays)
    time.sleep(delay)
    
    #Open and Parse url
    driver.get(get_url(row, home_page_url))
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    n_searches +=1
    
    #Check if Page is on Website
    if(soup.find('h1').contents[0][0:17] == 'Directory results'): #if true, we have been redirected to Researchgate directory
        #This means the journal is not on researchgate
        error_message = "ERROR: Journal Not on Research Gate"
        row[8].value = error_message
        print(error_message)
        
    else:
        website_div = soup.find("table", {"class":"table journal-full-info__table"}).tbody.find_all("th")[4].contents[0]

        if website_div == 'Website':
            try:
                url = soup.find("a", {"class":"nova-e-link nova-e-link--color-blue nova-e-link--theme-bare"}).contents[0]
                row[8].value = url 
                print(url)
            except:
                error_message = "ERROR: No URL On Researchgate"
                row[8].value = error_message
                print(error_message)

    check_point +=1
            

Starting Search for journal row item:2
Firefox Headless Browser Invoked
http://www.interscience.wiley.com/jpages/0161-4681/
Starting Search for journal row item:3
ERROR: No URL On Researchgate
Starting Search for journal row item:4
http://www.telospress.com/telosintro.htm
Starting Search for journal row item:5
http://www.telospress.com/telosintro.htm
Starting Search for journal row item:6
ERROR: No URL On Researchgate
Starting Search for journal row item:7
http://www.gallimard.fr/catalog/html/revue/temp.htm
Starting Search for journal row item:8
http://www.utexas.edu/law/journals/tlr/
Starting Search for journal row item:9
ERROR: Journal Not on Research Gate
Starting Search for journal row item:10
ERROR: Journal Not on Research Gate
Starting Search for journal row item:11
ERROR: Journal Not on Research Gate
Starting Search for journal row item:12
http://jep.textrum.com/
Starting Search for journal row item:13
ERROR: No URL On Researchgate
Starting Search for journal row item:14
ERROR: 

http://www.journals.uchicago.edu/openurl?genre=journal&stitle=esj
Starting Search for journal row item:108
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=et
Starting Search for journal row item:109
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=ijal
Starting Search for journal row item:110
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=isis
Starting Search for journal row item:111
http://www.unm.edu/~jar/abs1.html
Starting Search for journal row item:112
ERROR: No URL On Researchgate
Starting Search for journal row item:113
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=jole
Starting Search for journal row item:114
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=jle
Starting Search for journal row item:115
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=jls
Starting Search for journal row item:116
http://www.journals.uchicago.edu/openurl?genre=journal&stitle=jmh
Starting Search for journal row item:

ERROR: No URL On Researchgate
Starting Search for journal row item:203
http://www.rpd-online.com/
Starting Search for journal row item:204
http://home.law.uiuc.edu/lrev/
Starting Search for journal row item:205
http://www.press.uillinois.edu/journals/ajp.html
Starting Search for journal row item:206
ERROR: Journal Not on Research Gate
Starting Search for journal row item:207
http://www.press.uillinois.edu/journals/am.html
Starting Search for journal row item:208
http://people.cohums.ohio-state.edu/tennant9/apq.html
Starting Search for journal row item:209
ERROR: No URL On Researchgate
Starting Search for journal row item:210
http://www.press.uillinois.edu/journals/jae.html
Starting Search for journal row item:211
http://transactionpub.metapress.com/openurl.asp?genre=journal&issn=0278-5927
Starting Search for journal row item:212
ERROR: No URL On Researchgate
Starting Search for journal row item:213
ERROR: No URL On Researchgate
Starting Search for journal row item:214
ERROR: Journal No

http://law.usc.edu/students/orgs/lawreview/index.cfm
Starting Search for journal row item:312
ERROR: No URL On Researchgate
Starting Search for journal row item:313
http://www.jtaer.com/
Starting Search for journal row item:314
ERROR: Journal Not on Research Gate
Starting Search for journal row item:315
ERROR: No URL On Researchgate
Starting Search for journal row item:316
ERROR: Journal Not on Research Gate
Starting Search for journal row item:317
http://muse.jhu.edu/journals/sex/
Starting Search for journal row item:318
http://muse.jhu.edu/journals/cj/
Starting Search for journal row item:319
ERROR: Journal Not on Research Gate
Starting Search for journal row item:320
http://muse.jhu.edu/journals/sex/
Starting Search for journal row item:321
http://muse.jhu.edu/journals/lat/
Starting Search for journal row item:322
ERROR: No URL On Researchgate
Starting Search for journal row item:323
http://muse.jhu.edu/journals/tsl/
Starting Search for journal row item:324
http://www.utpjournals.co

http://www.vsjournals.de/index.php;do=viewmag/site=zfew/lng=de/area=pad/id=8/alloc=151/
Starting Search for journal row item:415
ERROR: No URL On Researchgate
Starting Search for journal row item:416
ERROR: No URL On Researchgate
Starting Search for journal row item:417
http://www.vse.cz/polek/
Starting Search for journal row item:418
http://www.sciencedirect.com/science/journal/00333506
Starting Search for journal row item:419
http://www.sciencedirect.com/science/journal/08971897
Starting Search for journal row item:420
http://www.sciencedirect.com/science/journal/08839417
Starting Search for journal row item:421
http://www.us.elsevierhealth.com/product.jsp?isbn=10564993
Starting Search for journal row item:422
http://www.sciencedirect.com/science/journal/0010440X
Starting Search for journal row item:423
http://www.us.elsevierhealth.com/product.jsp?isbn=08995885
Starting Search for journal row item:424
http://www.sciencedirect.com/science/journal/87557223
Starting Search for journal r

ERROR: No URL On Researchgate
Starting Search for journal row item:1005
ERROR: No URL On Researchgate
Starting Search for journal row item:1006
http://www.ajol.info/journal_index.php?ab=ajrh
Starting Search for journal row item:1007
ERROR: No URL On Researchgate
Starting Search for journal row item:1008
http://ejournals.worldscientific.com.sg/ser/ser.shtml
Starting Search for journal row item:1009
ERROR: No URL On Researchgate
Starting Search for journal row item:1010
ERROR: No URL On Researchgate
Starting Search for journal row item:1011
ERROR: No URL On Researchgate
Starting Search for journal row item:1012
http://versita.metapress.com/openurl.asp?genre=journal&issn=1232-8855
Starting Search for journal row item:1013
ERROR: No URL On Researchgate
Starting Search for journal row item:1014
http://yalepress.yale.edu/yupbooks/SeriesPage.asp?Series=99
Starting Search for journal row item:1015
http://www.yalelawjournal.org/
Starting Search for journal row item:1016
ERROR: No URL On Researc

In [25]:
try:
    driver.quit()
except:
    driver = connectChrome()
date = str(datetime.datetime.now()).replace(' ','')[0:18].replace(':','')
wb.save('input.xlsx')
wb.save('./logs/homepage/output_'+date+'checkpoint_'+str(check_point)+'.xlsx')#save to homepage logs

##### 