# Mount Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
# (google.colab is only importable inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Requirements

In [None]:
!pip install selenium
!apt-get update 
!apt install chromium-chromedriver
%cd /content/drive/MyDrive/MIR-Crawler/
!pwd

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |████████████████████████████████| 911kB 8.0MB/s 
Installing collected packages: selenium
Successfully installed selenium-3.141.0
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:8 http://archive.ubuntu.com/u

# Imports

In [None]:
import time
import sys
import json

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException  

# Functions

**Fetch Website**

In [None]:
def _paperIdFromUrl(url):
	"""Return the fifth '/'-separated segment of `url`, or None when it is absent.

	For 'https://academic.microsoft.com/paper/<id>/...' that segment is '<id>'.
	"""
	if not url:
		return None
	segments = url.split('/')
	if len(segments) > 4 and segments[4]:
		return segments[4]
	return None


def fetchWebsite(driver, address, delay=2, max_refs=10):
	"""Load one paper page and scrape its metadata.

	Parameters:
		driver: object exposing ``get``, ``page_source`` and
			``find_element_by_xpath`` (a Selenium 3 WebDriver in this notebook).
		address: URL of the paper page to load.
		delay: seconds to sleep after requesting the page (default 2).
		max_refs: maximum number of reference cards to inspect (default 10).

	Returns:
		dict with keys 'id', 'title', 'abstract', 'date', 'authors' and
		'references'. When the page is blocked or a required element is
		missing, 'id' is set to '0' (fields scraped before the failure are
		kept as-is) so the caller can detect the unsuccessful fetch.
	"""
	# paper format dictionary
	paper = {'id': '', 'title': '', 'abstract': '', 'date': '', 'authors': [], 'references': [] }
	# fetch website
	driver.get(address)
	# respect hit rate; this sleep is also the only wait before elements are queried
	time.sleep(delay)
	# A blocked response contains none of the elements below, so report it as a
	# failed fetch right away instead of attempting to parse it.
	if driver.page_source == '<html><head></head><body>The request is blocked</body></html>':
		print('Error: ACCESS DENIED!')
		paper['id'] = '0'
		return paper
	try:
		# id of the paper, taken from the 'originalurl' attribute of <body>
		body = driver.find_element_by_xpath("//body")
		doi = _paperIdFromUrl(body.get_attribute('originalurl'))
		if doi is None:
			# A missing/malformed attribute is treated like any other missing element.
			raise NoSuchElementException("body has no usable 'originalurl' attribute")
		paper['id'] = doi

		# title
		paper['title'] = driver.find_element_by_xpath("//div[@class='name-section']/h1[@class='name']").text

		# abstract
		paper['abstract'] = driver.find_element_by_xpath("//div[@class='name-section']/p").text

		# date published
		paper['date'] = driver.find_element_by_xpath("//div[@class='name-section']/a[@class='au-target publication']/span[@class='year']").text

		# authors list: walk div[1], div[2], ... until one is missing
		authorsList = driver.find_element_by_xpath("//div[@class='name-section']/ma-author-string-collection/*/div[@class='authors']")
		authors = []
		position = 1
		while True:
			try:
				author = authorsList.find_element_by_xpath("div[" + str(position) + "]/a[@class='au-target author link']")
			except NoSuchElementException: # end of authors
				break
			authors.append(author.text)
			position += 1
		paper['authors'] = authors

		# references list: inspect at most max_refs cards
		refsList = driver.find_element_by_xpath("//div[@class='ma-paper-results']/div[@class='results']")
		refs = []
		for position in range(1, max_refs + 1):
			try:
				ref = refsList.find_element_by_xpath("ma-card[" + str(position) + "]/div/compose/div/div[@class='primary_paper']/a")
			except NoSuchElementException: # end of refs
				break
			refId = _paperIdFromUrl(ref.get_attribute('href'))
			# Skip cards whose link carries no paper id rather than aborting the page.
			if refId is not None:
				refs.append(refId)
		paper['references'] = refs
	except NoSuchElementException:
		print('Error: ELEMENTS NOT FOUND! for paper ' + (_paperIdFromUrl(address) or address))
		paper['id'] = '0'
	return paper


**Initialize Web Driver**

In [None]:
def init():
	"""Create and return a headless Chrome WebDriver configured for crawling."""
	chromeOptions = Options()
	chromeOptions.headless = True # no visible browser window
	# command-line switches handed to Chrome, in the order they are applied
	switches = (
		'--disable-gpu', # disable graphics
		"--window-size=1920,1200", # window size of the browser
		'--no-sandbox',
		'--disable-dev-shm-usage',
		"user-agent=Three Musketeers", # custom user agent
		'log-level=3', # only log fatal errors
	)
	for switch in switches:
		chromeOptions.add_argument(switch)
	# build the web driver from the options above
	return webdriver.Chrome(options=chromeOptions, executable_path='chromedriver')

**Crawler Thread**

In [None]:
def reptile(starter, LIMIT, output='database.json'):
	"""Breadth-first crawl of paper pages, starting from the URLs in `starter`.

	Parameters:
		starter: path of a UTF-8 text file with one paper URL per line; the
			paper id is the fifth '/'-separated segment of each URL. Blank or
			malformed lines are ignored.
		LIMIT: stop once this many papers have been fetched successfully.
		output: path of the JSON file the collected papers are written to
			(default 'database.json').

	The crawl also stops when the queue runs dry before LIMIT is reached.
	The driver is always shut down and whatever was collected is always
	written to `output`, even if the crawl is interrupted by an exception.
	"""
	from collections import deque # O(1) pops from the front of the queue

	# get crawling queue
	queue = deque()
	with open(starter, 'r', encoding = 'utf-8') as f:
		for line in f:
			# strip() handles '\n', '\r\n' and a final line without a newline
			segments = line.strip().split('/')
			if len(segments) > 4 and segments[4]:
				queue.append(segments[4])

	# initialize driver
	driver = init()
	print('\n******** Crawling Initiated ********\n')

	# papers collected so far, and the ids they were fetched under
	db = []
	fetchedPapers = set()

	try:
		# crawling loop
		while queue and len(fetchedPapers) < LIMIT:
			paperId = queue.popleft() # first paper of the queue
			if paperId in fetchedPapers: # fetched before
				continue
			# new paper
			address = "https://academic.microsoft.com/paper/" + paperId # convert paper id to url
			paper = fetchWebsite(driver, address) # fetch website
			print('{}\t{}\tfetched'.format(len(fetchedPapers), paperId))
			if paper['id'] == '0': # unsuccessful fetch
				continue
			db.append(paper) # add paper to the database
			fetchedPapers.add(paperId) # just fetched
			# add references to the queue
			queue.extend(paper['references'])
	finally:
		try:
			# terminate the driver
			driver.quit()
		finally:
			print('\n******** Crawling Terminated ********\n')
			# save database to local file (also on failure, so a long crawl is not lost)
			with open(output, 'w', encoding = 'utf-8') as f:
				json.dump(db, f)

# Run Program

In [None]:
# Crawl starting from the URLs listed in start.txt until 5000 papers have been fetched.
reptile('start.txt', 5000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Error: ELEMENTS NOT FOUND! for paper 1569415500
2297	1569415500	fetched
2297	2136480620	fetched
2298	2154652894	fetched
Error: ELEMENTS NOT FOUND! for paper 2134061542
2299	2134061542	fetched
Error: ELEMENTS NOT FOUND! for paper 2140424253
2299	2140424253	fetched
2299	3018757597	fetched
2300	3034971973	fetched
2301	3035160371	fetched
2302	2994749257	fetched
2303	3035743198	fetched
2304	3034429256	fetched
2305	2883780447	fetched
Error: ELEMENTS NOT FOUND! for paper 3111681398
2306	3111681398	fetched
2306	2949736877	fetched
2307	2963136578	fetched
2308	2736601468	fetched
Error: ELEMENTS NOT FOUND! for paper 2963125010
2309	2963125010	fetched
2309	2963857521	fetched
2310	2963542245	fetched
2311	2964082701	fetched
2312	2559085405	fetched
2313	2056250865	fetched
2314	2113691817	fetched
2315	1924403233	fetched
2316	2080018251	fetched
2317	2155280192	fetched
2318	2143866356	fetched
2319	2116625254	fetched
Error: ELEMENTS NOT FOU