<a href="https://colab.research.google.com/github/cazdemun/colab-notebooks/blob/main/Inspiration_Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping

## Selenium Installation

In [2]:
import subprocess

def setup_selenium():
  """
  f: () -> ()
  Downloads a shell script to `bash.sh` with wget and runs it with `sh`.

  Raises subprocess.CalledProcessError when the download fails, so an empty or
  partial `bash.sh` is never executed. A non-zero exit status of the script
  itself is reported but does not raise.
  """
  # NOTE(security): the script is fetched through a URL shortener and executed
  # unverified; pin the final URL or check a checksum before trusting it.
  subprocess.run(['wget', 'https://bit.ly/3fk4vlc', '-O', 'bash.sh'], check=True)

  install = subprocess.run(['sh', 'bash.sh'])
  if install.returncode != 0:
    print(f"bash.sh exited with status {install.returncode}")

setup_selenium()

## Selenium configuration

In [3]:
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pprint

# Shared pretty printer used by the helper functions of this notebook
pp = pprint.PrettyPrinter(indent=4)

# Standard webdriver options
# https://stackoverflow.com/questions/51046454/how-can-we-use-selenium-webdriver-in-colab-research-google-com
chrome_options = webdriver.ChromeOptions()

# Cloudflare bypass (incomplete or doesn't work)
# https://blog.m157q.tw/posts/2020/09/11/bypass-cloudflare-detection-while-using-selenium-with-chromedriver/
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# Command-line switches, added in the same relative order as before
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

# Headless anti-bot detection
# https://intoli.com/blog/making-chrome-headless-undetectable/
user_agent = (
  'Mozilla/5.0 (X11; Linux x86_64) '
  'AppleWebKit/537.36 (KHTML, like Gecko) '
  'Chrome/60.0.3112.50 Safari/537.36'
)
chrome_options.add_argument('user-agent=' + user_agent)

def print_body(wd):
  """
  f: webdriver -> ()
  Pretty-prints the text of the current page's <body> element.
  """
  body = wd.find_element(By.TAG_NAME, "body")
  pp.pprint(body.text)

def scopus_signin(wd, email="u201512699@upc.edu.pe", password="4De#4Dmf"):
  """
  f: webdriver -> webdriver
  Signs in through the Scopus home page and returns the same webdriver.

  email, password: credentials typed into the two sign-in screens.

  Note: it seems that the web driver keeps the credentials after login, this can be useful in a UI
  """
  # NOTE(security): the default credentials are kept only so that existing
  # calls keep working; they should be removed from the source (and rotated)
  # and supplied by the caller instead.

  # home page
  wd.get("https://www.scopus.com")
  wd.find_element(By.ID, "signin_link_move").click()

  # sign in
  # screen 1 - email
  wd.find_element(By.ID, "bdd-email").send_keys(email)
  wd.find_element(By.ID, "bdd-elsPrimaryBtn").click()

  # screen 2 - password
  wd.find_element(By.ID, "bdd-password").send_keys(password)
  # the primary button is looked up again: it is used on both screens
  wd.find_element(By.ID, "bdd-elsPrimaryBtn").click()

  return wd

In [4]:
# Start Chrome with the options configured above and sign in to Scopus.
# `wd` is the global driver that the following cells read.
wd = scopus_signin(webdriver.Chrome('chromedriver',options=chrome_options))
# Last expression of the cell: the notebook shows the URL reached after sign-in.
wd.current_url

# OOP WebDriver

In [30]:
from typing import List
import requests

class SearchResult:
  """One search hit: the title of a record together with its abstract."""

  def __init__(self, title, abstract):
    self.title: str = title
    self.abstract: str = abstract

  def __str__(self):
    # Same layout as a triple-quoted block indented by four spaces:
    # leading newline, two labelled lines, trailing indented line.
    pieces = [
      "",
      f"    Title: {self.title}",
      f"    Abstract: {self.abstract}",
      "    ",
    ]
    return "\n".join(pieces)

class ScopusWebDriver:
  """
  Wraps a signed-in Selenium webdriver and tracks which Scopus page it is on.

  The wrapper is a two-state machine ("search.page" | "search.results.page");
  every navigation method checks the current state first and refuses to act
  (printing a message) when called from the wrong one.
  """

  def __init__(self, wd):
    """
    wd: webdriver already pointing at a Scopus page. The initial state is
    inferred from its current URL.
    """
    on_search_page = wd.current_url.startswith('https://www.scopus.com/search/form.uri')

    # State Machine: search.page | search.results.page
    self.value: str = "search.page" if on_search_page else "search.results.page"
    self.wd = wd
    # Search
    # On a results page the query is read back from the page itself.
    self.current_query: str = "" if on_search_page else wd.find_element(By.CLASS_NAME, "queryContainer").text
    self.current_display: int = 20 # 20 | 50 | 100 | 200
    self.current_page: int = 1
    self.total_results: int = 0
    self.search_results: List[SearchResult] = []
    # self.show_all_abstracts:Bool = False, deprecated?

  def __str__(self):
      return f"""
ScopusWebDriver State:
  Machine State: "{self.value}"
  WebDriver object: {self.wd}
  Query: {self.current_query}
  Display: {self.current_display}
  Page: {self.current_page}
  Total Results: {self.total_results}
  Results: {self.search_results}
      """

  def _search_query(self, query: str) -> str:
    """
    transition: search.page -> search.results.page
    Types `query` into the search box, submits it and returns the text of the
    "queryContainer" element of the results page. Returns "" (after printing a
    message) when the driver is not on the search page.
    """
    if not self.value == "search.page":
      print("Navigation failed, WebDriver should be on the \"search.page\" state")
      return ""

    search_input = self.wd.find_element(By.ID, "searchterm1")
    search_input.send_keys(query)

    search_button_row = self.wd.find_element(By.ID, "searchBtnRow")
    # NOTE(review): an XPath starting with "//" is evaluated against the whole
    # document, not only inside search_button_row; ".//button[...]" would scope
    # it to the row. Left unchanged because it cannot be verified against the
    # live page from here.
    search_button = search_button_row.find_element(By.XPATH, '//button[@title="Search"]')
    search_button.click()

    self.value = "search.results.page"
    return self.wd.find_element(By.CLASS_NAME, "queryContainer").text

  def return_to_search_page(self) -> bool:
    """
    transition: search.results.page -> search.page
    Clicks the search tab, clears the search box and resets the stored query.
    Returns False (after printing a message) when called from the wrong state.
    """
    if not self.value == "search.results.page":
      print("Navigation failed, WebDriver should be on the \"search.results.page\" state")
      return False

    search_tab = self.wd.find_element(By.ID, "gh-Search")
    search_tab.click()

    # erase whatever the previous search left in the input
    search_input = self.wd.find_element(By.ID, "searchterm1")
    search_input.send_keys(Keys.CONTROL, 'a')
    search_input.send_keys(Keys.BACKSPACE)

    self.value = "search.page"
    self.current_query = ""
    return True

  def search(self, query: str, max: int) -> List[SearchResult]:
    """
    Runs `query`, stores the query text and the total result count, then
    posts an experimental request for all abstracts and prints the response.

    `max` is accepted but not used yet. Always returns [] for now; results
    are not parsed.
    """
    if not self.value == "search.page":
      print("Search failed, WebDriver should be on the \"search.page\" state")
      return []

    self.current_query = self._search_query(query)
    # Use the wrapped driver, not the notebook-global `wd`: the two are only
    # the same object by accident of how the instance was created.
    self.total_results = int(self.wd.find_element(By.CLASS_NAME, 'resultsCount').text.replace(',', ''))

    # build abstract request
    # pp.pprint(self.wd.get_cookies())
    cookies_string = "; ".join([f"{c['name']}={c['value']}" for c in self.wd.get_cookies()])
    # NOTE(security): this prints the session cookies into the cell output.
    print(cookies_string)

    headers = {
        "Cookie": cookies_string,
        "Referer": self.wd.current_url
    }

    # print("https://www.scopus.com/results/showallabstractspreview.uri")

    # NOTE(review): "searchString" and "resultsCount" are hard-coded sample
    # values and do not follow `query` / self.total_results.
    data = {
      "count": "200",
      "offset": "0",
      "searchString": "TITLE-ABS-KEY(proteins)",
      "sortType": "r-f",
      "origin": "resultslist",
      "originationType": "b",
      "clusterRowDisplayCount": "10",
      "resultsPerPage": "200",
      "resultsCount": "6,193,247",
      "eid": "null",
      "listId": "",
      "cite": "",
      "citedAuthorId": "",
      "citingId": "",
      "refeid": "",
      "ref": "",
      "recent": "",
      "recordIndex": "null"
    }

    r = requests.post("https://www.scopus.com/results/showallabstractspreview.uri", headers=headers, json=data)
    print(r.headers)
    print(r.text)
    return []

# Wrap the global driver; the initial machine state is derived from wd.current_url.
scopusWD = ScopusWebDriver(wd)
print(scopusWD.wd.current_url, scopusWD)

https://www.scopus.com/results/results.uri?numberOfFields=0&src=s&clickedLink=&edit=&editSaveSearch=&origin=searchbasic&authorTab=&affiliationTab=&advancedTab=&scint=1&menu=search&tablin=&searchterm1=protein&field1=TITLE_ABS_KEY&dateType=Publication_Date_Type&yearFrom=Before+1960&yearTo=Present&loadDate=7&documenttype=All&accessTypes=All&resetFormLink=&st1=protein&st2=&sot=b&sdt=b&sl=22&s=TITLE-ABS-KEY%28protein%29&sid=0db68610bac03448545772800be5a6c0&searchId=0db68610bac03448545772800be5a6c0&txGid=5f28e97e8a3c7c1be422774218458dd5&sort=r-f&originationType=b&rr= 
ScopusWebDriver State:
  Machine State: "search.results.page"
  WebDriver object: <selenium.webdriver.chrome.webdriver.WebDriver (session="b99ddf7c60830b1e00a67f839121472b")>
  Query: TITLE-ABS-KEY ( protein ) 
  Display: 20
  Page: 1
  Total Results: 0
  Results: []
      


In [32]:
# Run a search (only has an effect from the "search.page" state) and show the resulting state.
scopusWD.search("protein", 1000)
print(scopusWD.wd.current_url, scopusWD)

# Go back to the search form so that another query can be issued.
scopusWD.return_to_search_page()
print(scopusWD.wd.current_url, scopusWD)

s_sess=%20s_cpc%3D0%3B%20s_ppvl%3Dsc%25253Asearch%25253Adocument%252520searchform%252C40%252C40%252C600%252C800%252C600%252C800%252C600%252C1%252CP%3B%20s_ppv%3Dsc%25253Asearch%25253Adocument%252520results%252C2%252C2%252C600%252C800%252C600%252C800%252C600%252C1%252CP%3B%20c21%3Dtitle-abs-key%2528protein%2529%3B%20e13%3Dtitle-abs-key%2528protein%2529%253A1%3B%20c13%3Drelevance%3B%20e41%3D1%3B%20s_sq%3D%3B%20s_cc%3Dtrue%3B; AWSELB=CB9317D502BF07938DE10C841E762B7A33C19AADB1328F3C3DB5264F4CFEF1A02C9C063CCB3841E23B31F6C6F833247060CD6C008CA31AAC5A6BDE3E4B4DACF34F3854CEEB689045CC638AEB49B8B87EE8F8FC1CB9; AMCV_4D6368F454EC41940A4C98A6%40AdobeOrg=359503849%7CMCIDTS%7C18599%7CMCMID%7C88508890635440160581675557641314175166%7CMCAID%7CNONE%7CMCOPTOUT-1606917923s%7CNONE%7CMCAAMLH-1607515523%7C7%7CMCAAMB-1607515523%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CMCSYNCSOP%7C411-18606%7CMCCIDH%7C1863782%7CvVersion%7C5.0.1; s_pers=%20v8%3D1606913547116%7C1701521547116%3B%20v8_s%3DFirst%2520Visit%

In [11]:
# Return to the search form; prints a message and does nothing when already on "search.page".
scopusWD.return_to_search_page()
print(scopusWD.wd.current_url, scopusWD)

https://www.scopus.com/search/form.uri?zone=TopNavBar&origin=resultslist&display=basic 
ScopusWebDriver State:
  Machine State: "search.page"
  WebDriver object: <selenium.webdriver.chrome.webdriver.WebDriver (session="b99ddf7c60830b1e00a67f839121472b")>
  Query: 
  Display: 20
  Page: 1
  Total Results: 6193247
  Results: []
      


## Declarative bot behavior

This machine only describes the bot's data-collecting behaviour; it neither processes nor analyzes the data.

In [None]:
def scopusMachine(wd=None):
  """
  f: webdriver | None -> dict
  Returns the declarative description of the bot's navigation state machine.

  The result has an "initial" state name, a "context" holding the webdriver
  and the search data, and a "states" table mapping each state to the events
  it accepts. Every event names its target state and a list of actions,
  written as strings; nothing is executed here.
  """
  return {
    "initial": "search.page",
    "context": {
        # key is "wd" so that it matches the `context.wd` the actions refer to
        "wd": wd,
        "current_query": "",
        "current_display": "20",
        "search_results": [] # raw html element
        # show_abstracts: false
        # page: 1
        # total_pages: total results element / current_display
    },
    "states": {
      "search.page": {
          "on" : {
            "SEARCH": {
                "target": "search.results.page",
                "actions": ["wd_search_query(context.wd)", "update_current_query(context.current_query)"]
            }
          }
      },
      "search.results.page": {
          "on": {
              "RETURN.TO.SEARCH": {
                "target": "search.page",
                "actions": [
                  "wd_return_to_search(context.wd)",
                  "delete_current_query(context.current_query)",
                  "wd_extract_raw_results(context.wd)",
                ]
              },
              # NOTE(review): extracting results targets "search.page" although
              # the extraction itself happens on the results page - confirm.
              "EXTRACT.RESULTS": {
                "target": "search.page",
                "actions": ["wd_extract_raw_results(context.wd)"]
              },
          }
      }
    }
  }

# send(machine, event, data)

In [None]:
# Fresh browser session for the functional (dict-based) version of the bot.
wd = webdriver.Chrome('chromedriver',options=chrome_options)
signed_wd = scopus_signin(wd)

# # home_signed_text
# print_body(signed_wd)

# Plain-dict state that the wd_* functions below read from and write to.
state = {
  "value": "search.page", # search.results.page
  "wd": signed_wd,
  # search state
  "current_query": "",
  "search_results": [],
  "results_count": 0,
  "show_all_abstracts": False,
  "current_display": 20, # 50, 100, 200
}

# Last expression of the cell: the notebook displays the dict.
state

{'current_display': 20,
 'current_query': '',
 'results_count': 0,
 'search_results': [],
 'show_all_abstracts': False,
 'value': 'search.page',
 'wd': <selenium.webdriver.chrome.webdriver.WebDriver (session="50eaead24443124c0da5e767add03d1f")>}

In [None]:
def wd_return_to_search(wd):
  """
  transition: search.results.page -> search.page
  Clicks the "gh-Search" tab, empties the "searchterm1" input and returns wd.
  """
  wd.find_element(By.ID, "gh-Search").click()

  # select all the text left in the input, then delete it
  query_box = wd.find_element(By.ID, "searchterm1")
  for key_combo in ((Keys.CONTROL, 'a'), (Keys.BACKSPACE,)):
    query_box.send_keys(*key_combo)

  return wd

def wd_search_query(query, wd):
  """
  transition: search.page -> search.results.page
  Types `query` into the "searchterm1" input, clicks the Search button and
  returns wd.
  """
  wd.find_element(By.ID, "searchterm1").send_keys(query)

  button_row = wd.find_element(By.ID, "searchBtnRow")
  button_row.find_element(By.XPATH, '//button[@title="Search"]').click()

  return wd

################################################################################ 

def wd_extract_results_count(wd):
  """
  transition: search.results.page -> search.results.page
  Reads the "resultsCount" element and returns its number as an int
  (thousands separators removed).
  """
  counter_text = wd.find_element(By.CLASS_NAME, 'resultsCount').text
  digits_only = counter_text.replace(',', '')
  return int(digits_only)

def wd_show_all_abstracts(wd):
  """
  transition: search.results.page -> search.results.page
  Clicks the "previewAllAbstractsLinkText" link and returns True.
  """
  wd.find_element(By.ID, "previewAllAbstractsLinkText").click()
  return True

def wd_extract_raw_results(results_count, current_display, wd):
  """
  transition: search.results.page -> search.results.page
  Switches the result list to its largest page size, makes sure the abstracts
  are shown and returns the rows of the first page.

  results_count: total number of hits of the current search.
  current_display: accepted for interface compatibility; not used yet, the
    display is always switched below.
  wd: webdriver on the results page.

  Returns a list of {"title": str, "abstract": str} dicts, at most 200 long.
  Returns [] without touching the page when there are no results.
  """
  page_size = 200
  expected_rows = min(results_count, page_size)
  if expected_rows <= 0:
    return []

  # ws_change_display(200)
  wd.find_element(By.ID, "resultsPerPage-button").click()
  # presumably "ui-id-4" is the 200-per-page option - verify against the page
  wd.find_element(By.ID, "ui-id-4").click()

  WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.ID, "previewAllAbstractsLinkText")))

  # ws_show_all_abstracts
  show_abstracts_link = wd.find_element(By.ID, "previewAllAbstractsLinkText")
  if "Hide" in show_abstracts_link.text:
    print("Don't click!")
    print(show_abstracts_link.text)
  if "Show" in show_abstracts_link.text:
    print("Click!")
    print(show_abstracts_link.text)
    show_abstracts_link.click()

  # Wait for the LAST abstract that can exist on this page. Waiting for
  # "previewAbstract200" unconditionally timed out on searches with fewer
  # than 200 hits.
  WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.ID, f"previewAbstract{expected_rows}")))

  #this does not have support for pages yet
  # row ids are 0-based, abstract ids are 1-based
  return [
    {
      "title": wd.find_element(By.ID, f'resultDataRow{i}').text,
      "abstract": wd.find_element(By.ID, f'previewAbstract{i + 1}').text
    }
    for i in range(expected_rows)
  ]


In [None]:
# search.page -> search.results.page: run the query and collect the first page of results.
state["current_query"] = "protein"
state["wd"] = wd_search_query(state["current_query"], state["wd"])
state["value"] = "search.results.page"
state["show_all_abstracts"] = wd_show_all_abstracts(state["wd"])
state["results_count"] = wd_extract_results_count(state["wd"])
state["search_results"] = wd_extract_raw_results(state["results_count"], state["current_display"], state["wd"])
# display(state)

# functional properties
# viscosity
# milk
# soy
# SEARCH RESULTS SIZE - 200
# NEXT PAGE

# search.results.page -> search.page: reset the query and go back to the form.
state["current_query"] = ""
state["wd"] = wd_return_to_search(state["wd"])
state["value"] = "search.page"
display(state)

Don't click!
Hide all abstracts


{'current_display': 20,
 'current_query': '',
 'results_count': 6189842,
 'search_results': [{'abstract': 'Copyright © 2015 by The American Society for Pharmacology and Experimental Therapeutics. G protein-coupled receptors (GPCRs) contribute to the regulation of every aspect of human physiology and are therapeutic targets for the treatment of numerous diseases. As a consequence, understanding the myriad of mechanisms controlling GPCR signaling and trafficking is essential for the development of new pharmacological strategies for the treatment of human pathologies. Of the many GPCR-interacting proteins, postsynaptic density protein of 95 kilodaltons, disc large, zona occludens-1 (PDZ) domain-containing proteins appear most abundant and have similarly been implicated in disease mechanisms. PDZ proteins play an important role in regulating receptor and channel protein localization within synapses and tight junctions and function to scaffold intracellular signaling protein complexes. In t

In [None]:
import re

# Identifier format that the patterns below look for:
# "The first character is a numeral in the range 1-9, while the last three
# characters can be either numerals (in the range 0-9) or letters (in the
# range A-Z in the Latin alphabet). Plans for an expanded identification code
# system that handle more entries have been announced."
# NOTE(review): both patterns also accept a leading 0 and lowercase letters,
# which is wider than the description above.

# Raw strings keep "\[" and "\]" as regex escapes instead of invalid string
# escapes; the patterns are compiled once instead of on every iteration.
delimited_candidate = re.compile(r'[()\[\]{};,. ][0-9][0-9A-Za-z]{3}[()\[\]{};,. ]', re.DOTALL)
bare_candidate = re.compile(r'[0-9][0-9A-Za-z]{3}', re.DOTALL)

for i, s in enumerate(state["search_results"]):
  print(i)
  # x = re.search("[0-9][0-9A-Za-z]{3}", s['abstract']) 
  # x = re.search("protein[A-Za-z]", s['abstract']) 
  # candidates surrounded by punctuation or a space
  matches = delimited_candidate.findall(s['abstract'])
  print(matches)
  # candidates anywhere, including inside longer tokens
  matches_1 = bare_candidate.findall(s['abstract'])
  print(matches_1)


0
[' 2015 ']
['2015']
1
[' 2011 ', ' 2011 ']
['2011', '2011']
2
[' 2006,']
['2006']
3
[' 2016 ']
['2016']
4
[]
[]
5
[' 2008 ']
['2008']
6
[' 2007 ']
['2007']
7
[' 2015 ']
['2015']
8
[' 2010 ']
['2010']
9
[]
[]
10
[' 2013 ']
['2013']
11
[]
[]
12
[]
[]
13
[' 2017 ', ' 2017;', ' 2016 ']
['2017', '2017', '2016']
14
[' 2006 ']
['2006']
15
[' 2007 ']
['2007']
16
[]
['4833', '418L', '03Ri', '9D61', '1689', '8WUF', '1046']
17
[' 2013 ']
['2013']
18
[]
[]
19
[' 2014 ']
['2014']
20
[' 2006 ']
['2006']
21
[' 2011 ']
['2011']
22
[' 2018 ', ' 1012 ']
['2018', '1012']
23
[' 2015 ']
['2015']
24
[' 2013 ']
['2013']
25
[' 2008 ']
['2008']
26
[' 2002 ']
['2002']
27
[]
[]
28
[' 2006 ']
['100A', '2006']
29
[' 2020 ']
['2020']
30
[' 2002 ']
['2002']
31
[' 2013 ']
['1750', '1752', '1750', '1752', '2013']
32
[' 2004 ']
['2004']
33
[' 2018,']
['2018']
34
[' 2012 ']
['3K9M', '3K4M', '2012']
35
[' 2009 ']
['2009']
36
[' 2013 ']
['2013']
37
[' 2013 ']
['2013']
38
[' 2010 ', ' 2010 ']
['2010', '2010']
39
[' 1500 

In [None]:
# search.results.page -> search.page: clear the query and return to the search form.
state["current_query"] = ""
state["wd"] = wd_return_to_search(state["wd"])
state["value"] = "search.page"
display(state)

In [None]:
# state["current_query"] = "functional properties"
# state["wd"] = wd_search_query(state["current_query"], state["wd"])
# state["value"] = "search.results.page"
# state["search_results"] = wd_extract_raw_results(state["wd"])
# display(state)

# state["current_query"] = ""
# state["wd"] = wd_return_to_search(state["wd"])
# state["value"] = "search.page"
# display(state)

In [None]:
# def wd_return_to_search(wd):
#   """
#   transition: search.results.page -> search.page
#   """
#   search_tab = wd.find_element(By.ID ,"gh-Search")
#   search_tab.click()

#   # search_input = wd.find_element(By.ID ,"searchterm1")
  
#   # # erase current input
#   # search_input.send_keys(Keys.CONTROL, 'a')
#   # search_input.send_keys(Keys.BACKSPACE)
#   return wd

# def wd_search_query(query, wd):
#   """
#   transition: search.page -> search.results.page
#   """
#   search_input = wd.find_element(By.ID ,"searchterm1")
  
#   # erase current input
#   search_input.send_keys(Keys.CONTROL, 'a')
#   search_input.send_keys(Keys.BACKSPACE)

#   search_input.send_keys(query)

#   search_button_row = wd.find_element(By.ID ,"searchBtnRow")
#   search_button = search_button_row.find_element(By.XPATH, '//button[@title="Search"]')
#   search_button.click()

#   # print_body(wd)
#   return wd

# def wd_extract_results_count(wd):
#   results_count = (int)(wd.find_element(By.CLASS_NAME ,'resultsCount').text.replace(',', ''))
#   return results_count

# def wd_extract_raw_results(results_count, current_display, wd):
#   """
#   transition: search.results.page -> search.results.page
#   """

#   # this should be a separated event
#   show_abstracts_link = wd.find_element(By.ID ,"previewAllAbstractsLinkText")
#   show_abstracts_link.click()
#   # print("show_abstracts_link", show_abstracts_link.text)
#   # print("show_abstracts_link", show_abstracts_link.tag_name)
  
#   # show_abstracts_button = wd.find_element(By.ID ,"previewAllAbstractsLinkText").find_element(By.XPATH, '//a[@class="secondaryLink"]')
#   # print("show_abstracts_button", show_abstracts_button.text)
#   # print("show_abstracts_button", show_abstracts_button.tag_name)
#   # show_abstracts_button.click()

#   WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.ID, "previewAbstract1")))

#   #this does not have support for pages yet
#   results = [] 
#   for i in range(0, min(results_count, current_display)):
#     row_title = wd.find_element(By.ID , f'resultDataRow{i}').text
#     # print(f'previewAbstract{i + 1}')
#     row_abstract = wd.find_element(By.ID , f'previewAbstract{i + 1}').text
#     results.append({
#         "title": row_title,
#         "abstract": row_abstract
#     })

#   # results = wd.find_element(By.ID ,"srchResultsList")
#   # results_list = results.find_elements(By.TAG_NAME ,"tr")
#   # for r in results_list:
#   #   pp.pprint(r)
#   #   pp.pprint(r.text)

#   return results

Failed experiment: fetching the abstracts with a direct POST request

In [None]:
  # def search(self, query: str, max: int) -> List[SearchResult]:
  #   if not self.value == "search.page":
  #     print("Search failed, WebDriver should be on the \"search.page\" state")
  #     return []

  #   self.current_query = self._search_query(query)
  #   self.total_results = (int)(wd.find_element(By.CLASS_NAME ,'resultsCount').text.replace(',', ''))
    
  #   # build abstract request
  #   # pp.pprint(self.wd.get_cookies())
  #   cookies_string = "; ".join([f"{c['name']}={c['value']}" for c in self.wd.get_cookies()])
  #   print(cookies_string)
    
  #   headers={
  #       "Cookie": cookies_string,
  #       "Referer": wd.current_url
  #   }
    
  #   # print("https://www.scopus.com/results/showallabstractspreview.uri")

  #   data = {
  #     "count": "200",
  #     "offset": "0",
  #     "searchString": "TITLE-ABS-KEY(proteins)",
  #     "sortType": "r-f",
  #     "origin": "resultslist",
  #     "originationType": "b",
  #     "clusterRowDisplayCount": "10",
  #     "resultsPerPage": "200",
  #     "resultsCount": "6,193,247",
  #     "eid": "null",
  #     "listId": "",
  #     "cite": "",
  #     "citedAuthorId": "",
  #     "citingId": "",
  #     "refeid": "",
  #     "ref": "",
  #     "recent": "",
  #     "recordIndex": "null"
  #   }

  #   r = requests.post("https://www.scopus.com/results/showallabstractspreview.uri", headers=headers, json=data)
  #   print(r.headers)
  #   print(r.text)
  #   return []