# **Scrape Google Scholar**

## **Installing Libraries**

In [None]:
!pip install requests
!pip install pandas
!pip install matplotlib
!pip install lxml 
!pip install beautifulsoup4
!pip install google-search-results 

## **Scraping**

In [None]:
import requests as req, lxml, os, json
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Google Scholar user id of the profile to scrape; it is appended to the base URL below.
urlScholarCode="qChltsQAAAAJ"
# Base URL of a profile page; it ends with `user=` so a user id can be concatenated directly.
urlGoogleScholarProfile="https://scholar.google.co.id/citations?hl=en&user="

In [None]:
def get_headers():
  """Build the HTTP headers sent with each profile request.

  Returns:
    A fresh dict holding a single ``User-agent`` entry with a desktop
    browser identification string.
  """
  browser_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
  headers = dict()
  headers['User-agent'] = browser_agent
  return headers

In [None]:
def _gs_text(node, default=""):
  """Return ``node.text``, or ``default`` when the selector matched nothing (``None``)."""
  return node.text if node is not None else default


def _gs_parse_profile(soup):
  """Extract name, affiliation and interest tags from a parsed profile page."""
  gsProfile = {
    'name':"",
    'university':"",
    'tags':[]
  }

  profile = soup.select_one('#gsc_prf_w')
  if profile is None:
    # The page has no profile box: keep the empty defaults.
    return gsProfile

  gsProfile["name"] = _gs_text(profile.select_one('#gsc_prf_in'))
  # select_one() takes the first `.gsc_prf_il` element inside the profile box.
  gsProfile["university"] = _gs_text(profile.select_one('.gsc_prf_il'))
  gsProfile["tags"] = [tag.text for tag in soup.select('#gsc_prf_int a')]
  return gsProfile


def _gs_parse_publications(soup):
  """Extract the publication table of a parsed profile page as a dict of column lists."""
  rows = {
    'title':[],
    'link':[],
    'authors':[],
    'publisher':[],
    'pub_year':[],
    'cite':[],
    'year':[]
  }

  table = soup.select_one('#gsc_a_b')
  if table is None:
    return rows

  for cell in table.select('.gsc_a_t'):
    anchor = cell.select_one('a')
    grays = cell.select('.gs_gray')

    # Exactly one value is appended per cell for every column, so the lists
    # stay index-aligned even when a piece of markup is missing.
    rows['title'].append(_gs_text(anchor))
    rows['link'].append(anchor.get('href', '') if anchor is not None else '')
    # The first `.gs_gray` is the author line (kept wrapped in a list, as callers
    # already receive it); the second one is the publisher line.
    rows['authors'].append([grays[0].text] if grays else [])
    rows['publisher'].append(grays[1].text if len(grays) > 1 else '')
    rows['pub_year'].append(_gs_text(cell.select_one('.gs_oph')).replace(', ',''))

  rows['cite'] = [_gs_text(cell.select_one('a')) for cell in table.select('.gsc_a_c')]
  rows['year'] = [_gs_text(cell.select_one('span')) for cell in table.select('.gsc_a_y')]
  return rows


def gs_parser(id_gs):
  """Download a Google Scholar profile page and parse it with BeautifulSoup.

  Args:
    id_gs: Google Scholar user id; it is appended to ``urlGoogleScholarProfile``.

  Returns:
    A dict with two keys:
      * ``'profile'``: ``{'name': str, 'university': str, 'tags': list of str}``.
      * ``'journal'``: a dict of lists with the keys ``'title'``, ``'link'``,
        ``'authors'`` (each entry is a one-element list holding the raw author
        string), ``'publisher'``, ``'pub_year'``, ``'cite'`` and ``'year'``.
        All values are the strings found in the page; nothing is converted.
    Elements that are missing from the page yield empty strings / empty lists
    instead of raising ``AttributeError``.

  Raises:
    Whatever ``requests`` raises for connection problems, including a timeout
    when the server does not answer within 30 seconds.
  """
  # A timeout keeps the notebook from hanging forever on an unresponsive server.
  response = req.get(urlGoogleScholarProfile+id_gs, headers=get_headers(), timeout=30)
  soup = BeautifulSoup(response.text,'lxml')

  return {
    'profile': _gs_parse_profile(soup),
    'journal': _gs_parse_publications(soup)
  }


In [None]:
def gs_parser_pandas(id_gs):
  """Download a Google Scholar profile page and print its profile and HTML tables.

  The profile box is parsed with BeautifulSoup; every ``<table>`` in the page
  is parsed with ``pandas.read_html``. Both results are printed and nothing is
  returned.

  Args:
    id_gs: Google Scholar user id; it is appended to ``urlGoogleScholarProfile``.

  Raises:
    Whatever ``requests`` raises for connection problems (including a timeout
    after 30 seconds) and whatever ``pandas.read_html`` raises for the
    downloaded page (per the pandas documentation, a page without any table
    is an error).
  """
  # Local import so this cell does not depend on an import made elsewhere.
  from io import StringIO

  # A timeout keeps the notebook from hanging forever on an unresponsive server.
  text = req.get(urlGoogleScholarProfile+id_gs, headers=get_headers(), timeout=30).text

  soup = BeautifulSoup(text,'lxml')
  gsProfile = {
    'name':"",
    'university':"",
    'tags':[]
  }

  profile = soup.select_one('#gsc_prf_w')
  if profile is not None:
    name_node = profile.select_one('#gsc_prf_in')
    university_node = profile.select_one('.gsc_prf_il')

    # A missing element leaves the empty default instead of raising AttributeError.
    gsProfile["name"] = name_node.text if name_node is not None else ""
    gsProfile["university"] = university_node.text if university_node is not None else ""
    gsProfile["tags"] = [tag.text for tag in soup.select('#gsc_prf_int a')]

  print(gsProfile)

  # Passing literal HTML to read_html is deprecated in recent pandas; a
  # file-like wrapper is accepted by old and new versions alike.
  dfs = pd.read_html(StringIO(text))

  for data in dfs:
    print(data)

In [None]:
def scrape_google_scholar(idGoogleScholar):
  """Scrape one Google Scholar profile with ``gs_parser`` and print the result.

  Args:
    idGoogleScholar: Google Scholar user id, passed to ``gs_parser`` unchanged.
  """
  scraped = gs_parser(idGoogleScholar)
  print(scraped)

In [None]:
import time

# Time both parsers on the same profile. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed time; time.time() is
# wall-clock time and can jump when the system clock is adjusted.
startOne = time.perf_counter()
scrape_google_scholar(urlScholarCode)
endOne = time.perf_counter()

startTwo = time.perf_counter()
gs_parser_pandas(urlScholarCode)
endTwo = time.perf_counter()

# Each figure includes the HTTP download, because both functions fetch the page themselves.
print("soup : {}".format(endOne - startOne))
print("pandas : {}".format(endTwo - startTwo))

{'profile': {'name': 'Marli Candra', 'university': 'UIN Sunan Ampel', 'tags': ['Victimology', 'Penology', 'Islamic Criminal Law']}, 'journal': {'title': ['Pengantar Studi Islam', 'Victim precipitation dalam Tindak Pidana Pencurian', 'The Penology of Islamic Criminal Law: Reintroduction of Islamic Penology', 'Tinjauan Viktimologi Terhadap Hak Perlindungan Penyalahgunaan Narkotika (Victimless Crime)', 'The Penology of Death Punishment (An Analytical Study Indonesian and Islamic Criminal Law)', 'Polarisasi berita bohong COVID-19: Viktimisasi kolektif', 'VICTIM PRECIPITATION DALAM TINDAK PIDANA PENCURIAN (SEBUAH PENDEKATAN VIKTIMOLOGI)', 'Urgensi DSN-MUI Sebagai Otoritas Pengawas Syariah Lembaga Keuangan Syariah', 'Tinjauan Filsafat Hukuman dalam Islam terhadap Overspel dan Zina', 'Limitation and Reduction Human Rights in Indonesia Through Substitute Government Regulations', 'Death penalty under hudud and qisas from modern penologys perspective'], 'link': ['/citations?view_op=view_citation

## **Visualization**

## **Extracting**