# Web scraping of Google Scholar: parts 9-12

The following code performs web scraping of Google Scholar using the Python libraries Beautiful Soup and Requests. Scraping papers and co-authors is our main objective.

In [1]:
import numpy as np
import pandas as pd

import requests
import time
from time import sleep
from datetime import datetime
from random import randint

import os

from bs4 import BeautifulSoup
import lxml

In [2]:
# Create the results folder (all scraped CSV files are written below it).
# makedirs with exist_ok=True replaces the separate exists()/mkdir() pair: it is
# a no-op when the directory is already there and also creates missing parents.
route0 = "../web_scraping_data"

os.makedirs(route0, exist_ok=True)

## Load HCPs

In [2]:
# Load the file with HCP data
hcp_df = pd.read_excel("../novartis_data/BC & Melanoma targets Sweden.xlsx")

# The copyright notice is separated from the data by a row whose "Name" is
# empty. Locate that row by position, so the cut-off does not depend on the
# index labels of hcp_df.
name_is_na = hcp_df["Name"].isna().to_numpy()
# If no empty row exists, keep every row instead of failing with an IndexError
ind_cpr = int(name_is_na.argmax()) if name_is_na.any() else len(hcp_df)

# Remove copyrights and duplicates
hcp_names_all = pd.Series(hcp_df["Name"].iloc[:ind_cpr].unique())

## Querying Google Scholar using Beautiful Soup and Requests

The scraping of Google Scholar is not done in one go but in 16 blocks of 13 HCPs each, so we now write a function that performs the scraping for any one of the 16 blocks.

In [5]:
def _parse_gs_article(article):
    """Extract the fields of one Google Scholar result entry (a `.gs_ri` node).

    Every field is looked up independently, so an entry that lacks e.g. a link
    or a snippet still yields a row of its own; the missing pieces are None.

    Returns a dict with the keys "title", "title_link", "authors",
    "publications" and "snippet".
    """
    def _text(selector):
        node = article.select_one(selector)
        return node.text if node is not None else None

    link = article.select_one('.gs_rt a')
    byline = _text('.gs_a')
    # The byline is split on "- "; the first piece is stored as the authors,
    # the second one as the publication info.
    byline_parts = byline.split("- ") if byline is not None else []

    return {
        "title": _text('.gs_rt'),
        "title_link": link.get('href') if link is not None else None,
        "authors": byline_parts[0] if len(byline_parts) > 0 else None,
        "publications": byline_parts[1] if len(byline_parts) > 1 else None,
        "snippet": _text('.gs_rs'),
    }


def gs_bs_scraping(hcp_names, position, timeout=30):
    """Scrape the first three Google Scholar result pages for a block of HCPs.

    For every name the quoted name is queried at start offsets 0, 10 and 20,
    with a random pause of 1-5 seconds before each request. One row is stored
    per retrieved article; a page without articles contributes a single
    placeholder row holding only "hcp_name" and "num_articles". The collected
    rows are written to f"{route0}/results_queries_gs_{position}.csv".

    Parameters
    ----------
    hcp_names : sequence of str (e.g. a pandas Series)
        Names to query, in order.
    position : int
        1-based number of the block; used in the progress output and in the
        name of the CSV file.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    None

    Notes
    -----
    Relies on the module-level names `route0` (output folder) and
    `hcp_names_all` (only its length, for the progress counter).
    "num_articles" is the running total of articles found for the HCP up to
    and including the page the row comes from.
    """
    start_time_d = datetime.now()
    start_time_t = time.time()

    # Request settings do not change between queries, so build them once.
    headers = {
        'User-agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    # NOTE(review): only an 'http' entry is given although the URL below is
    # https -- confirm whether an 'https' proxy entry is intended as well.
    proxies = {
        'http': os.getenv('HTTP_PROXY')
    }

    records = []

    for i, name_i in enumerate(hcp_names): # Level: HCPs
        query_i = '"{}"'.format(name_i) # put the doctor's name in quotes
        results_num = 0

        for j in [0, 10, 20]: # Level: Page
            sleep(randint(1, 5))
            print(f"{(position - 1)*len(hcp_names) + i + 1}/{len(hcp_names_all)} --- {query_i}")
            print(f"{j + 1}-{j + 10}")

            params = {
                "q": query_i,
                "hl": "en",
                "start": j
            }

            # A timeout keeps one stalled connection from blocking the whole run
            html = requests.get(
                'https://scholar.google.com/scholar',
                headers=headers,
                params=params,
                proxies=proxies,
                timeout=timeout,
            ).text

            soup = BeautifulSoup(html, 'lxml')

            # Check how many articles there are on this page
            results_i = soup.select('.gs_ri')
            results_num += len(results_i)

            if results_i:
                for article in results_i: # Level: Articles
                    row = _parse_gs_article(article)
                    row["hcp_name"] = name_i
                    row["num_articles"] = results_num
                    records.append(row)
            else:
                # Keep a trace of the HCP even when the page has no articles
                records.append({
                    "hcp_name": name_i,
                    "num_articles": results_num
                })

    # Store data at the end (nothing is written for an empty list of names)
    if records:
        print(f"saving file corresponding to results_queries_gs_{position}.csv")
        papers_df = pd.DataFrame(records)
        papers_df.to_csv(f"{route0}/results_queries_gs_{position}.csv", index = False)

    end_time_d = datetime.now()
    end_time_t = time.time()
    duration = end_time_t - start_time_t
    print(f"start time: {start_time_d}")
    print(f"end time: {end_time_d}")
    print(f"duration: {int(np.floor(duration/3600))} hours {(duration-3600*np.floor(duration/3600))/60:.4} minutes")

### Part 9/16: Doctors 105-117

In [6]:
# Block 9: rows at positions 104-116 of the full name list, re-indexed from 0
hcp_names = hcp_names_all.iloc[104:117].reset_index(drop=True)
hcp_names

0        Renske Altena
1       Reza Khoshnoud
2     Roger Henriksson
3        Sara Margolin
4      Sigrid Karstorp
5      Thomas Edekling
6      Thomas Hatschek
7       Tobias Lekberg
8      Tommy Fornander
9          Ulrik Narbe
10     Ulrika Palenius
11       Ylva Sandeder
12    Zakaria Einbeigi
dtype: object

In [7]:
# Scrape block 9 and write its CSV file
gs_bs_scraping(hcp_names, position=9)

105/208 --- "Renske Altena"
1-10
105/208 --- "Renske Altena"
11-20
105/208 --- "Renske Altena"
21-30
106/208 --- "Reza Khoshnoud"
1-10
106/208 --- "Reza Khoshnoud"
11-20
106/208 --- "Reza Khoshnoud"
21-30
107/208 --- "Roger Henriksson"
1-10
107/208 --- "Roger Henriksson"
11-20
107/208 --- "Roger Henriksson"
21-30
108/208 --- "Sara Margolin"
1-10
108/208 --- "Sara Margolin"
11-20
108/208 --- "Sara Margolin"
21-30
109/208 --- "Sigrid Karstorp"
1-10
109/208 --- "Sigrid Karstorp"
11-20
109/208 --- "Sigrid Karstorp"
21-30
110/208 --- "Thomas Edekling"
1-10
110/208 --- "Thomas Edekling"
11-20
110/208 --- "Thomas Edekling"
21-30
111/208 --- "Thomas Hatschek"
1-10
111/208 --- "Thomas Hatschek"
11-20
111/208 --- "Thomas Hatschek"
21-30
112/208 --- "Tobias Lekberg"
1-10
112/208 --- "Tobias Lekberg"
11-20
112/208 --- "Tobias Lekberg"
21-30
113/208 --- "Tommy Fornander"
1-10
113/208 --- "Tommy Fornander"
11-20
113/208 --- "Tommy Fornander"
21-30
114/208 --- "Ulrik Narbe"
1-10
114/208 --- "Ulrik Na

In [5]:
# Reload the CSV written for block 9 and display it
results_queries_gs_9 = pd.read_csv(
    filepath_or_buffer=f"{route0}/results_queries_gs_9.csv"
)
results_queries_gs_9

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,[HTML][HTML] Cardiovascular toxicity caused by...,https://www.sciencedirect.com/science/article/...,"R Altena, PJ Perik, DJ Van Veldhuisen, EGE De ...","The lancet oncology, 2009",Cardiovascular toxicity is one of the most dev...,Renske Altena,10
1,"[HTML][HTML] Maintaining success, reducing tre...",https://www.sciencedirect.com/science/article/...,"J Beyer, P Albers, R Altena, J Aparicio, C Bok...","Annals of oncology, 2013","In November 2011, the Third European Consensus...",Renske Altena,10
2,Angiotensin II–receptor inhibition with candes...,https://jamanetwork.com/journals/jamaoncology/...,"AH Boekhout, JA Gietema, BM Kerklaan…","JAMA …, 2016",Importance This is the first randomized placeb...,Renske Altena,10
3,[HTML][HTML] Clinical challenges related to no...,https://www.ncbi.nlm.nih.gov/pmc/articles/pmc3...,"R Altena, E van Roon, R Folkeringa, H de Wit…","…, 2014",Figure 1. Median (5th and 95th percentiles) pl...,Renske Altena,10
4,[HTML][HTML] Growth differentiation factor 15 ...,https://journals.plos.org/plosone/article?id=1...,"R Altena, RSN Fehrmann, H Boer, EGE de Vries…","PloS one, 2015",Introduction Chemotherapy-related endothelial ...,Renske Altena,10
5,Sickness absence and disability pension among ...,https://onlinelibrary.wiley.com/doi/abs/10.111...,"SAM Gernaat, A Johnsson, R Altena…","European Journal of …, 2021",Objective We aimed to determine the longitudin...,Renske Altena,10
6,Long-term favorable effects of physical exerci...,https://journals.sagepub.com/doi/abs/10.1177/1...,"F Wiggenraad, KA Bolam, S Mijwel…","Integrative cancer …, 2020",Purpose: We evaluate longitudinal changes in s...,Renske Altena,10
7,[HTML][HTML] Metastatic Anaplastic Thyroid Car...,https://link.springer.com/article/10.1007/s120...,"A Stenman, LS Hellgren, K Jatta, M Hysek…","Endocrine …, 2020",Anaplastic thyroid carcinoma (ATC) exhibits an...,Renske Altena,10
8,[HTML][HTML] Single-nucleotide polymorphism in...,https://www.sciencedirect.com/science/article/...,"H Boer, NDL Westerink, R Altena, J Nuver…","European Journal of …, 2016",Purpose Chemotherapy-treated testicular cancer...,Renske Altena,10
9,[HTML][HTML] Evidence-based prediction and pre...,https://cardiooncologyjournal.biomedcentral.co...,"R Altena, L Hubbert, NA Kiani…","Cardio …, 2021",Cancer treatment-related morbidity relevantly ...,Renske Altena,10


Wait at least 30 minutes before running the next part.

### Part 10/16: Doctors 118-130

In [9]:
# Block 10: rows at positions 117-129 of the full name list, re-indexed from 0
hcp_names = hcp_names_all.iloc[117:130].reset_index(drop=True)
hcp_names

0       Irma Fredriksson
1            Paul Holmer
2     Christine Lundgren
3             Sara Wirén
4          Emma Sjöström
5        Johan Falkenius
6        Alexios Matikas
7       Sandra Cedervall
8          Anneli Fahlen
9        Helene Almström
10    Hildur Helgadottir
11       Gustav Ullenhag
12           Ylva Naeser
dtype: object

In [10]:
# Scrape block 10 and write its CSV file
gs_bs_scraping(hcp_names, position=10)

118/208 --- "Irma Fredriksson"
1-10
118/208 --- "Irma Fredriksson"
11-20
118/208 --- "Irma Fredriksson"
21-30
119/208 --- "Paul Holmer"
1-10
119/208 --- "Paul Holmer"
11-20
119/208 --- "Paul Holmer"
21-30
120/208 --- "Christine Lundgren"
1-10
120/208 --- "Christine Lundgren"
11-20
120/208 --- "Christine Lundgren"
21-30
121/208 --- "Sara Wirén"
1-10
121/208 --- "Sara Wirén"
11-20
121/208 --- "Sara Wirén"
21-30
122/208 --- "Emma Sjöström"
1-10
122/208 --- "Emma Sjöström"
11-20
122/208 --- "Emma Sjöström"
21-30
123/208 --- "Johan Falkenius"
1-10
123/208 --- "Johan Falkenius"
11-20
123/208 --- "Johan Falkenius"
21-30
124/208 --- "Alexios Matikas"
1-10
124/208 --- "Alexios Matikas"
11-20
124/208 --- "Alexios Matikas"
21-30
125/208 --- "Sandra Cedervall"
1-10
125/208 --- "Sandra Cedervall"
11-20
125/208 --- "Sandra Cedervall"
21-30
126/208 --- "Anneli Fahlen"
1-10
126/208 --- "Anneli Fahlen"
11-20
126/208 --- "Anneli Fahlen"
21-30
127/208 --- "Helene Almström"
1-10
127/208 --- "Helene Almstr

In [6]:
# Reload the CSV written for block 10 and display it
results_queries_gs_10 = pd.read_csv(
    filepath_or_buffer=f"{route0}/results_queries_gs_10.csv"
)
results_queries_gs_10

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,[HTML][HTML] Breast cancer in young women: poo...,https://journals.plos.org/plosone/article?id=1...,"H Fredholm, S Eaker, J Frisell, L Holmberg…","PloS one, 2009",Background Breast cancer is uncommon in young ...,Irma Fredriksson,10
1,Mammography screening reduces rates of advance...,https://acsjournals.onlinelibrary.wiley.com/do...,"SW Duffy, L Tabár, AMF Yen, PB Dean, RA Smith…","Cancer, 2020",Background It is of paramount importance to ev...,Irma Fredriksson,10
2,Cancer during pregnancy and the postpartum per...,https://acsjournals.onlinelibrary.wiley.com/do...,"TML Andersson, ALV Johansson, I Fredriksson…","Cancer, 2015",BACKGROUND The purpose of this study was to as...,Irma Fredriksson,10
3,[HTML][HTML] Sequencing of breast cancer stem ...,https://link.springer.com/article/10.1186/bcr3687,"D Klevebring, G Rosin, R Ma, J Lindberg, K Cze...","Breast cancer …, 2014",The cancer stem cell model implies a hierarchi...,Irma Fredriksson,10
4,[HTML][HTML] Long-term outcome in young women ...,https://link.springer.com/article/10.1007/s105...,"H Fredholm, K Magnusson, LS Lindström…","Breast cancer research …, 2016",Purpose Whether young age at diagnosis of brea...,Irma Fredriksson,10
5,[PDF][PDF] Estrogen receptor β as a therapeuti...,https://academic.oup.com/jnci/article-pdf/doi/...,"R Ma, GM Karthik, J Lövrot, F Haglund…","JNCI: Journal of the …, 2017",Background: Breast cancer cells with tumor-ini...,Irma Fredriksson,10
6,Tumor characteristics and prognosis in women w...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"ALV Johansson, TML Andersson…","… journal of cancer, 2018",There is evidence of poor prognosis in women w...,Irma Fredriksson,10
7,[HTML][HTML] Identification and validation of ...,https://link.springer.com/article/10.1186/s130...,"M Sjöström, J Staaf, P Edén, F Wärnberg, J Ber...","Breast Cancer …, 2018",Adjuvant radiotherapy is the standard of care ...,Irma Fredriksson,10
8,[HTML][HTML] Gene expression profiling in prim...,https://breast-cancer-research.biomedcentral.c...,"E Niméus-Malmström, M Krogh…","Breast Cancer …, 2008",Some patients with breast cancer develop local...,Irma Fredriksson,10
9,[HTML][HTML] mTOR inhibitors counteract tamoxi...,https://www.sciencedirect.com/science/article/...,"GM Karthik, R Ma, J Lövrot, LL Kis, C Lindh…","Cancer letters, 2015",Breast cancer cells with stem cell characteris...,Irma Fredriksson,10


Wait at least 30 minutes before running the next part.

### Part 11/16: Doctors 131-143

In [12]:
# Block 11: rows at positions 130-142 of the full name list, re-indexed from 0
hcp_names = hcp_names_all.iloc[130:143].reset_index(drop=True)
hcp_names

0                Anthoula Koliadi
1                    Karin Hallén
2                    Sara Tärnbro
3                 Charlotte Levin
4           Georgios Kitsolampros
5               Braslav Jovanovic
6          Fernanda Costa Svedman
7                Giuseppe Masucci
8     Lisa Elena Esther Villabona
9                Maria Wolodarski
10                     Elin Jänes
11                  Petra Flygare
12          Dimitrios Papantoniou
dtype: object

In [13]:
# Scrape block 11 and write its CSV file
gs_bs_scraping(hcp_names, position=11)

131/208 --- "Anthoula Koliadi"
1-10
131/208 --- "Anthoula Koliadi"
11-20
131/208 --- "Anthoula Koliadi"
21-30
132/208 --- "Karin Hallén"
1-10
132/208 --- "Karin Hallén"
11-20
132/208 --- "Karin Hallén"
21-30
133/208 --- "Sara Tärnbro"
1-10
133/208 --- "Sara Tärnbro"
11-20
133/208 --- "Sara Tärnbro"
21-30
134/208 --- "Charlotte Levin"
1-10
134/208 --- "Charlotte Levin"
11-20
134/208 --- "Charlotte Levin"
21-30
135/208 --- "Georgios Kitsolampros"
1-10
135/208 --- "Georgios Kitsolampros"
11-20
135/208 --- "Georgios Kitsolampros"
21-30
136/208 --- "Braslav Jovanovic"
1-10
136/208 --- "Braslav Jovanovic"
11-20
136/208 --- "Braslav Jovanovic"
21-30
137/208 --- "Fernanda Costa Svedman"
1-10
137/208 --- "Fernanda Costa Svedman"
11-20
137/208 --- "Fernanda Costa Svedman"
21-30
138/208 --- "Giuseppe Masucci"
1-10
138/208 --- "Giuseppe Masucci"
11-20
138/208 --- "Giuseppe Masucci"
21-30
139/208 --- "Lisa Elena Esther Villabona"
1-10
139/208 --- "Lisa Elena Esther Villabona"
11-20
139/208 --- "Lis

In [7]:
# Reload the CSV written for block 11 and display it
results_queries_gs_11 = pd.read_csv(
    filepath_or_buffer=f"{route0}/results_queries_gs_11.csv"
)
results_queries_gs_11

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,Cyclin B1 is a prognostic proliferation marker...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"E Niméus‐Malmström, A Koliadi, C Ahlin…","… journal of cancer, 2010",A large proportion of women with lymph node ne...,Anthoula Koliadi,10
1,[HTML][HTML] High proliferation is associated ...,https://www.nature.com/articles/modpathol2012145,"C Nilsson, A Koliadi, I Johansson, C Ahlin…","Modern …, 2013",Assessment of proliferation is important in fe...,Anthoula Koliadi,10
2,Complete response with combined BRAF and MEK i...,https://www.tandfonline.com/doi/abs/10.1080/03...,"B Tholander, A Koliadi, J Botling…","Upsala Journal of …, 2020",More effective treatments are needed for low-g...,Anthoula Koliadi,10
3,Cyclin B is an immunohistochemical proliferati...,https://www.tandfonline.com/doi/abs/10.3109/02...,"A Koliadi, C Nilsson, M Holmqvist, L Holmberg…","Acta …, 2010",Patients with low-risk node negative breast ca...,Anthoula Koliadi,10
4,Parity is associated with better prognosis in ...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"C Sköld, A Koliadi, G Enblad…","… Journal of Cancer, 2022",Ovarian cancer is influenced by reproductive f...,Anthoula Koliadi,10
5,Improved survival without increased toxicity w...,https://www.tandfonline.com/doi/abs/10.1080/21...,"A Valachis, C Rosén, A Koliadi, E Digkas…","…, 2021","In international guidelines, influenza vaccina...",Anthoula Koliadi,10
6,[HTML][HTML] The Prognostic Impact of Prolifer...,https://www.diva-portal.org/smash/record.jsf?p...,A Koliadi,2014,In paper IV we applied the immunohistochemicha...,Anthoula Koliadi,10
7,[HTML][HTML] Cyclin A is an excellent prolifer...,https://www.diva-portal.org/smash/record.jsf?p...,A Koliadi,2014,Background. Gene arrays have demonstrated diff...,Anthoula Koliadi,10
8,[HTML][HTML] PPH3 is an independent prognostic...,https://www.diva-portal.org/smash/record.jsf?p...,A Koliadi,2014,Background. Proliferation conveys prognostic i...,Anthoula Koliadi,10
9,"[CITATION][C] Association between parity, hist...",https://www.diva-portal.org/smash/record.jsf?p...,"C Sköld, A Tolf, S Corvigno, H Dahlstrand, K S...",2021,The record could not be found. The reason may ...,Anthoula Koliadi,10


Wait at least 30 minutes before running the next part.

### Part 12/16: Doctors 144-156

In [15]:
# Block 12: rows at positions 143-155 of the full name list, re-indexed from 0
hcp_names = hcp_names_all.iloc[143:156].reset_index(drop=True)
hcp_names

0     Ingrid Schampi Ljuslinder
1                Karin Papworth
2                     Max Levin
3                 Sara Bjursten
4       Kristin Sigurjonsdottir
5           Christine Jaredsson
6               Sander Ellegård
7                  Jan Rzepecki
8         Georgios Fountoukidis
9                   Elin Brodin
10               Michael Sihver
11              Zuzana Lovasová
12                Karin Engblom
dtype: object

In [16]:
# Scrape block 12 and write its CSV file
gs_bs_scraping(hcp_names, position=12)

144/208 --- "Ingrid Schampi Ljuslinder"
1-10
144/208 --- "Ingrid Schampi Ljuslinder"
11-20
144/208 --- "Ingrid Schampi Ljuslinder"
21-30
145/208 --- "Karin Papworth"
1-10
145/208 --- "Karin Papworth"
11-20
145/208 --- "Karin Papworth"
21-30
146/208 --- "Max Levin"
1-10
146/208 --- "Max Levin"
11-20
146/208 --- "Max Levin"
21-30
147/208 --- "Sara Bjursten"
1-10
147/208 --- "Sara Bjursten"
11-20
147/208 --- "Sara Bjursten"
21-30
148/208 --- "Kristin Sigurjonsdottir"
1-10
148/208 --- "Kristin Sigurjonsdottir"
11-20
148/208 --- "Kristin Sigurjonsdottir"
21-30
149/208 --- "Christine Jaredsson"
1-10
149/208 --- "Christine Jaredsson"
11-20
149/208 --- "Christine Jaredsson"
21-30
150/208 --- "Sander Ellegård"
1-10
150/208 --- "Sander Ellegård"
11-20
150/208 --- "Sander Ellegård"
21-30
151/208 --- "Jan Rzepecki"
1-10
151/208 --- "Jan Rzepecki"
11-20
151/208 --- "Jan Rzepecki"
21-30
152/208 --- "Georgios Fountoukidis"
1-10
152/208 --- "Georgios Fountoukidis"
11-20
152/208 --- "Georgios Fountouki

In [8]:
# Reload the CSV written for block 12 and display it
results_queries_gs_12 = pd.read_csv(
    filepath_or_buffer=f"{route0}/results_queries_gs_12.csv"
)
results_queries_gs_12

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Ingrid Schampi Ljuslinder,0,,,,,
1,Ingrid Schampi Ljuslinder,0,,,,,
2,Ingrid Schampi Ljuslinder,0,,,,,
3,Karin Papworth,10,"[HTML][HTML] Adjuvant vemurafenib in resected,...",https://www.sciencedirect.com/science/article/...,"M Maio, K Lewis, L Demidov, M Mandalà…","The Lancet …, 2018",Background Systemic adjuvant treatment might m...
4,Karin Papworth,10,Increasing incidence of primary central nervou...,https://onlinelibrary.wiley.com/doi/abs/10.111...,"S Eloranta, E Brånvall, F Celsing…","European journal of …, 2018",Objectives This study aims to characterize the...
5,Karin Papworth,10,Family history of cancer and childhood rhabdom...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"PJ Lupo, HE Danysh, SE Plon, K Curtin…","Cancer …, 2015",Relatively little is known about the epidemiol...
6,Karin Papworth,10,[HTML][HTML] Adjuvant chemotherapy and postope...,https://www.sciencedirect.com/science/article/...,"KS Hall, ØS Bruland, B Bjerkehagen, O Zaikova…","European Journal of …, 2018",Purpose To investigate the outcome following a...
7,Karin Papworth,10,Valproate in combination with rituximab and CH...,https://ashpublications.org/bloodadvances/arti...,"K Drott, H Hagberg, K Papworth, T Relander…","Blood …, 2018",The aims of the present study were to establis...
8,Karin Papworth,10,Soft‐tissue sarcoma in adolescents and young a...,https://acsjournals.onlinelibrary.wiley.com/do...,"KE Papworth, VM Arroyo, E Styring, O Zaikova…","Cancer, 2019","Background In recent years, there has been gro..."
9,Karin Papworth,10,Perinatal and familial risk factors for soft t...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"PJ Lupo, RE Luna‐Gierke…","… journal of cancer, 2020",Perinatal factors have been associated with so...


Wait at least 30 minutes before running the next part.