# Web scraping of Google Scholar: parts 1-4

The following code performs web scraping of Google Scholar using the Python libraries Beautiful Soup and Requests. Scraping papers and co-authors is our main objective.

In [1]:
import numpy as np
import pandas as pd

import requests
import time
from time import sleep
from datetime import datetime
from random import randint

import os

from bs4 import BeautifulSoup
import lxml

In [2]:
# Create the results folder; all scraped CSV files are written below this path.
route0 = "../web_scraping_data"

# makedirs(exist_ok=True) replaces the check-then-mkdir pattern: it is a no-op
# when the folder already exists and also creates missing parent directories.
os.makedirs(route0, exist_ok=True)

## Load HCPs

In [2]:
# Load the file with HCP data
hcp_df = pd.read_excel("../novartis_data/BC & Melanoma targets Sweden.xlsx")

# The copyright notice is separated from the data by a row whose "Name" is NA.
# Find the position of the first such row; if the sheet has no separator row,
# fall back to using every row instead of failing with an IndexError.
na_positions = np.flatnonzero(hcp_df["Name"].isna())
ind_cpr = int(na_positions[0]) if na_positions.size > 0 else len(hcp_df)

# Keep only the rows above the separator (positional slice) and drop duplicate names
hcp_names_all = pd.Series(hcp_df["Name"].iloc[:ind_cpr].unique())

## Querying Google Scholar using Beautiful Soup and Requests

We do not scrape Google Scholar in a single run, but in 16 blocks of 13 HCPs each. We therefore define a function that performs the scraping for any one of the 16 blocks.

In [45]:
def _parse_gs_article(article):
    """Extract the fields of one Google Scholar result element (a ``.gs_ri`` node).

    Every field is looked up independently, so a result that lacks one element
    (for example a title without a link, or no snippet) still yields a record
    with ``None`` in the missing fields instead of being lost or mixed up with
    the previous result.

    Parameters
    ----------
    article : object exposing ``select_one(css_selector)``
        A single search result as returned by ``soup.select('.gs_ri')``.

    Returns
    -------
    dict
        Keys ``title``, ``title_link``, ``authors``, ``publications``, ``snippet``.
    """
    def _text(selector):
        node = article.select_one(selector)
        return node.text if node is not None else None

    link = article.select_one('.gs_rt a')

    # The '.gs_a' byline is split on "- ": the first piece is stored as the
    # authors, the second piece as the publication info.
    byline = _text('.gs_a')
    byline_parts = byline.split("- ") if byline is not None else []

    return {
        "title": _text('.gs_rt'),
        "title_link": link.get('href') if link is not None else None,
        "authors": byline_parts[0] if len(byline_parts) > 0 else None,
        "publications": byline_parts[1] if len(byline_parts) > 1 else None,
        "snippet": _text('.gs_rs'),
    }


def gs_bs_scraping(hcp_names, position):
    """Scrape the first three Google Scholar result pages for a block of HCPs.

    For every name in ``hcp_names`` the name is searched in double quotes and
    the result pages starting at offsets 0, 10 and 20 are requested. Each
    retrieved article becomes one row; a page without any article contributes
    one placeholder row that only carries ``hcp_name`` and ``num_articles``.
    The collected rows are written to
    ``{route0}/results_queries_gs_{position}.csv``.

    Parameters
    ----------
    hcp_names : pandas.Series
        Names to query. Accessed by position, so the index does not matter.
    position : int
        1-based number of the block; used in the progress output and in the
        name of the CSV file.

    Returns
    -------
    None
        The results are written to disk; progress and timing are printed.

    Notes
    -----
    Relies on the notebook-level variables ``route0`` (output folder) and
    ``hcp_names_all`` (only its length, for the progress counter).
    """
    start_time_d = datetime.now()
    start_time_t = time.time()

    # Request settings are identical for every query, so build them once.
    headers = {
        'User-agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }

    # NOTE(review): only an 'http' proxy is configured although the request
    # URL below is https -- confirm that this is intended.
    proxies = {
        'http': os.getenv('HTTP_PROXY')
    }

    # Without a timeout a stalled connection would block the run indefinitely.
    request_timeout = 30

    records = []  # one dict per output row

    for i in range(len(hcp_names)): # Level: HCPs
        name_i = hcp_names.iloc[i]
        query_i = '"{}"'.format(name_i) # put the doctor's name in quotes
        results_num = 0

        for j in [0, 10, 20]: # Level: Page
            # Random pause of 1-5 seconds before each request
            sleep(randint(1, 5))

            # NOTE(review): the running counter assumes that every block has
            # the same number of names as this one.
            print(f"{(position - 1)*len(hcp_names) + i + 1}/{len(hcp_names_all)} --- {query_i}")
            print(f"{j + 1}-{j + 10}")

            params = {
                "q": query_i,
                "hl": "en",
                "start": j
            }

            response = requests.get(
                'https://scholar.google.com/scholar',
                headers=headers,
                params=params,
                proxies=proxies,
                timeout=request_timeout,
            )
            if response.status_code != 200:
                # A non-200 answer is otherwise indistinguishable from "no results".
                print(f"WARNING: HTTP {response.status_code} for {query_i} (start={j}); "
                      "the page may not be a regular result page")

            soup = BeautifulSoup(response.text, 'lxml')

            # Articles on this page
            results_i = soup.select('.gs_ri')

            # NOTE(review): num_articles is the running total up to and
            # including the current page, so rows from earlier pages of the
            # same HCP carry a smaller value than rows from later pages.
            results_num += len(results_i)

            if results_i:
                for article in results_i: # Level: Articles
                    record = _parse_gs_article(article)
                    record["hcp_name"] = name_i
                    record["num_articles"] = results_num
                    records.append(record)
            else:
                # Placeholder row so that HCPs/pages without hits stay visible
                records.append({
                    "hcp_name": name_i,
                    "num_articles": results_num
                })

    # Store data at the end (skipped when there was nothing to query)
    if records:
        print(f"saving file corresponding to results_queries_gs_{position}.csv")
        papers_df = pd.DataFrame(records)
        papers_df.to_csv(f"{route0}/results_queries_gs_{position}.csv", index = False)

    end_time_d = datetime.now()
    end_time_t = time.time()
    duration = end_time_t - start_time_t
    hours, remainder = divmod(duration, 3600)
    print(f"start time: {start_time_d}")
    print(f"end time: {end_time_d}")
    print(f"duration: {int(hours)} hours {remainder/60:.4} minutes")

### Part 1/16: Doctors 1-13

In [46]:
# Block 1: the first 13 names (positions 0-12), re-indexed from 0
hcp_names = hcp_names_all.iloc[0:13].reset_index(drop=True)
hcp_names

0               Adel Bader Hamdalla
1                     Aglaia Schiza
2             Agneta Nordin Danfors
3           Ahmed Abbas Albu-Kareem
4                       Alaa Haidar
5                  Ana Bosch Campos
6                  Andreas Nearchou
7                  Ulrika Bergqvist
8           Ann Charlotte Dreifaldt
9               Elisabeth Ryd Ausén
10                 Marie Santonsson
11                Anna Nordenskjöld
12    Anna von Wachenfeldt Väppling
dtype: object

In [47]:
# Scrape block 1 and write results_queries_gs_1.csv
gs_bs_scraping(hcp_names=hcp_names, position=1)

1/208 --- "Adel Bader Hamdalla"
1-10
1/208 --- "Adel Bader Hamdalla"
11-20
1/208 --- "Adel Bader Hamdalla"
21-30
2/208 --- "Aglaia Schiza"
1-10
2/208 --- "Aglaia Schiza"
11-20
2/208 --- "Aglaia Schiza"
21-30
3/208 --- "Agneta Nordin Danfors"
1-10
3/208 --- "Agneta Nordin Danfors"
11-20
3/208 --- "Agneta Nordin Danfors"
21-30
4/208 --- "Ahmed Abbas Albu-Kareem"
1-10
4/208 --- "Ahmed Abbas Albu-Kareem"
11-20
4/208 --- "Ahmed Abbas Albu-Kareem"
21-30
5/208 --- "Alaa Haidar"
1-10
5/208 --- "Alaa Haidar"
11-20
5/208 --- "Alaa Haidar"
21-30
6/208 --- "Ana Bosch Campos"
1-10
6/208 --- "Ana Bosch Campos"
11-20
6/208 --- "Ana Bosch Campos"
21-30
7/208 --- "Andreas Nearchou"
1-10
7/208 --- "Andreas Nearchou"
11-20
7/208 --- "Andreas Nearchou"
21-30
8/208 --- "Ulrika Bergqvist"
1-10
8/208 --- "Ulrika Bergqvist"
11-20
8/208 --- "Ulrika Bergqvist"
21-30
9/208 --- "Ann Charlotte Dreifaldt"
1-10
9/208 --- "Ann Charlotte Dreifaldt"
11-20
9/208 --- "Ann Charlotte Dreifaldt"
21-30
10/208 --- "Elisabeth 

In [57]:
# Read the saved results of block 1 back from disk and display them
results_queries_gs_1 = pd.read_csv("{}/results_queries_gs_1.csv".format(route0))
results_queries_gs_1

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,,
1,Adel Bader Hamdalla,0,,,,,
2,Adel Bader Hamdalla,0,,,,,
3,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...",https://jamanetwork.com/journals/jamaoncology/...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
4,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,https://translational-medicine.biomedcentral.c...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
5,Aglaia Schiza,10,High PDGFRb Expression Predicts Resistance to ...,https://clincancerres.aacrjournals.org/content...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
6,Aglaia Schiza,10,[HTML][HTML] Evaluation of diffusion-weighted ...,https://www.nature.com/articles/s41598-019-544...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
7,Aglaia Schiza,10,[HTML][HTML] Local irradiation does not enhanc...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
8,Aglaia Schiza,10,[HTML][HTML] Predictive role of HER2-status on...,https://link.springer.com/article/10.1007/s105...,"A Schiza, D Mauri, I Fredriksson, AK Wennstig…","Breast Cancer Research …, 2021",Purpose There are conflicting results on the p...
9,Aglaia Schiza,10,Abstract PO-018: Inflaming advanced solid tumo...,https://cancerres.aacrjournals.org/content/81/...,"J Wenthe, E Eriksson, L Sandin, T Lövgren, JL ...",2021,Pancreatic ductal adenocarcinoma (PDAC) is res...


Wait at least 30 minutes before running the next part.

### Part 2/16: Doctors 14-26

In [60]:
# Block 2: names at positions 13-25, re-indexed from 0
hcp_names = hcp_names_all.iloc[13:26].reset_index(drop=True)
hcp_names

0              Anna-Karin Tzikas
1            Anna-Karin Wennstig
2     Anna Maria Hasselgren Häll
3        Anne-Kristine Andersson
4              Antonios Valachis
5              Elisabet Karlsson
6              Barbro Linderholm
7                  Birgitta Lind
8              Kristina Lindblom
9                Maria Sandström
10              Cecilia Graffman
11               Cecilia Nilsson
12             Chaido Chamalidou
dtype: object

In [61]:
# Scrape block 2 and write results_queries_gs_2.csv
gs_bs_scraping(hcp_names=hcp_names, position=2)

14/208 --- "Anna-Karin Tzikas"
1-10
14/208 --- "Anna-Karin Tzikas"
11-20
14/208 --- "Anna-Karin Tzikas"
21-30
15/208 --- "Anna-Karin Wennstig"
1-10
15/208 --- "Anna-Karin Wennstig"
11-20
15/208 --- "Anna-Karin Wennstig"
21-30
16/208 --- "Anna Maria Hasselgren Häll"
1-10
16/208 --- "Anna Maria Hasselgren Häll"
11-20
16/208 --- "Anna Maria Hasselgren Häll"
21-30
17/208 --- "Anne-Kristine Andersson"
1-10
17/208 --- "Anne-Kristine Andersson"
11-20
17/208 --- "Anne-Kristine Andersson"
21-30
18/208 --- "Antonios Valachis"
1-10
18/208 --- "Antonios Valachis"
11-20
18/208 --- "Antonios Valachis"
21-30
19/208 --- "Elisabet Karlsson"
1-10
19/208 --- "Elisabet Karlsson"
11-20
19/208 --- "Elisabet Karlsson"
21-30
20/208 --- "Barbro Linderholm"
1-10
20/208 --- "Barbro Linderholm"
11-20
20/208 --- "Barbro Linderholm"
21-30
21/208 --- "Birgitta Lind"
1-10
21/208 --- "Birgitta Lind"
11-20
21/208 --- "Birgitta Lind"
21-30
22/208 --- "Kristina Lindblom"
1-10
22/208 --- "Kristina Lindblom"
11-20
22/208 -

In [7]:
# Read the saved results of block 2 back from disk and display them
results_queries_gs_2 = pd.read_csv("{}/results_queries_gs_2.csv".format(route0))
results_queries_gs_2

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,[HTML][HTML] A comparison between young and ol...,https://link.springer.com/article/10.1007/s105...,"AK Tzikas, S Nemes, BK Linderholm","Breast Cancer Research and …, 2020","Purpose To determine the biology, recurrence r...",Anna-Karin Tzikas,3
1,Abstract P2-14-05: Adjuvant chemotherapy in el...,https://cancerres.aacrjournals.org/content/80/...,"S Janeva, C Zhang, A Kovács, TZ Parris, AK Tzi...",2020,Background: Breast cancer incidence is strongl...,Anna-Karin Tzikas,3
2,[HTML][HTML] Patient and tumour characteristic...,https://www.nature.com/articles/bjc201114,"AMG Ali, D Greenberg, GC Wishart…","British journal of cancer, 2011",Background: Breast cancer relative survival (B...,Anna-Karin Tzikas,3
3,,,,,,Anna-Karin Tzikas,3
4,,,,,,Anna-Karin Tzikas,3
5,[HTML][HTML] Patients' preferences for subcuta...,https://www.sciencedirect.com/science/article/...,"X Pivot, J Gligorov, V Müller, G Curigliano, A...","Annals of oncology, 2014",Background Patients with HER2-positive early b...,Anna-Karin Wennstig,10
6,[HTML][HTML] The relationship between radiatio...,https://ro-journal.biomedcentral.com/articles/...,"AK Wennstig, H Garmo, U Isacsson…","Radiation …, 2019",To assess the relationship between radiation d...,Anna-Karin Wennstig,10
7,[HTML][HTML] Inter-observer variation in delin...,https://www.sciencedirect.com/science/article/...,"AK Wennstig, H Garmo, P Hållström…","Radiotherapy and …, 2017",Purpose To determine the inter-observer variat...,Anna-Karin Wennstig,10
8,[HTML][HTML] Influenza vaccination in breast c...,https://link.springer.com/article/10.1007/s105...,"TB Joona, E Digkas, AK Wennstig, K Nyström…","Breast Cancer Research …, 2020",Background Despite the current recommendation ...,Anna-Karin Wennstig,10
9,"[PDF][PDF] Impact of chemotherapy, radiotherap...",https://link.springer.com/content/pdf/10.1007/...,"A Plym, ALV Johansson, H Bower, AK Wennstig…","Breast cancer research …, 2020",Purpose To examine the influence of type of on...,Anna-Karin Wennstig,10


Wait at least 30 minutes before running the next part.

### Part 3/16: Doctors 27-39

In [64]:
# Block 3: names at positions 26-38, re-indexed from 0
hcp_names = hcp_names_all.iloc[26:39].reset_index(drop=True)
hcp_names

0              Charlotte Bratthäll
1      Christina Haapaniemi Olsson
2     Christina Linder Stragliotto
3                  Malin Steenhoff
4                Therése Widerberg
5                 Claudia Lundgren
6                    Dan Lundstedt
7                    Dawid Bulanda
8                 Julia Hallerfelt
9            Yohana Collins Bikova
10                Yvonne Wengström
11               Elisabet Lidbrink
12       Elzbieta Wojtyna-Dziedzic
dtype: object

In [67]:
# Scrape block 3 and write results_queries_gs_3.csv
gs_bs_scraping(hcp_names=hcp_names, position=3)

27/208 --- "Charlotte Bratthäll"
1-10
27/208 --- "Charlotte Bratthäll"
11-20
27/208 --- "Charlotte Bratthäll"
21-30
28/208 --- "Christina Haapaniemi Olsson"
1-10
28/208 --- "Christina Haapaniemi Olsson"
11-20
28/208 --- "Christina Haapaniemi Olsson"
21-30
29/208 --- "Christina Linder Stragliotto"
1-10
29/208 --- "Christina Linder Stragliotto"
11-20
29/208 --- "Christina Linder Stragliotto"
21-30
30/208 --- "Malin Steenhoff"
1-10
30/208 --- "Malin Steenhoff"
11-20
30/208 --- "Malin Steenhoff"
21-30
31/208 --- "Therése Widerberg"
1-10
31/208 --- "Therése Widerberg"
11-20
31/208 --- "Therése Widerberg"
21-30
32/208 --- "Claudia Lundgren"
1-10
32/208 --- "Claudia Lundgren"
11-20
32/208 --- "Claudia Lundgren"
21-30
33/208 --- "Dan Lundstedt"
1-10
33/208 --- "Dan Lundstedt"
11-20
33/208 --- "Dan Lundstedt"
21-30
34/208 --- "Dawid Bulanda"
1-10
34/208 --- "Dawid Bulanda"
11-20
34/208 --- "Dawid Bulanda"
21-30
35/208 --- "Julia Hallerfelt"
1-10
35/208 --- "Julia Hallerfelt"
11-20
35/208 --- "J

In [8]:
# Read the saved results of block 3 back from disk and display them
results_queries_gs_3 = pd.read_csv("{}/results_queries_gs_3.csv".format(route0))
results_queries_gs_3

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,[HTML][HTML] TERT promoter mutations and polym...,https://www.ncbi.nlm.nih.gov/pmc/articles/pmc4...,"MA Mosrati, A Malmström, M Lysiak, A Krysztofi...","Oncotarget, 2015",Telomerase reverse transcriptase (TERT) activi...,Charlotte Bratthäll,10
1,[HTML][HTML] The desmoplastic stroma plays an ...,https://www.hindawi.com/journals/cdi/2011/2128...,"V Tjomsland, L Niklasson, P Sandström…","Clinical and …, 2011",Tumor microenvironment is composed of tumor ce...,Charlotte Bratthäll,10
2,[HTML][HTML] Real world evidence on gemcitabin...,https://bmccancer.biomedcentral.com/articles/1...,"H Blomstrand, U Scheibling, C Bratthäll…","BMC …, 2019",In the recent phase III trial MPACT the combin...,Charlotte Bratthäll,10
3,[HTML][HTML] IL-1α expression in pancreatic du...,https://journals.plos.org/plosone/article?id=1...,"V Tjomsland, L Bojmar, P Sandström, C Bratthäll…","PLoS …, 2013",The interplay between the tumor cells and the ...,Charlotte Bratthäll,10
4,ABCB1 single-nucleotide variants and survival ...,https://www.nature.com/articles/s41397-019-0107-z,"A Malmström, M Łysiak, L Åkesson…","The …, 2020",Standard treatment for glioblastoma (GBM) pati...,Charlotte Bratthäll,10
5,Deletions on Chromosome Y and Downregulation o...,https://www.mdpi.com/2072-6694/13/7/1619,"M Łysiak, A Smits, KR Roodakker, E Sandberg…","Cancers, 2021",Background: Biological causes of sex disparity...,Charlotte Bratthäll,10
6,[HTML][HTML] Impact of PINCH expression on sur...,https://link.springer.com/article/10.1186/1471...,"J Lööf, J Rosell, C Bratthäll, S Doré, H Stark...","BMC cancer, 2011",The adaptor protein PINCH is overexpressed in ...,Charlotte Bratthäll,10
7,[HTML][HTML] Assessment of genetic and non-gen...,https://www.sciencedirect.com/science/article/...,"MK Heenkenda, A Malmström, M Lysiak, M Mudaisi…","Thrombosis research, 2019",Introduction Venous thromboembolism (VTE) is a...,Charlotte Bratthäll,10
8,MTR-09 ABCB1 as predictive marker for poor sur...,https://academic.oup.com/neuro-oncology/articl...,"A Malmström, M Hallbeck, V Fomichov, P Milos…","Neuro …, 2015",BACKGROUND: Induction of autophagy has been id...,Charlotte Bratthäll,10
9,ABO blood group is a potent risk factor for ve...,https://acsjournals.onlinelibrary.wiley.com/do...,"MB Streiff, J Segal, SA Grossman…","… Journal of the …, 2004",BACKGROUND Venous thromboembolism (VTE) is a c...,Charlotte Bratthäll,10


Wait at least 30 minutes before running the next part.

### Part 4/16: Doctors 40-52

In [69]:
# Block 4: names at positions 39-51, re-indexed from 0
hcp_names = hcp_names_all.iloc[39:52].reset_index(drop=True)
hcp_names

0                   Eva af Trampe
1               Ann-Britt Nilsson
2                    Eva Tallroth
3                Evangelos Digkas
4              Fredrika Killander
5                 Gabriel Jonsson
6                 Gerhard Winblad
7                 Gilberto Morgan
8                   Git Martenhed
9                  Greger Nilsson
10              Gunnar Lengstrand
11              Marika Hjelmqvist
12    Helena Granstam Björneklett
dtype: object

In [70]:
# Scrape block 4 and write results_queries_gs_4.csv
gs_bs_scraping(hcp_names=hcp_names, position=4)

40/208 --- "Eva af Trampe"
1-10
40/208 --- "Eva af Trampe"
11-20
40/208 --- "Eva af Trampe"
21-30
41/208 --- "Ann-Britt Nilsson"
1-10
41/208 --- "Ann-Britt Nilsson"
11-20
41/208 --- "Ann-Britt Nilsson"
21-30
42/208 --- "Eva Tallroth"
1-10
42/208 --- "Eva Tallroth"
11-20
42/208 --- "Eva Tallroth"
21-30
43/208 --- "Evangelos Digkas"
1-10
43/208 --- "Evangelos Digkas"
11-20
43/208 --- "Evangelos Digkas"
21-30
44/208 --- "Fredrika Killander"
1-10
44/208 --- "Fredrika Killander"
11-20
44/208 --- "Fredrika Killander"
21-30
45/208 --- "Gabriel Jonsson"
1-10
45/208 --- "Gabriel Jonsson"
11-20
45/208 --- "Gabriel Jonsson"
21-30
46/208 --- "Gerhard Winblad"
1-10
46/208 --- "Gerhard Winblad"
11-20
46/208 --- "Gerhard Winblad"
21-30
47/208 --- "Gilberto Morgan"
1-10
47/208 --- "Gilberto Morgan"
11-20
47/208 --- "Gilberto Morgan"
21-30
48/208 --- "Git Martenhed"
1-10
48/208 --- "Git Martenhed"
11-20
48/208 --- "Git Martenhed"
21-30
49/208 --- "Greger Nilsson"
1-10
49/208 --- "Greger Nilsson"
11-20


In [9]:
# Read the saved results of block 4 back from disk and display them
results_queries_gs_4 = pd.read_csv("{}/results_queries_gs_4.csv".format(route0))
results_queries_gs_4

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,Intravitreal chemotherapy for recurrent retino...,https://www.ncbi.nlm.nih.gov/pmc/articles/pmc5...,"S Seregard, E Kock, E af Trampe","The British journal of …, 1995",Recurrent retinoblastoma in the only remaining...,Eva af Trampe,10
1,Psychological reactions and quality of life in...,https://www.nature.com/articles/eye2000233,"Y Brandberg, E Kock, K Oskar, EA Trampe, S Ser...","Eye, 2000",Purpose To investigate psychological reactions...,Eva af Trampe,10
2,Results following episcleral ruthenium plaque ...,https://onlinelibrary.wiley.com/doi/abs/10.111...,"S Seregard, E af Trampe, I Lax, E Kock…","Acta Ophthalmologica …, 1997",The Swedish experience of ruthenium 106 plaque...,Eva af Trampe,10
3,Prevalence of primary acquired melanosis and n...,https://www.sciencedirect.com/science/article/...,"S Seregard, E af Trampe, E Månsson-Brahme, E K...","Ophthalmology, 1995",Purpose: To investigate whether conjunctival a...,Eva af Trampe,10
4,Standardized precision radiotherapy in choroid...,https://www.tandfonline.com/doi/abs/10.3109/02...,"U Nylén, E Kock, I Lax, G Lundell, EA Trampe…","Acta …, 1994",Metastases in the choroid of the eye are frequ...,Eva af Trampe,10
5,A prospective study of children treated for re...,https://onlinelibrary.wiley.com/doi/abs/10.103...,"U Ek, S Seregard, L Jacobson, K Oskar…","Acta …, 2002",Purpose : To assess cognitive and visual outco...,Eva af Trampe,10
6,Tumour cell proliferation after failed rutheni...,https://onlinelibrary.wiley.com/doi/abs/10.111...,"S Seregard, G Lundell, I Lax…","Acta ophthalmologica …, 1997",Enucleation following ruthenium plaque radioth...,Eva af Trampe,10
7,Growth hormone producing pituitary adenomas wi...,https://www.sciencedirect.com/science/article/...,"S Werner, E af Trampe, P Palacios, I Lax…","International Journal of …, 1985",The effect of photon irradiation (50 Gy with a...,Eva af Trampe,10
8,External irradiation of growth hormone produci...,https://www.sciencedirect.com/science/article/...,"E af Trampe, G Lundell, I Lax, S Werner","International Journal of Radiation …, 1991",Fifty-six patients with acromegaly were treate...,Eva af Trampe,10
9,Long-term effects of radiotherapy and bromocri...,https://academic.oup.com/neurosurgery/article-...,"E Moberg, E af Trampe, J Wersäll, S Werner","Neurosurgery, 1991",The long-term effect of radiotherapy and bromo...,Eva af Trampe,10


Wait at least 30 minutes before running the next part.