# Web scraping of Google Scholar: parts 5-8

The following code performs web scraping of Google Scholar using the Python libraries Beautiful Soup and Requests. Scraping papers and co-authors is our main objective.

In [1]:
import numpy as np
import pandas as pd

import requests
import time
from time import sleep
from datetime import datetime
from random import randint

import os

from bs4 import BeautifulSoup
import lxml

In [2]:
# Create results folder
route0 = "../web_scraping_data"

if not os.path.exists(route0):
    os.mkdir(route0)

## Load HCPs

In [2]:
# Load the file with HCP data
hcp_df = pd.read_excel("../novartis_data/BC & Melanoma targets Sweden.xlsx")

# Detect copyrights, it's separated by NA line
ind_cpr = hcp_df.loc[pd.isna(hcp_df["Name"]), :].index[0]

# Remove copyrights and duplicates
hcp_names_all = pd.Series(hcp_df["Name"].head(ind_cpr).unique())

## Querying Google Scholar using Beautiful Soup and Requests

Since we do the scraping of Google Scholar not in one block, but in 16 blocks of 13 HCPs each, we now write a function that does the scraping for any single of the 16 blocks.

In [5]:
def gs_bs_scraping(hcp_names, position):

    start_time_d = datetime.now()
    start_time_t = time.time()

    papers_df = []

    for i in range(len(hcp_names)): # Level: HCPs
        results_num = 0
        for j in [0,10,20]: # Level: Page
            sleep(randint(1,5))
            name_i = hcp_names.loc[i]
            query_i = '"{}"'.format(name_i) # put the doctor's name in quotes 
            print(f"{(position - 1)*len(hcp_names) + i + 1}/{len(hcp_names_all)} --- {query_i}")
            print(f"{j + 1}-{j + 10}")

            headers = {
            'User-agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
            }

            proxies = {
            'http': os.getenv('HTTP_PROXY')
            }

            params = {
            "q": query_i, 
            "hl": "en",
            "start": j
            }

            html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params, proxies=proxies).text
        
            soup = BeautifulSoup(html, 'lxml')

            # Check how many articles there are
            results_i = soup.select('.gs_ri')
            results_num = results_num + len([x for x in results_i])

            if results_num > 0:
                results_i = soup.select('.gs_ri')
            else:
                results_i = []

            # Loop over retrieved articles
            papers_df_i = []
            for article in results_i: # Level: Articles
                try:
                    dict_i = {
                        "title": article.select_one('.gs_rt').text,
                        "title_link": article.select_one('.gs_rt a')['href'],
                        "authors" : article.select_one('.gs_a').text.split("- ")[0],
                        "publications" : article.select_one('.gs_a').text.split("- ")[1],
                        "snippet" : article.select_one('.gs_rs').text
                    }
                except:
                    pass

                df_i = pd.DataFrame.from_dict([dict_i])
                df_i["hcp_name"] = name_i
                df_i["num_articles"] = results_num

                papers_df_i.append(df_i)

            if len(papers_df_i) > 0:
                papers_df_i = pd.concat(papers_df_i)
                papers_df.append(papers_df_i)
            else:
                papers_df_i = pd.DataFrame({
                    "hcp_name": [name_i],
                    "num_articles": [results_num]
                })
                papers_df.append(papers_df_i)

        # Store data at the end
        if i == len(hcp_names) - 1:
            print(f"saving file corresponding to results_queries_gs_{position}.csv")
            papers_df = pd.concat(papers_df).reset_index(drop = True)
            papers_df.to_csv(f"{route0}/results_queries_gs_{position}.csv", index = False)
        
    end_time_d = datetime.now()
    end_time_t = time.time()
    duration = end_time_t - start_time_t
    print(f"start time: {start_time_d}")
    print(f"end time: {end_time_d}")
    print(f"duration: {int(np.floor(duration/3600))} hours {(duration-3600*np.floor(duration/3600))/60:.4} minutes")

### Part 5/16: Doctors 53-65

In [6]:
# 5th part
hcp_names = hcp_names_all[52:65].reset_index(drop = True)
hcp_names

0         Anna Askelin
1       Henrik Lindman
2      Ida Spång Rosén
3          Jamila Adra
4          Jan Frisell
5      Jenny Bergqvist
6        Johan Ahlgren
7        Johan Hartman
8          Jonas Bergh
9           Jonas Holm
10       Jörn Schneede
11       Judith Bjöhle
12    Karolina Larsson
dtype: object

In [9]:
# Run function
gs_bs_scraping(hcp_names, 5)

53/208 --- "Anna Askelin"
1-10
53/208 --- "Anna Askelin"
11-20
53/208 --- "Anna Askelin"
21-30
54/208 --- "Henrik Lindman"
1-10
54/208 --- "Henrik Lindman"
11-20
54/208 --- "Henrik Lindman"
21-30
55/208 --- "Ida Spång Rosén"
1-10
55/208 --- "Ida Spång Rosén"
11-20
55/208 --- "Ida Spång Rosén"
21-30
56/208 --- "Jamila Adra"
1-10
56/208 --- "Jamila Adra"
11-20
56/208 --- "Jamila Adra"
21-30
57/208 --- "Jan Frisell"
1-10
57/208 --- "Jan Frisell"
11-20
57/208 --- "Jan Frisell"
21-30
58/208 --- "Jenny Bergqvist"
1-10
58/208 --- "Jenny Bergqvist"
11-20
58/208 --- "Jenny Bergqvist"
21-30
59/208 --- "Johan Ahlgren"
1-10
59/208 --- "Johan Ahlgren"
11-20
59/208 --- "Johan Ahlgren"
21-30
60/208 --- "Johan Hartman"
1-10
60/208 --- "Johan Hartman"
11-20
60/208 --- "Johan Hartman"
21-30
61/208 --- "Jonas Bergh"
1-10
61/208 --- "Jonas Bergh"
11-20
61/208 --- "Jonas Bergh"
21-30
62/208 --- "Jonas Holm"
1-10
62/208 --- "Jonas Holm"
11-20
62/208 --- "Jonas Holm"
21-30
63/208 --- "Jörn Schneede"
1-10
63/

In [5]:
results_queries_gs_5 = pd.read_csv(f"{route0}/results_queries_gs_5.csv")
results_queries_gs_5

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Anna Askelin,0,,,,,
1,Anna Askelin,0,,,,,
2,Anna Askelin,0,,,,,
3,Henrik Lindman,10,[HTML][HTML] Breast cancer in young women: poo...,https://journals.plos.org/plosone/article?id=1...,"H Fredholm, S Eaker, J Frisell, L Holmberg…","PloS one, 2009",Background Breast cancer is uncommon in young ...
4,Henrik Lindman,10,[HTML][HTML] Potent corticosteroid cream (mome...,https://www.sciencedirect.com/science/article/...,"Å Boström, H Lindman, C Swartling, B Berne…","Radiotherapy and …, 2001",Purpose: Radiation-induced dermatitis is a ver...
5,Henrik Lindman,10,First-in-human molecular imaging of HER2 expre...,https://jnm.snmjournals.org/content/55/5/730.s...,"J Sörensen, D Sandberg, M Sandström…","Journal of nuclear …, 2014",The expression status of human epidermal growt...
6,Henrik Lindman,10,"[HTML][HTML] Tailored fluorouracil, epirubicin...",https://www.sciencedirect.com/science/article/...,"J Bergh, T Wiklund, B Erikstein, E Lidbrink, H...","The Lancet, 2000",Background Chemotherapy drug distribution vari...
7,Henrik Lindman,10,Effect of adjuvant trastuzumab for a duration ...,https://jamanetwork.com/journals/jamaoncology/...,"H Joensuu, J Fraser, H Wildiers, R Huovinen…","JAMA …, 2018",Importance Trastuzumab plus chemotherapy is th...
8,Henrik Lindman,10,"[PDF][PDF] Adjuvant capecitabine, docetaxel, c...",https://www.academia.edu/download/51792387/Adj...,"H Joensuu, PL Kellokumpu-Lehtinen, R Huovinen…","J Clin Oncol, 2012","… Heikki Joensuu, Pirkko-Liisa Kellokumpu-Leht..."
9,Henrik Lindman,10,[HTML][HTML] Measuring HER2-receptor expressio...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,"J Sörensen, I Velikyan, D Sandberg, A Wennborg…","Theranostics, 2016",Purpose: Positron Emission Tomography (PET) im...


Wait at least 30 minutes.

### Part 6/16: Doctors 66-78

In [11]:
# 6th part
hcp_names = hcp_names_all[65:78].reset_index(drop = True)
hcp_names

0     Katalin Marianna Kovacs
1                   Anna Lind
2             Kenneth Villman
3              Carin Lundgren
4               Susanne Söder
5            Kilian Bachmeier
6            Kristina Engvall
7                    Eva Ulff
8                Lars Norberg
9                  Leif Klint
10       Lena Nittby Tennvall
11               Linda Thorén
12           Magnus Lagerlund
dtype: object

In [12]:
# Run function
gs_bs_scraping(hcp_names, 6)

66/208 --- "Katalin Marianna Kovacs"
1-10
66/208 --- "Katalin Marianna Kovacs"
11-20
66/208 --- "Katalin Marianna Kovacs"
21-30
67/208 --- "Anna Lind"
1-10
67/208 --- "Anna Lind"
11-20
67/208 --- "Anna Lind"
21-30
68/208 --- "Kenneth Villman"
1-10
68/208 --- "Kenneth Villman"
11-20
68/208 --- "Kenneth Villman"
21-30
69/208 --- "Carin Lundgren"
1-10
69/208 --- "Carin Lundgren"
11-20
69/208 --- "Carin Lundgren"
21-30
70/208 --- "Susanne Söder"
1-10
70/208 --- "Susanne Söder"
11-20
70/208 --- "Susanne Söder"
21-30
71/208 --- "Kilian Bachmeier"
1-10
71/208 --- "Kilian Bachmeier"
11-20
71/208 --- "Kilian Bachmeier"
21-30
72/208 --- "Kristina Engvall"
1-10
72/208 --- "Kristina Engvall"
11-20
72/208 --- "Kristina Engvall"
21-30
73/208 --- "Eva Ulff"
1-10
73/208 --- "Eva Ulff"
11-20
73/208 --- "Eva Ulff"
21-30
74/208 --- "Lars Norberg"
1-10
74/208 --- "Lars Norberg"
11-20
74/208 --- "Lars Norberg"
21-30
75/208 --- "Leif Klint"
1-10
75/208 --- "Leif Klint"
11-20
75/208 --- "Leif Klint"
21-30
76

In [6]:
results_queries_gs_6 = pd.read_csv(f"{route0}/results_queries_gs_6.csv")
results_queries_gs_6 

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Katalin Marianna Kovacs,0,,,,,
1,Katalin Marianna Kovacs,0,,,,,
2,Katalin Marianna Kovacs,0,,,,,
3,Anna Lind,10,Impact of electro-acupuncture and physical exe...,https://journals.physiology.org/doi/abs/10.115...,"E Jedel, F Labrie, A Odén, G Holm…","American Journal …, 2011","Polycystic ovary syndrome (PCOS), the most com..."
4,Anna Lind,10,"Regional myo-inositol, creatine, and choline l...",https://www.jneurosci.org/content/40/42/8149.a...,"A Lind, CJ Boraxbekk, ET Petersen…","Journal of …, 2020",Proton MR spectroscopy ( 1 H-MRS) has been use...
5,Anna Lind,10,[HTML][HTML] Post combustion carbon capture wi...,https://www.sciencedirect.com/science/article/...,"S Krishnamurthy, A Lind, A Bouzga, J Pierchala…","Chemical Engineering …, 2021",Supported amine sorbents are extensively studi...
6,Anna Lind,10,[HTML][HTML] Multi-purpose structured catalyst...,https://www.sciencedirect.com/science/article/...,"A Lind, Ø Vistad, MF Sunding, KA Andreassen…","Materials & Design, 2020",This work presents an example of the design an...
7,Anna Lind,10,The water permeability channels aquaporins 1–4...,https://academic.oup.com/jcem/article-abstract...,"A Thoroddsen, P Dahm-Kähler, AK Lind…","The Journal of …, 2011",Context: Changes in vascular permeability and ...
8,Anna Lind,10,"Unusual, vesicle-like patterned, mesoscopicall...",https://pubs.acs.org/doi/abs/10.1021/cm021243o,"A Lind, B Spliethoff, M Lindén","Chemistry of materials, 2003",A mixture of cationic and anionic surfactants ...
9,Anna Lind,10,RUNX2 transcription factor regulates gene expr...,https://academic.oup.com/mend/article-abstract...,"ES Park, AK Lind, P Dahm-Kähler…","Molecular …, 2010",The LH surge promotes terminal differentiation...


Wait at least 30 minutes.

### Part 7/16: Doctors 79-91

In [14]:
# 7th part
hcp_names = hcp_names_all[78:91].reset_index(drop = True)
hcp_names

0     Malgorzata Drozd-Lula
1                  Mari Åhs
2              Maria Ekholm
3         Gudrun Linderkers
4           Karin Maltenius
5            Maria Svensson
6            Marie Klintman
7            Marie Zajicova
8           Karin Jannesson
9             Kicki Klaeson
10          Martin Malmberg
11               Mats Andén
12              Mihalj Seke
dtype: object

In [15]:
# Run function
gs_bs_scraping(hcp_names, 7)

79/208 --- "Malgorzata Drozd-Lula"
1-10
79/208 --- "Malgorzata Drozd-Lula"
11-20
79/208 --- "Malgorzata Drozd-Lula"
21-30
80/208 --- "Mari Åhs"
1-10
80/208 --- "Mari Åhs"
11-20
80/208 --- "Mari Åhs"
21-30
81/208 --- "Maria Ekholm"
1-10
81/208 --- "Maria Ekholm"
11-20
81/208 --- "Maria Ekholm"
21-30
82/208 --- "Gudrun Linderkers"
1-10
82/208 --- "Gudrun Linderkers"
11-20
82/208 --- "Gudrun Linderkers"
21-30
83/208 --- "Karin Maltenius"
1-10
83/208 --- "Karin Maltenius"
11-20
83/208 --- "Karin Maltenius"
21-30
84/208 --- "Maria Svensson"
1-10
84/208 --- "Maria Svensson"
11-20
84/208 --- "Maria Svensson"
21-30
85/208 --- "Marie Klintman"
1-10
85/208 --- "Marie Klintman"
11-20
85/208 --- "Marie Klintman"
21-30
86/208 --- "Marie Zajicova"
1-10
86/208 --- "Marie Zajicova"
11-20
86/208 --- "Marie Zajicova"
21-30
87/208 --- "Karin Jannesson"
1-10
87/208 --- "Karin Jannesson"
11-20
87/208 --- "Karin Jannesson"
21-30
88/208 --- "Kicki Klaeson"
1-10
88/208 --- "Kicki Klaeson"
11-20
88/208 --- "Ki

In [7]:
results_queries_gs_7 = pd.read_csv(f"{route0}/results_queries_gs_7.csv")
results_queries_gs_7

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,Primary leiomyosarcoma of the thoracic aorta. ...,https://search.proquest.com/openview/791d68ebc...,M Drozd-Lula,"Wspólczesna Onkologia, 2001",The subject of this article is a 73-year old w...,Malgorzata Drozd-Lula,3
1,Primary leiomyosarcoma of the thoracic aorta. ...,https://search.proquest.com/openview/791d68ebc...,M Drozd-Lula,"Wspólczesna Onkologia, 2001",The subject of this article is a 73-year old w...,Malgorzata Drozd-Lula,3
2,Primary leiomyosarcoma of the thoracic aorta. ...,https://search.proquest.com/openview/791d68ebc...,M Drozd-Lula,"Wspólczesna Onkologia, 2001",The subject of this article is a 73-year old w...,Malgorzata Drozd-Lula,3
3,,,,,,Malgorzata Drozd-Lula,3
4,,,,,,Malgorzata Drozd-Lula,3
5,[HTML][HTML] Summary of international recommen...,https://www.thelancet.com/journals/lanonc/arti...,"D Mauri, K Kamposioras, M Tolia, F Alongi…","The Lancet …, 2020",Patients with cancer are at high risk for seri...,Mari Åhs,5
6,Mechanisms of cell death of thymocytes induced...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"M Åhs, A Prasad, Z Aminov…","Journal of cellular …, 2011",Polyunsaturated fatty acids (PUFAs) are rapidl...,Mari Åhs,5
7,Omega-3 and omega-6 fatty acids kill thymocyte...,https://benthamopen.com/ABSTRACT/TOCBJ-2-1,"A Prasad, M Åhs, A Goncharov…","The Open Cell …, 2010",Background: Omega-3 but not omega-6 fatty acid...,Mari Åhs,5
8,[PDF][PDF] Behind the numbers and the panic of...,https://jbuon.com/archive/25-3-1277.pdf,"D Mauri, D Tzachanis, A Valachis, K Kamposioras…","J BUON, 2020",To protect cancer patients from COVID-19 expos...,Mari Åhs,5
9,[PDF][PDF] Behind the numbers and the panic of...,https://jbuon.com/archive/25-3-1277.pdf,"D Mauri, D Tzachanis, A Valachis, K Kamposioras…","J BUON, 2020",To protect cancer patients from COVID-19 expos...,Mari Åhs,5


Wait at least 30 minutes.

### Part 8/16: Doctors 92-104

In [17]:
# 8th part
hcp_names = hcp_names_all[91:104].reset_index(drop = True)
hcp_names

0            Munila Mudaisi
1              Nils Wilking
2       Nils-Olof Bengtsson
3     Nina Letter Al-Ayoubi
4                 Pär Bodén
5            Paulina Krywda
6                 Pehr Lind
7               Per Byström
8                Per Edlund
9              Per Karlsson
10            Per Malmström
11     Kala Hatti Önnerfält
12       Anna-Karin Åkesson
dtype: object

In [20]:
# Run function
gs_bs_scraping(hcp_names, 8)

92/208 --- "Munila Mudaisi"
1-10
92/208 --- "Munila Mudaisi"
11-20
92/208 --- "Munila Mudaisi"
21-30
93/208 --- "Nils Wilking"
1-10
93/208 --- "Nils Wilking"
11-20
93/208 --- "Nils Wilking"
21-30
94/208 --- "Nils-Olof Bengtsson"
1-10
94/208 --- "Nils-Olof Bengtsson"
11-20
94/208 --- "Nils-Olof Bengtsson"
21-30
95/208 --- "Nina Letter Al-Ayoubi"
1-10
95/208 --- "Nina Letter Al-Ayoubi"
11-20
95/208 --- "Nina Letter Al-Ayoubi"
21-30
96/208 --- "Pär Bodén"
1-10
96/208 --- "Pär Bodén"
11-20
96/208 --- "Pär Bodén"
21-30
97/208 --- "Paulina Krywda"
1-10
97/208 --- "Paulina Krywda"
11-20
97/208 --- "Paulina Krywda"
21-30
98/208 --- "Pehr Lind"
1-10
98/208 --- "Pehr Lind"
11-20
98/208 --- "Pehr Lind"
21-30
99/208 --- "Per Byström"
1-10
99/208 --- "Per Byström"
11-20
99/208 --- "Per Byström"
21-30
100/208 --- "Per Edlund"
1-10
100/208 --- "Per Edlund"
11-20
100/208 --- "Per Edlund"
21-30
101/208 --- "Per Karlsson"
1-10
101/208 --- "Per Karlsson"
11-20
101/208 --- "Per Karlsson"
21-30
102/208 ---

In [8]:
results_queries_gs_8 = pd.read_csv(f"{route0}/results_queries_gs_8.csv")
results_queries_gs_8

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,[HTML][HTML] Disulfiram repurposing combined w...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,"AS Jakola, K Werlenius, M Mudaisi, S Hylin…","…, 2018",Background: Disulfiram (DSF) is a well-tolerat...,Munila Mudaisi,10
1,ABCB1 single-nucleotide variants and survival ...,https://www.nature.com/articles/s41397-019-0107-z,"A Malmström, M Łysiak, L Åkesson…","The …, 2020",Standard treatment for glioblastoma (GBM) pati...,Munila Mudaisi,10
2,[HTML][HTML] “Do I want to know it all?” A qua...,https://link.springer.com/article/10.1007/s005...,"A Malmström, L Åkesson, P Milos, M Mudaisi…","Supportive Care in …, 2021",Purpose Glioma patients have poor prognosis. T...,Munila Mudaisi,10
3,Deletions on Chromosome Y and Downregulation o...,https://www.mdpi.com/2072-6694/13/7/1619,"M Łysiak, A Smits, KR Roodakker, E Sandberg…","Cancers, 2021",Background: Biological causes of sex disparity...,Munila Mudaisi,10
4,Do I want to know it all?,https://www.diva-portal.org/smash/record.jsf?p...,"A Malmström, L Åkesson, P Milos, M Mudaisi…","Supportive Care in …, 2020",Purpose Glioma patients have poor prognosis. T...,Munila Mudaisi,10
5,[HTML][HTML] Assessment of genetic and non-gen...,https://www.sciencedirect.com/science/article/...,"MK Heenkenda, A Malmström, M Lysiak, M Mudaisi…","Thrombosis research, 2019",Introduction Venous thromboembolism (VTE) is a...,Munila Mudaisi,10
6,[PDF][PDF] controlled trial [version 1; refere...,https://www.researchgate.net/profile/Asgeir-Ja...,"AS Jakola, K Werlenius, M Mudaisi, S Hylin, S ...",2016,"Disulfiram (DSF) is a well-tolerated, inexpens...",Munila Mudaisi,10
7,ABO blood group is a potent risk factor for ve...,https://acsjournals.onlinelibrary.wiley.com/do...,"MB Streiff, J Segal, SA Grossman…","… Journal of the …, 2004",BACKGROUND Venous thromboembolism (VTE) is a c...,Munila Mudaisi,10
8,Impact of thrombophilic gene mutations on thro...,https://acsjournals.onlinelibrary.wiley.com/do...,"R Pihusch, G Danzl, M Scholz, D Harich, M Pihu...","Cancer, 2002",BACKGROUND Patients with malignancies have an ...,Munila Mudaisi,10
9,[HTML][HTML] Predictors of mosaic chromosome Y...,https://www.nature.com/articles/s41598-018-307...,"E Loftfield, W Zhou, BI Graubard, M Yeager…","Scientific reports, 2018",Mosaic loss of the Y chromosome (mLOY) is the ...,Munila Mudaisi,10


Wait at least 30 minutes.