# Web scraping of Google Scholar: parts 13-16

The following code performs web scraping of Google Scholar using the Python libraries Beautiful Soup and Requests. Scraping papers and co-authors is our main objective.

In [1]:
import numpy as np
import pandas as pd

import requests
import time
from time import sleep
from datetime import datetime
from random import randint

import os

from bs4 import BeautifulSoup
import lxml

In [2]:
# Create results folder
route0 = "../web_scraping_data"

if not os.path.exists(route0):
    os.mkdir(route0)

## Load HCPs

In [3]:
# Load the file with HCP data
hcp_df = pd.read_excel("../novartis_data/BC & Melanoma targets Sweden.xlsx")

# Detect copyrights, it's separated by NA line
ind_cpr = hcp_df.loc[pd.isna(hcp_df["Name"]), :].index[0]

# Remove copyrights and duplicates
hcp_names_all = pd.Series(hcp_df["Name"].head(ind_cpr).unique())

## Querying Google Scholar using Beautiful Soup and Requests

Since we do the scraping of Google Scholar not in one block, but in 16 blocks of 13 HCPs each, we now write a function that does the scraping for any single of the 16 blocks.

In [5]:
def gs_bs_scraping(hcp_names, position):

    start_time_d = datetime.now()
    start_time_t = time.time()

    papers_df = []

    for i in range(len(hcp_names)): # Level: HCPs
        results_num = 0
        for j in [0,10,20]: # Level: Page
            sleep(randint(1,5))
            name_i = hcp_names.loc[i]
            query_i = '"{}"'.format(name_i) # put the doctor's name in quotes 
            print(f"{(position - 1)*len(hcp_names) + i + 1}/{len(hcp_names_all)} --- {query_i}")
            print(f"{j + 1}-{j + 10}")

            headers = {
            'User-agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
            }

            proxies = {
            'http': os.getenv('HTTP_PROXY')
            }

            params = {
            "q": query_i, 
            "hl": "en",
            "start": j
            }

            html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params, proxies=proxies).text
        
            soup = BeautifulSoup(html, 'lxml')

            # Check how many articles there are
            results_i = soup.select('.gs_ri')
            results_num = results_num + len([x for x in results_i])

            if results_num > 0:
                results_i = soup.select('.gs_ri')
            else:
                results_i = []

            # Loop over retrieved articles
            papers_df_i = []
            for article in results_i: # Level: Articles
                try:
                    dict_i = {
                        "title": article.select_one('.gs_rt').text,
                        "title_link": article.select_one('.gs_rt a')['href'],
                        "authors" : article.select_one('.gs_a').text.split("- ")[0],
                        "publications" : article.select_one('.gs_a').text.split("- ")[1],
                        "snippet" : article.select_one('.gs_rs').text
                    }
                except:
                    pass

                df_i = pd.DataFrame.from_dict([dict_i])
                df_i["hcp_name"] = name_i
                df_i["num_articles"] = results_num

                papers_df_i.append(df_i)

            if len(papers_df_i) > 0:
                papers_df_i = pd.concat(papers_df_i)
                papers_df.append(papers_df_i)
            else:
                papers_df_i = pd.DataFrame({
                    "hcp_name": [name_i],
                    "num_articles": [results_num]
                })
                papers_df.append(papers_df_i)

        # Store data at the end
        if i == len(hcp_names) - 1:
            print(f"saving file corresponding to results_queries_gs_{position}.csv")
            papers_df = pd.concat(papers_df).reset_index(drop = True)
            papers_df.to_csv(f"{route0}/results_queries_gs_{position}.csv", index = False)
        
    end_time_d = datetime.now()
    end_time_t = time.time()
    duration = end_time_t - start_time_t
    print(f"start time: {start_time_d}")
    print(f"end time: {end_time_d}")
    print(f"duration: {int(np.floor(duration/3600))} hours {(duration-3600*np.floor(duration/3600))/60:.4} minutes")

### Part 13/16: Doctors 157-169

In [6]:
# 13th part
hcp_names = hcp_names_all[156:169].reset_index(drop = True)
hcp_names

0               Jenny Furubrand
1              Lena C Andersson
2               Karin Törnkvist
3                Sofia Wikström
4               Lena Westerberg
5                  Eva Lindblad
6             Liselott Sahlberg
7                  Per Nodbrant
8               Marie Johansson
9                   Sofie Dietl
10    Elisabet Olsson Kivipaasi
11                Karin Nilsson
12          Marianne Pettersson
dtype: object

In [13]:
# Run function
gs_bs_scraping(hcp_names, 13)

157/208 --- "Jenny Furubrand"
1-10
157/208 --- "Jenny Furubrand"
11-20
157/208 --- "Jenny Furubrand"
21-30
158/208 --- "Lena C Andersson"
1-10
158/208 --- "Lena C Andersson"
11-20
158/208 --- "Lena C Andersson"
21-30
159/208 --- "Karin Törnkvist"
1-10
159/208 --- "Karin Törnkvist"
11-20
159/208 --- "Karin Törnkvist"
21-30
160/208 --- "Sofia Wikström"
1-10
160/208 --- "Sofia Wikström"
11-20
160/208 --- "Sofia Wikström"
21-30
161/208 --- "Lena Westerberg"
1-10
161/208 --- "Lena Westerberg"
11-20
161/208 --- "Lena Westerberg"
21-30
162/208 --- "Eva Lindblad"
1-10
162/208 --- "Eva Lindblad"
11-20
162/208 --- "Eva Lindblad"
21-30
163/208 --- "Liselott Sahlberg"
1-10
163/208 --- "Liselott Sahlberg"
11-20
163/208 --- "Liselott Sahlberg"
21-30
164/208 --- "Per Nodbrant"
1-10
164/208 --- "Per Nodbrant"
11-20
164/208 --- "Per Nodbrant"
21-30
165/208 --- "Marie Johansson"
1-10
165/208 --- "Marie Johansson"
11-20
165/208 --- "Marie Johansson"
21-30
166/208 --- "Sofie Dietl"
1-10
166/208 --- "Sofie

In [5]:
results_queries_gs_13 = pd.read_csv(f"{route0}/results_queries_gs_13.csv")
results_queries_gs_13

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,"[HTML][HTML] Palliativ vård, cytostatikabehand...",https://www.diva-portal.org/smash/record.jsf?p...,"J Furubrand, E Johansson",2017,"Palliativ vård, cytostatikabehandling och bryt...",Jenny Furubrand,1
1,,,,,,Jenny Furubrand,1
2,,,,,,Jenny Furubrand,1
3,Our experience using the vertical rectus abdom...,https://www.tandfonline.com/doi/abs/10.1080/02...,"JHW Clarkson, F Probst, NS Niranjan…","… journal of plastic and …, 2003",The vertical rectus abdominis (VRAM) flap has ...,Lena C Andersson,5
4,Escherichia coli high cell density fed batch c...,https://elibrary.ru/item.asp?id=6880657,LC Andersson,1998,Degree: Takn. dr DegreeYear: 1996 Institute: K...,Lena C Andersson,5
5,[PDF][PDF] Intensive Care of Burns Patients,http://ndl.ethernet.edu.et/bitstream/123456789...,"LC Andersson, HC Nettelblad…","Clinical Intensive Care …, 2015",The treatment of burns has improved dramatical...,Lena C Andersson,5
6,[PDF][PDF] Intensive Care of Burns Patients,http://ndl.ethernet.edu.et/bitstream/123456789...,"LC Andersson, HC Nettelblad…","Clinical Intensive Care …, 2015",The treatment of burns has improved dramatical...,Lena C Andersson,5
7,[BOOK][B] Clinical Intensive Care Medicine,https://books.google.com/books?hl=en&lr=&id=0t...,CMH Gómez,2014,Intensive care patients are the most criticall...,Lena C Andersson,5
8,,,,,,Lena C Andersson,5
9,,,,,,Lena C Andersson,5


Wait at least 30 minutes.

### Part 14/16: Doctors 170-182

In [15]:
# 14th part
hcp_names = hcp_names_all[169:182].reset_index(drop = True)
hcp_names

0         Marinette Berglund
1     Sara Margareta Ekenbro
2            Lena Samuelsson
3         Margareth Schoultz
4              Hervor Gramén
5             Mari Johansson
6           Karin Samuelsson
7         Lise-Lotte Jönsson
8                Malin Ståhl
9                  Lena Berg
10           Camilla Persson
11              Marie Boberg
12         Pernilla Karlsson
dtype: object

In [16]:
# Run function
gs_bs_scraping(hcp_names, 14)

170/208 --- "Marinette Berglund"
1-10
170/208 --- "Marinette Berglund"
11-20
170/208 --- "Marinette Berglund"
21-30
171/208 --- "Sara Margareta Ekenbro"
1-10
171/208 --- "Sara Margareta Ekenbro"
11-20
171/208 --- "Sara Margareta Ekenbro"
21-30
172/208 --- "Lena Samuelsson"
1-10
172/208 --- "Lena Samuelsson"
11-20
172/208 --- "Lena Samuelsson"
21-30
173/208 --- "Margareth Schoultz"
1-10
173/208 --- "Margareth Schoultz"
11-20
173/208 --- "Margareth Schoultz"
21-30
174/208 --- "Hervor Gramén"
1-10
174/208 --- "Hervor Gramén"
11-20
174/208 --- "Hervor Gramén"
21-30
175/208 --- "Mari Johansson"
1-10
175/208 --- "Mari Johansson"
11-20
175/208 --- "Mari Johansson"
21-30
176/208 --- "Karin Samuelsson"
1-10
176/208 --- "Karin Samuelsson"
11-20
176/208 --- "Karin Samuelsson"
21-30
177/208 --- "Lise-Lotte Jönsson"
1-10
177/208 --- "Lise-Lotte Jönsson"
11-20
177/208 --- "Lise-Lotte Jönsson"
21-30
178/208 --- "Malin Ståhl"
1-10
178/208 --- "Malin Ståhl"
11-20
178/208 --- "Malin Ståhl"
21-30
179/208

In [6]:
results_queries_gs_14 = pd.read_csv(f"{route0}/results_queries_gs_14.csv")
results_queries_gs_14

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,Objective measurements of radiotherapy‐induced...,https://onlinelibrary.wiley.com/doi/abs/10.111...,"J Nyström, P Geladi…","Skin Research and …, 2004",… The European Union Structure Foundation Obje...,Marinette Berglund,1
1,,,,,,Marinette Berglund,1
2,,,,,,Marinette Berglund,1
3,,,,,,Sara Margareta Ekenbro,0
4,,,,,,Sara Margareta Ekenbro,0
5,,,,,,Sara Margareta Ekenbro,0
6,[HTML][HTML] Genome-wide association study ide...,https://www.ncbi.nlm.nih.gov/pmc/articles/pmc3...,"A Strange, F Capon, CCA Spencer, J Knight…","Nature …, 2010",(WTCCC2) consortia is provided in the Suppleme...,Lena Samuelsson,10
7,[HTML][HTML] Identification of 15 new psoriasi...,https://www.nature.com/articles/ng.2467,"LC Tsoi, SL Spain, J Knight, E Ellinghaus, PE ...","Nature …, 2012",To gain further insight into the genetic archi...,Lena Samuelsson,10
8,Stress and well‐being among parents of childre...,https://onlinelibrary.wiley.com/doi/abs/10.111...,"L Dellve, L Samuelsson, A Tallborn…","Journal of advanced …, 2006",Aim. This paper reports a study to assess stre...,Lena Samuelsson,10
9,[HTML][HTML] SLC9A6 mutations cause X-linked m...,https://www.sciencedirect.com/science/article/...,"GD Gilfillan, KK Selmer, I Roxrud, R Smith…","The American Journal of …, 2008",Linkage analysis and DNA sequencing in a famil...,Lena Samuelsson,10


Wait at least 30 minutes.

### Part 15/16: Doctors 183-195

In [18]:
# 15th part
hcp_names = hcp_names_all[182:195].reset_index(drop = True)
hcp_names

0          Lotta Henriksson
1           Karin Johansson
2        Charlotta Pramsten
3             Lola Svensson
4             Daniel Giglio
5        Elsy-Britt Schildt
6     Erika Isaksson Friman
7     Eva Djureen Mårtenson
8                   Lars Ny
9          Harriet Axelsson
10          Gustav Silander
11               Lisa Rydén
12           Carina Larsson
dtype: object

In [19]:
# Run function
gs_bs_scraping(hcp_names, 15)

183/208 --- "Lotta Henriksson"
1-10
183/208 --- "Lotta Henriksson"
11-20
183/208 --- "Lotta Henriksson"
21-30
184/208 --- "Karin Johansson"
1-10
184/208 --- "Karin Johansson"
11-20
184/208 --- "Karin Johansson"
21-30
185/208 --- "Charlotta Pramsten"
1-10
185/208 --- "Charlotta Pramsten"
11-20
185/208 --- "Charlotta Pramsten"
21-30
186/208 --- "Lola Svensson"
1-10
186/208 --- "Lola Svensson"
11-20
186/208 --- "Lola Svensson"
21-30
187/208 --- "Daniel Giglio"
1-10
187/208 --- "Daniel Giglio"
11-20
187/208 --- "Daniel Giglio"
21-30
188/208 --- "Elsy-Britt Schildt"
1-10
188/208 --- "Elsy-Britt Schildt"
11-20
188/208 --- "Elsy-Britt Schildt"
21-30
189/208 --- "Erika Isaksson Friman"
1-10
189/208 --- "Erika Isaksson Friman"
11-20
189/208 --- "Erika Isaksson Friman"
21-30
190/208 --- "Eva Djureen Mårtenson"
1-10
190/208 --- "Eva Djureen Mårtenson"
11-20
190/208 --- "Eva Djureen Mårtenson"
21-30
191/208 --- "Lars Ny"
1-10
191/208 --- "Lars Ny"
11-20
191/208 --- "Lars Ny"
21-30
192/208 --- "Har

In [7]:
results_queries_gs_15 = pd.read_csv(f"{route0}/results_queries_gs_15.csv")
results_queries_gs_15

Unnamed: 0,title,title_link,authors,publications,snippet,hcp_name,num_articles
0,Idiotype immunization combined with granulocyt...,https://ashpublications.org/blood/article-abst...,"A Österborg, Q Yi, L Henriksson…","Blood, The Journal …, 1998",Idiotypic structures expressed on the myeloma ...,Lotta Henriksson,10
1,[HTML][HTML] Augmentation of the immune respon...,https://journals.lww.com/co-hematology/fulltex...,"H Mellstedt, J Fagerberg, JE Frödin…","Current opinion in …, 1999",Granulocyte-macrophage colony-stimulating fact...,Lotta Henriksson,10
2,Idiotype immunity (natural and vaccine-induced...,https://www.tandfonline.com/doi/abs/10.1080/02...,"A OÈsterborg, L Henriksson, H Mellstedt","Acta Oncologica, 2000",Idiotypic structures expressed on the myeloma ...,Lotta Henriksson,10
3,Ravitsemuksen ja liikunnan vaikutukset hedelmä...,https://www.theseus.fi/handle/10024/97705,"L Henriksson, L Starmans",2015,Opinnäytetyö toteutettiin kirjallisuuskatsauks...,Lotta Henriksson,10
4,Ravitsemuksen ja liikunnan vaikutukset hedelmä...,https://www.theseus.fi/handle/10024/97705,"L Henriksson, L Starmans",2015,Opinnäytetyö toteutettiin kirjallisuuskatsauks...,Lotta Henriksson,10
5,Ravitsemuksen ja liikunnan vaikutukset hedelmä...,https://www.theseus.fi/handle/10024/97705,"L Henriksson, L Starmans",2015,Opinnäytetyö toteutettiin kirjallisuuskatsauks...,Lotta Henriksson,10
6,[PDF][PDF] Föredragningslista Föredragande,https://www.uddevalla.se/download/18.6ccd0e1f1...,OA Högberg,2015,Ett medborgarförslag har inkommit från Claes G...,Lotta Henriksson,10
7,Att bli pensionär: övergången från yrkesroll t...,https://www.diva-portal.org/smash/record.jsf?p...,"R Fara, M Weinmann",2006,Denna uppsats handlar om hur anpassningen till...,Lotta Henriksson,10
8,” Jag skulle inte vilja dö ensam”: sjukhemsper...,https://www.diva-portal.org/smash/record.jsf?p...,"J Boij, G Hedlund",2005,"Det blir allt fler av kategorin äldre idag, oc...",Lotta Henriksson,10
9,Genetic immunotherapy of established tumors wi...,https://www.liebertpub.com/doi/abs/10.1089/hum...,"CT Lee, S Wu, IF Ciernik, H Chen…","Human gene …, 1997",Increased local production of granulocyte-macr...,Lotta Henriksson,10


Wait at least 30 minutes.

### Part 16/16: Doctors 196-208

In [21]:
# 16th part
hcp_names = hcp_names_all[195:208].reset_index(drop = True)
hcp_names

0             Karin Lycknert
1               Niklas Loman
2         Olga Del Val Munoz
3          Olof Bjarnadottir
4            Samuel Rotstein
5               Sara Kinhult
6         Theodoros Foukakis
7            Maria Gränström
8     Ylva Holmgren Stenlund
9                Anna Nyberg
10           Göran Carlstedt
11          Mikael Wallander
12           Frida Jakobsson
dtype: object

In [22]:
# Run function
gs_bs_scraping(hcp_names, 16)

196/208 --- "Karin Lycknert"
1-10
196/208 --- "Karin Lycknert"
11-20
196/208 --- "Karin Lycknert"
21-30
197/208 --- "Niklas Loman"
1-10
197/208 --- "Niklas Loman"
11-20
197/208 --- "Niklas Loman"
21-30
198/208 --- "Olga Del Val Munoz"
1-10
198/208 --- "Olga Del Val Munoz"
11-20
198/208 --- "Olga Del Val Munoz"
21-30
199/208 --- "Olof Bjarnadottir"
1-10
199/208 --- "Olof Bjarnadottir"
11-20
199/208 --- "Olof Bjarnadottir"
21-30
200/208 --- "Samuel Rotstein"
1-10
200/208 --- "Samuel Rotstein"
11-20
200/208 --- "Samuel Rotstein"
21-30
201/208 --- "Sara Kinhult"
1-10
201/208 --- "Sara Kinhult"
11-20
201/208 --- "Sara Kinhult"
21-30
202/208 --- "Theodoros Foukakis"
1-10
202/208 --- "Theodoros Foukakis"
11-20
202/208 --- "Theodoros Foukakis"
21-30
203/208 --- "Maria Gränström"
1-10
203/208 --- "Maria Gränström"
11-20
203/208 --- "Maria Gränström"
21-30
204/208 --- "Ylva Holmgren Stenlund"
1-10
204/208 --- "Ylva Holmgren Stenlund"
11-20
204/208 --- "Ylva Holmgren Stenlund"
21-30
205/208 --- "

In [8]:
results_queries_gs_16 = pd.read_csv(f"{route0}/results_queries_gs_16.csv")
results_queries_gs_16

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Karin Lycknert,0,,,,,
1,Karin Lycknert,0,,,,,
2,Karin Lycknert,0,,,,,
3,Niklas Loman,10,[HTML][HTML] Average risks of breast and ovari...,https://www.sciencedirect.com/science/article/...,"A Antoniou, PDP Pharoah, S Narod, HA Risch…","The American Journal of …, 2003",Germline mutations in BRCA1 and BRCA2 confer h...
4,Niklas Loman,10,[HTML][HTML] Gene-expression profiles in hered...,https://www.nejm.org/doi/full/10.1056/nejm2001...,"I Hedenfalk, D Duggan, Y Chen…","… England Journal of …, 2001",Background Many cases of hereditary breast can...
5,Niklas Loman,10,[HTML][HTML] Oral poly (ADP-ribose) polymerase...,https://www.sciencedirect.com/science/article/...,"A Tutt, M Robson, JE Garber, SM Domchek, MW Au...","The Lancet, 2010","Background Olaparib, a novel, orally active po..."
6,Niklas Loman,10,[HTML][HTML] Oral poly (ADP-ribose) polymerase...,https://www.sciencedirect.com/science/article/...,"MW Audeh, J Carmichael, RT Penson, M Friedland...","The lancet, 2010","Background Olaparib is a novel, orally active ..."
7,Niklas Loman,10,[HTML][HTML] Olaparib monotherapy in patients ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,"B Kaufman, R Shapira-Frommer…","Journal of clinical …, 2015","… Stemmer, Ayala Hubert, Ora Rosengarten, Mari..."
8,Niklas Loman,10,A locus on 19p13 modifies risk of breast cance...,https://www.nature.com/articles/ng.669?message...,"AC Antoniou, X Wang, ZS Fredericksen, L McGuff...","Nature …, 2010",Germline BRCA1 mutations predispose to breast ...
9,Niklas Loman,10,Association of type and location of BRCA1 and ...,https://jamanetwork.com/journals/jama/article-...,"TR Rebbeck, N Mitra, F Wan, OM Sinilnikova, S ...","Jama, 2015",Importance Limited information about the relat...


Wait at least 30 minutes.

## Stack all 16 dataset and save the final dataset 

Having scraped Google Scholar for all 16 blocks, we finally stack all 16 dataset that we obtained and save it as one dataset.

In [9]:
df_names = []
for i in range(16):
    df_names = df_names + [f"results_queries_gs_{i + 1}"]
    
df_dict = dict()
for df_name in df_names:
    df_dict[df_name] = pd.read_csv(f"{route0}/{df_name}.csv")

In [10]:
df_dict['results_queries_gs_1']

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,,
1,Adel Bader Hamdalla,0,,,,,
2,Adel Bader Hamdalla,0,,,,,
3,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...",https://jamanetwork.com/journals/jamaoncology/...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
4,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,https://translational-medicine.biomedcentral.c...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
5,Aglaia Schiza,10,High PDGFRb Expression Predicts Resistance to ...,https://clincancerres.aacrjournals.org/content...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
6,Aglaia Schiza,10,[HTML][HTML] Evaluation of diffusion-weighted ...,https://www.nature.com/articles/s41598-019-544...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
7,Aglaia Schiza,10,[HTML][HTML] Local irradiation does not enhanc...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
8,Aglaia Schiza,10,[HTML][HTML] Predictive role of HER2-status on...,https://link.springer.com/article/10.1007/s105...,"A Schiza, D Mauri, I Fredriksson, AK Wennstig…","Breast Cancer Research …, 2021",Purpose There are conflicting results on the p...
9,Aglaia Schiza,10,Abstract PO-018: Inflaming advanced solid tumo...,https://cancerres.aacrjournals.org/content/81/...,"J Wenthe, E Eriksson, L Sandin, T Lövgren, JL ...",2021,Pancreatic ductal adenocarcinoma (PDAC) is res...


In [11]:
for value in df_dict:
    df_dict[value] = df_dict[value][['hcp_name', 'num_articles', 'title', 'title_link', 'authors', 'publications', 'snippet']]

In [12]:
results_queries_gs = pd.DataFrame()

for i in range(16):
    results_queries_gs = results_queries_gs.append(df_dict[f"results_queries_gs_{i + 1}"], ignore_index=True)
    
results_queries_gs

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,,
1,Adel Bader Hamdalla,0,,,,,
2,Adel Bader Hamdalla,0,,,,,
3,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...",https://jamanetwork.com/journals/jamaoncology/...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
4,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,https://translational-medicine.biomedcentral.c...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
5,Aglaia Schiza,10,High PDGFRb Expression Predicts Resistance to ...,https://clincancerres.aacrjournals.org/content...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
6,Aglaia Schiza,10,[HTML][HTML] Evaluation of diffusion-weighted ...,https://www.nature.com/articles/s41598-019-544...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
7,Aglaia Schiza,10,[HTML][HTML] Local irradiation does not enhanc...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
8,Aglaia Schiza,10,[HTML][HTML] Predictive role of HER2-status on...,https://link.springer.com/article/10.1007/s105...,"A Schiza, D Mauri, I Fredriksson, AK Wennstig…","Breast Cancer Research …, 2021",Purpose There are conflicting results on the p...
9,Aglaia Schiza,10,Abstract PO-018: Inflaming advanced solid tumo...,https://cancerres.aacrjournals.org/content/81/...,"J Wenthe, E Eriksson, L Sandin, T Lövgren, JL ...",2021,Pancreatic ductal adenocarcinoma (PDAC) is res...


In [8]:
print(f"saving file corresponding to results_queries_gs.csv")
results_queries_gs.to_csv(f"{route0}/results_queries_gs.csv", index = False)

saving file corresponding to results_queries_gs.csv


In [13]:
# Read in data scraped from Google Scholar
results_queries_gs = pd.read_csv(f"{route0}/results_queries_gs.csv")
results_queries_gs.shape    

(3778, 7)

## Results

In [14]:
results_queries_gs

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,,
1,Adel Bader Hamdalla,0,,,,,
2,Adel Bader Hamdalla,0,,,,,
3,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...",https://jamanetwork.com/journals/jamaoncology/...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
4,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,https://translational-medicine.biomedcentral.c...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
5,Aglaia Schiza,10,High PDGFRb Expression Predicts Resistance to ...,https://clincancerres.aacrjournals.org/content...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
6,Aglaia Schiza,10,[HTML][HTML] Evaluation of diffusion-weighted ...,https://www.nature.com/articles/s41598-019-544...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
7,Aglaia Schiza,10,[HTML][HTML] Local irradiation does not enhanc...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
8,Aglaia Schiza,10,[HTML][HTML] Predictive role of HER2-status on...,https://link.springer.com/article/10.1007/s105...,"A Schiza, D Mauri, I Fredriksson, AK Wennstig…","Breast Cancer Research …, 2021",Purpose There are conflicting results on the p...
9,Aglaia Schiza,10,Abstract PO-018: Inflaming advanced solid tumo...,https://cancerres.aacrjournals.org/content/81/...,"J Wenthe, E Eriksson, L Sandin, T Lövgren, JL ...",2021,Pancreatic ductal adenocarcinoma (PDAC) is res...
