# Web Scraping of PubMed

In this notebook, we perform web scraping of PubMed using the Python library PyMed. Our primary goal is to scrape papers and co-authors.

In [1]:
import pandas as pd
import numpy as np

import requests
import time
import itertools

import matplotlib.pyplot as plt
import json
import os

from pymed import PubMed  

In [2]:
# Create results folder
route0 = "../web_scraping_data"

if not os.path.exists(route0):
    os.mkdir(route0)

## Load HCPs

In [3]:
# Load the file with HCP data
hcp_df = pd.read_excel("../novartis_data/BC & Melanoma targets Sweden.xlsx")

# Detect copyrights, it's separated by NA line
ind_cpr = hcp_df.loc[pd.isna(hcp_df["Name"]), :].index[0]

# Remove copyrights and duplicates
hcp_names = pd.Series(hcp_df["Name"].head(ind_cpr).unique())

## Querying of PubMed using PyMed

In [4]:
# Initialization of API object
pubmed = PubMed(tool="MyTool", email="my@email.address")

In [9]:
start_time = time.time()

n_max = 100

papers_df = []

for i in range(len(hcp_names)): # Level: HCPs
    
    name_i = hcp_names.loc[i]
    query_i = f"{name_i}"
    print(f"{i + 1}/{len(hcp_names)} --- {query_i}")
    
    # Check how many articles there are
    results_i = pubmed.query(query_i, max_results=n_max)
    results_num = len([x for x in results_i])
    
    if results_num > 0:
        results_i = pubmed.query(query_i, max_results=n_max)
    else:
        results_i = []
        
    # Loop over the retrieved articles
    papers_df_i = []
    for article in results_i: # Level: Articles per HCP
        dict_i = json.loads(article.toJSON())

        # Drop unnecessary columns
        if "xml" in dict_i.keys():
            del dict_i["xml"]
        if "doi" in dict_i.keys():
            del dict_i["doi"]

        df_i = pd.DataFrame.from_dict([dict_i])
        df_i["hcp_name"] = name_i
        df_i["num_articles"] = results_num
        
        #extract co-authors
        first_name = []
        last_name = []
        
        for j in dict_i["authors"]:
            first_name.append(j['firstname'])
            last_name.append(j['lastname'])
        names = list(zip(first_name, last_name))
        names = str(names)
        df_i["authors"] = names

        papers_df_i.append(df_i) # this df now contains all the papers of one HCP
        
    if len(papers_df_i) > 0:
        papers_df_i = pd.concat(papers_df_i)
        papers_df.append(papers_df_i)
    else:
        papers_df_i = pd.DataFrame({
            "hcp_name": [name_i],
            "num_articles": [results_num]
        })
        papers_df.append(papers_df_i)
        
    # Store data at the end
    if i == len(hcp_names) - 1:
        print("saving file corresponding to results_queries_pm.csv")
        papers_df = pd.concat(papers_df).reset_index(drop = True)
        papers_df.to_csv(f"{route0}/results_queries_pm.csv", index = False)
        

end_time = time.time()
duration = end_time - start_time
print(f"{int(np.floor(duration/3600))} hours {(duration-3600*np.floor(duration/3600))/60:.4} minutes")

1/208 --- Adel Bader Hamdalla
2/208 --- Aglaia Schiza
3/208 --- Agneta Nordin Danfors
4/208 --- Ahmed Abbas Albu-Kareem
5/208 --- Alaa Haidar
6/208 --- Ana Bosch Campos
7/208 --- Andreas Nearchou
8/208 --- Ulrika Bergqvist
9/208 --- Ann Charlotte Dreifaldt
10/208 --- Elisabeth Ryd Ausén
11/208 --- Marie Santonsson
12/208 --- Anna Nordenskjöld
13/208 --- Anna von Wachenfeldt Väppling
14/208 --- Anna-Karin Tzikas
15/208 --- Anna-Karin Wennstig
16/208 --- Anna Maria Hasselgren Häll
17/208 --- Anne-Kristine Andersson
18/208 --- Antonios Valachis
19/208 --- Elisabet Karlsson
20/208 --- Barbro Linderholm
21/208 --- Birgitta Lind
22/208 --- Kristina Lindblom
23/208 --- Maria Sandström
24/208 --- Cecilia Graffman
25/208 --- Cecilia Nilsson
26/208 --- Chaido Chamalidou
27/208 --- Charlotte Bratthäll
28/208 --- Christina Haapaniemi Olsson
29/208 --- Christina Linder Stragliotto
30/208 --- Malin Steenhoff
31/208 --- Therése Widerberg
32/208 --- Claudia Lundgren
33/208 --- Dan Lundstedt
34/208 ---

## Results

In [5]:
results_queries_pm = pd.read_csv(f"{route0}/results_queries_pm.csv")
results_queries_pm

Unnamed: 0,hcp_name,num_articles,abstract,authors,conclusions,copyrights,journal,keywords,methods,publication_date,pubmed_id,results,title
0,Adel Bader Hamdalla,0,,,,,,,,,,,
1,Aglaia Schiza,6,The immune microenvironment is an important mo...,"[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell...",High TILs are associated with higher IBE risk ...,Copyright © 2022 The Author(s). Published by E...,"European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",,2022-03-04,35236568,Most women (61.9%) showed a TILs prevalence of...,Tumour-infiltrating lymphocytes add prognostic...
2,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"[('Thomas', 'Hatschek'), ('Theodoros', 'Foukak...",,,JAMA oncology,[],,2021-06-25,34165503,,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
3,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"[('Carina', 'Strell'), ('Dick', 'Folkvaljon'),...",,©2021 American Association for Cancer Research.,Clinical cancer research : an official journal...,[],,2021-05-07,33952629,PDGFRb score was predictive for RT benefit wit...,High PDGFRb Expression Predicts Resistance to ...
4,Aglaia Schiza,6,There are conflicting results on the potential...,"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('...","Our study results, based on propensity-matched...",,Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",,2020-12-02,33258078\n16000569\n26211827\n29242041\n279564...,"After propensity score matching, 4368 patients...",Predictive role of HER2-status on the effectiv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4142,Göran Carlstedt,0,,,,,,,,,,,
4143,Mikael Wallander,1,While recent randomised phase III trials show ...,"[('Mikael', 'Wallander'), ('Bo', 'Rolander'), ...",,2020 Journal of Gastrointestinal Oncology. All...,Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",,2020-09-22,32953145\n27522626\n23438360\n31914811\n274124...,,Real world aspects of palliative trifluridine ...
4144,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"[('Johan Staby', 'Olsén'), ('Dalia', 'Estefan'...",,Copyright © 2022 Termedia.,Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",,2022-03-03,35233229\n15465142\n20141674\n27771243\n233183...,,Predicting toxicity caused by high-dose-rate b...
4145,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"[('Ylva', 'Naeser'), ('Hildur', 'Helgadottir')...",,,Cancers,"['X-ray computed', 'follow-up studies', 'melan...",,2022-02-26,35205786\n9440735\n27183845\n8433390\n11832252...,,Quality of Life in the First Year of Follow-Up...
