# Preparation of PubMed data

This notebook deals with the PubMed data. It first takes a look at the basic structure of the data. Then, it analyzes the PubMed data and prepares it. 

In [1]:
import numpy as np
import pandas as pd

import requests
import time

import json
import os

import pandas as pd
from bs4 import BeautifulSoup
import lxml

from gensim.parsing.preprocessing import remove_stopwords

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

## Define helper functions

In [2]:
def print_index_and_value(data, variable):
    """
    Print the index and value of a variable in a data frame.
    """
    for i in range(len(data)):
        print(i)
        print(data[variable][i])

In [3]:
def truncate_names(data, variable):
    """
    Truncate names, i.e.only keep first letter of first name and last name).
    """
    for i in range(len(data)):
        names_list = data[variable][i].split(", ")
        names_list_truncated = []
        for j in range(len(names_list)):
            names_list_split = names_list[j].replace("-", " ").split(" ")
            name = names_list_split[0] + ' ' + names_list_split[-1] # keep only first letter of first name
            names_list_truncated += [name]
            
        names_truncated = ", ".join(names_list_truncated) 
            
        data.at[i, variable] = names_truncated
        
    return data

In [4]:
def clean_authors(data, pubmed=False, gs=False):
    """
    Clean the authors column (remove brackets and quotes).
    """
    if pubmed:
        for i in range(len(data)):
                authors = data['authors'].iloc[i].replace("[", "")\
                                                    .replace("]", "")\
                                                        .replace("('", "")\
                                                            .replace("')", "")\
                                                                .replace("', '", " ")\
                                                                    .replace("(\'", "")\
                                                                        .replace("\')", "")\
                                                                            .replace('("', '')\
                                                                                .replace('")', '')\
                                                                                    .replace("\', \'", " ")\
                                                                                        .replace('\', "', ' ')\
                                                                                            .replace('", \'', ' ')\
                                                                                                .replace("\'", "")
        
                authors = authors.split(", ")
        
                index_list = []
                for j in range(len(authors)):
                    if authors[j] == "(None":
                        index_list += [j] + [j+1]
        
                index_list = list(set(index_list))
                full_index_list = list(range(len(authors)))
                index_list_choose = list(set(full_index_list) - set(index_list))
                authors = [authors[k] for k in index_list_choose]
                
                authors = ", ".join(authors)
                
                data.at[i, 'authors'] = authors
        
        # remove the rows where no authors are left
        data = data[data['authors'] != ''].reset_index(drop=True)
        
    if gs:
        for i in range(len(data)):
            data.at[i, 'authors'] = data['authors'][i].strip()\
                                                        .replace("\xa0", "")\
                                                            .replace("…", "")\
                                                                . replace("'", "")
    
    return data

In [5]:
def remove_wrongly_scraped_papers(data):
    """
    Remove papers that were scraped but do not belong to the respective hcp.
    """
    index_list = []
    
    for i in range(len(data)):
        if data['hcp_name'][i] in data['authors'][i]:
            index_list += [i]
    
    data = data.iloc[index_list, :].reset_index(drop=True)
    
    return data

In [6]:
def clean_abstract(data, gs=False):
    """
    Clean abtracts.
    """
    for i in range(len(data)):
        abstract = data['abstract'][i]
        if pd.notnull(data.at[i,'abstract']):
            abstract = remove_stopwords(abstract)
            if gs:
                abstract = abstract.replace(" …", "")
            data.at[i, "abstract"] = abstract.lower().replace('[^\w\s]','')
    
    return data

In [7]:
def clean_titles(data, variable, gs = False):
    """
    Clean titles.
    """
    for i in range(len(data)):
        title = data[variable][i]
        if pd.notnull(data.at[i, variable]):
            title = remove_stopwords(title)
            
            if gs:
                title = title.replace("[BOOK][B] ", "")\
                                .replace("[CITATION][C] ", "")\
                                    .replace("[HTML][HTML] ", "")\
                                        .replace("[PDF][PDF] ", "")\
                                            .replace("\xa0…", "")
                
            data.at[i, variable] = title.lower()
                
    data[variable] = data[variable].str.replace('[^\w\s]','')
    
    return data

In [8]:
def clean_keywords(data):
    """
    Clean the keywords.
    """
    for i in range(len(data)):
        keywords = data["keywords"].iloc[i].replace('"',"'")\
                                                .replace("'", "")\
                                                    .strip('][')
        
        data.at[i, 'keywords'] = keywords
        
    data['keywords'] = data['keywords'].str.lower()
    
    data.loc[data["keywords"] == '', ["keywords"]] = np.nan
        
    return data

In [9]:
def combine_keywords(data):
    """
    Combine the keywords.
    """
    keywords_list = []

    for i in range(len(data)):
        if pd.notnull(data.at[i,'keywords']):
            keywords_list = keywords_list + data.loc[i, "keywords"].split(", ")
    
    keywords_list = list(set(keywords_list))
    
    return keywords_list

In [10]:
def update_num_articles(data):
    """
    Update the number of articles.
    """
    for i in range(len(data)):
        doctor = data['hcp_name'][i]
        num_papers = len(data[data['hcp_name'] == doctor])
        data.at[i, 'num_articles'] = num_papers
        
    return data

## Load data

In [11]:
# Read in data scraped from PubMed
hcp_df_pubmed = pd.read_csv("../../0_raw_data/web_scraping_data/results_queries_pm.csv")
hcp_df_pubmed.shape

(4147, 13)

In [12]:
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,conclusions,copyrights,journal,keywords,methods,publication_date,pubmed_id,results,title
0,Adel Bader Hamdalla,0,,,,,,,,,,,
1,Aglaia Schiza,6,The immune microenvironment is an important mo...,"[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell...",High TILs are associated with higher IBE risk ...,Copyright © 2022 The Author(s). Published by E...,"European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",,2022-03-04,35236568,Most women (61.9%) showed a TILs prevalence of...,Tumour-infiltrating lymphocytes add prognostic...
2,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"[('Thomas', 'Hatschek'), ('Theodoros', 'Foukak...",,,JAMA oncology,[],,2021-06-25,34165503,,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
3,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"[('Carina', 'Strell'), ('Dick', 'Folkvaljon'),...",,©2021 American Association for Cancer Research.,Clinical cancer research : an official journal...,[],,2021-05-07,33952629,PDGFRb score was predictive for RT benefit wit...,High PDGFRb Expression Predicts Resistance to ...
4,Aglaia Schiza,6,There are conflicting results on the potential...,"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('...","Our study results, based on propensity-matched...",,Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",,2020-12-02,33258078\n16000569\n26211827\n29242041\n279564...,"After propensity score matching, 4368 patients...",Predictive role of HER2-status on the effectiv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4142,Göran Carlstedt,0,,,,,,,,,,,
4143,Mikael Wallander,1,While recent randomised phase III trials show ...,"[('Mikael', 'Wallander'), ('Bo', 'Rolander'), ...",,2020 Journal of Gastrointestinal Oncology. All...,Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",,2020-09-22,32953145\n27522626\n23438360\n31914811\n274124...,,Real world aspects of palliative trifluridine ...
4144,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"[('Johan Staby', 'Olsén'), ('Dalia', 'Estefan'...",,Copyright © 2022 Termedia.,Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",,2022-03-03,35233229\n15465142\n20141674\n27771243\n233183...,,Predicting toxicity caused by high-dose-rate b...
4145,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"[('Ylva', 'Naeser'), ('Hildur', 'Helgadottir')...",,,Cancers,"['X-ray computed', 'follow-up studies', 'melan...",,2022-02-26,35205786\n9440735\n27183845\n8433390\n11832252...,,Quality of Life in the First Year of Follow-Up...


## Initial preparatory steps

The following things should be done in the initial preparatory steps and the cleaning process:
* only keep relevant variables
* delete rows without authors (then, the only missing thing are missing keywords and missing titles)
* bring hcp_name into name format (first letter of first name, entire last name) 
* num_articles are updated 
* abstract: lower case, remove stopwords, etc. 
* authors: clean them to first letter of first name format
* journal: lower case 
* keywords: clean them 
* publication_date: rename to publication_year and only keep year
* title: lower case 
* add new column 'scraped_from' with value 'pm'

#### Missing data 

In [13]:
print("Count total NaN in each column:")
hcp_df_pubmed.isnull().sum()

Count total NaN in each column:


hcp_name               0
num_articles           0
abstract             380
authors               43
conclusions         3427
copyrights          2393
journal               43
keywords              43
methods             4077
publication_date      43
pubmed_id             43
results             2486
title                 67
dtype: int64

Therefore, only the rows where the authors, journal, keywords, publication date and title are missing are of interest. 

In [14]:
hcp_df_pubmed[hcp_df_pubmed['authors'].isnull()]

Unnamed: 0,hcp_name,num_articles,abstract,authors,conclusions,copyrights,journal,keywords,methods,publication_date,pubmed_id,results,title
0,Adel Bader Hamdalla,0,,,,,,,,,,,
134,Elisabeth Ryd Ausén,0,,,,,,,,,,,
274,Anna Maria Hasselgren Häll,0,,,,,,,,,,,
275,Anne-Kristine Andersson,0,,,,,,,,,,,
337,Birgitta Lind,0,,,,,,,,,,,
425,Christina Haapaniemi Olsson,0,,,,,,,,,,,
431,Malin Steenhoff,0,,,,,,,,,,,
432,Therése Widerberg,0,,,,,,,,,,,
453,Dawid Bulanda,0,,,,,,,,,,,
554,Yohana Collins Bikova,0,,,,,,,,,,,


Apparently, these are the authors for whom no papers could be scraped. So, this explains why there are 45 missings for the other variables of interest except titles where 68 missings are present. Drop these 45 rows.

In [15]:
hcp_df_pubmed[(hcp_df_pubmed['title'].isnull()) & (hcp_df_pubmed['authors'].notnull())]

Unnamed: 0,hcp_name,num_articles,abstract,authors,conclusions,copyrights,journal,keywords,methods,publication_date,pubmed_id,results,title
25,Ahmed Abbas Albu-Kareem,100,There is an extensive search for natural produ...,"[('Mohamed I', 'Alzarah'), ('Fayez', 'Althobia...",,,Animals : an open access journal from MDPI,"['Citrullus colocynthis', 'broiler', 'cell-med...",,2021-07-03,34208851\n31480196\n15206614\n32118068\n327223...,,
44,Ahmed Abbas Albu-Kareem,100,,"[('Serag Eldin I', 'Elbehairi'), ('Ahmed', 'Ez...",,Copyright © 2020 Elbehairi et al.,EXCLI journal,"['HePG2', 'LS-174T', 'MCF-7', 'Prosopis julifl...",,2020-11-17,33192211\n28577281\n31007071\n23595000\n884558...,,
498,Julia Hallerfelt,100,,"[('Julia', 'Möhring')]",,,MMW Fortschritte der Medizin,[],,2022-02-12,35146679,,
931,Gunnar Lengstrand,100,Anti-prostate specific membrane antigen (PSMA)...,"[('Amanda', 'Kristiansson'), ('Anders', 'Örbom...",,,Biomolecules,"['[177Lu]Lu-PSMA-617', '[99mTc]Tc-MAG3 imaging...",,2021-02-14,33579037\n23228112\n2032890\n25301760\n2716022...,,
1054,Johan Ahlgren,62,"Although small, node-negative breast cancer (i...","[('David', 'Jaraj'), ('Jonas', 'Höijer'), ('Li...",,© The Author(s) 2020. Published by Oxford Univ...,JNCI cancer spectrum,[],,2021-01-15,33442658\n8229123\n30739743\n21764391\n1511798...,,
1208,Jonas Bergh,100,"Although small, node-negative breast cancer (i...","[('David', 'Jaraj'), ('Jonas', 'Höijer'), ('Li...",,© The Author(s) 2020. Published by Oxford Univ...,JNCI cancer spectrum,[],,2021-01-15,33442658\n8229123\n30739743\n21764391\n1511798...,,
1268,Jonas Bergh,100,,"[('Jonas', 'Bergh')]",,,Lakartidningen,[],,2018-01-03,29292967,,
1292,Jonas Holm,27,The primary care physician's traditional patie...,"[('Hanna', 'Glock'), ('Veronica', 'Milos Nymbe...",,© 2021 Glock et al.,International journal of general medicine,"['attitude of health personnel', 'eHealth', 'g...",,2021-12-10,34880663\n18929686\n34156336\n33956524\n270719...,,
1298,Jonas Holm,27,,"[('Anne Jæhger', 'Bystrup'), ('Jonas Peter', '...",,,Ugeskrift for laeger,[],,2019-07-04,31267943,,
1484,Gudrun Linderkers,100,,"[('Elke', 'Pfaff'), ('Tiphaine', 'Adam de Beau...",,,JCO precision oncology,[],,2022-01-08,34994601,,


The missing papers are:

1. *Citrullus colocynthis* Seeds: A Potential Natural Immune Modulator Source for Broiler Reared under Chronic Heat Stress 
2. *Prosopis juliflora* leave extracts induce cell death of MCF-7, HepG2, and LS-174T cancer cell lines 
3. <sup> 177 </sup> Lu-PSMA-617 Therapy in Mice, with or without the Antioxidant α<sub>1</sub>-Microglobulin (A1M), Including Kidney Damage Assessment Using <sup> 99m </sup> Tc-MAG3 Imaging
4. **Long-Term Prognostication for 20 114 Women With Small and Node-Negative Breast Cancer (T1abN0).**
5. **Long-Term Prognostication for 20 114 Women With Small and Node-Negative Breast Cancer (T1abN0).**
6. Apropå! En välfungerande enhet.
7. *Attitudes*, *Barriers*, and *Concerns* Regarding Telemedicine Among Swedish Primary Care Physicians: A Qualitative Study.
8. Ensidig sekretorisk otitis media hos voksne kan være et tegn på malignitet.
9. *NTRK* Alterations in Pediatric High-Risk Malignancies Identified Through European Clinical Sequencing Programs Constitute Promising Drug Targets. (there are no papers by Gudrun Linderkers)
10. *Far Beyond the Moon: A History of Life Support Systems in the Space Age* by David P. D. Munns and Kärin Nickelsen (review) (there are no papers by Karin Maltenius)
11. **Long-Term Prognostication for 20 114 Women With Small and Node-Negative Breast Cancer (T1abN0).**
12. Nils Wilking och Anna Forsberg svarar Jakob Eberhard et al: - Skånsk GI-onkologi kolliderar med vetenskap och nationella riktlinjer
13. Reflexiones sobre la estrategia de vacunación en México para personas de 50 a 59 años. (there are no papers by Paulina Krywda) 
14. *In Vitro* Biofilm Formation on Aryl Ketone Polymer (AKP), A New Denture Material, Compared with That on Three Traditional Dental Denture Materials (there are no papers by Paulina Krywda)
15. *PHIP* - a novel candidate breast cancer susceptibility locus on 6q14.1. 
16. *BRCA2* Hypomorphic Missense Variants Confer Moderate Risks of Breast Cancer
17. **Long-Term Prognostication for 20 114 Women With Small and Node-Negative Breast Cancer (T1abN0).**
18. **Long-Term Prognostication for 20 114 Women With Small and Node-Negative Breast Cancer (T1abN0).**
19. *CDKN2A* genetic testing in melanoma-prone families in Sweden in the years 2015-2020: implications for novel national recommendations
20. Årets Nobelprisbelönta antikroppar i cancervården - Nu etableras immunterapi som primär behandling vid folksjukdomen cancer – med visionen att alla ska botas
21. ABC om - Perikardit och myokardit
22. *ERBB2* and *PTPN2 g* ene copy numbers as prognostic factors in HER2-positive metastatic breast cancer treated with trastuzumab
23. *CDKN2A* genetic testing in melanoma-prone families in Sweden in the years 2015-2020: implications for novel national recommendations

We can see that the titles can obviously not be scraped when the title contains an italic string, a superscript, a bold string, special characters (e.g. å, ñ, Å). In paper 21, we do not have any of these cases. At the moment, it is unclear why this string could not be scraped as title.

Furthermore, we have seen that some doctors do not have own papers on PubMed and that PubMed nevertheless scrapes papers for them. One solution to this problem is to later delete those papers where the doctor from the hcp_name column does not show up in the authors column.

For these 23 papers, no title is present but nearly all other variables of interest, except keywords.
The title itself is not hidden in any other variable. We keep the papers for now and later remove papers which were scraped but do not belong to any of the HCPs.

#### Drop rows without authors 

In [16]:
# Drop all rows where authors is NaN (all in all 45 of 4112 rows)
hcp_df_pubmed = hcp_df_pubmed.dropna(subset = ['authors']).reset_index(drop = True)
hcp_df_pubmed.shape

(4104, 13)

In [17]:
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,conclusions,copyrights,journal,keywords,methods,publication_date,pubmed_id,results,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell...",High TILs are associated with higher IBE risk ...,Copyright © 2022 The Author(s). Published by E...,"European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",,2022-03-04,35236568,Most women (61.9%) showed a TILs prevalence of...,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"[('Thomas', 'Hatschek'), ('Theodoros', 'Foukak...",,,JAMA oncology,[],,2021-06-25,34165503,,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"[('Carina', 'Strell'), ('Dick', 'Folkvaljon'),...",,©2021 American Association for Cancer Research.,Clinical cancer research : an official journal...,[],,2021-05-07,33952629,PDGFRb score was predictive for RT benefit wit...,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('...","Our study results, based on propensity-matched...",,Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",,2020-12-02,33258078\n16000569\n26211827\n29242041\n279564...,"After propensity score matching, 4368 patients...",Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"[('Aglaia', 'Schiza'), ('Sandra', 'Irenaeus'),...",,,Scientific reports,[],,2019-12-04,31792256\n30451345\n19097774\n10673991\n175153...,,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4099,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"[('Roland', 'Nilsson'), ('Chatarina', 'Björdal...",,,Journal of clinical nursing,[],The review included a systematic search in thr...,2005-01-27,15669926,"A systematic search of The Cochrane Library, M...",Health risks and occupational exposure to vola...
4100,Mikael Wallander,1,While recent randomised phase III trials show ...,"[('Mikael', 'Wallander'), ('Bo', 'Rolander'), ...",,2020 Journal of Gastrointestinal Oncology. All...,Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",,2020-09-22,32953145\n27522626\n23438360\n31914811\n274124...,,Real world aspects of palliative trifluridine ...
4101,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"[('Johan Staby', 'Olsén'), ('Dalia', 'Estefan'...",,Copyright © 2022 Termedia.,Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",,2022-03-03,35233229\n15465142\n20141674\n27771243\n233183...,,Predicting toxicity caused by high-dose-rate b...
4102,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"[('Ylva', 'Naeser'), ('Hildur', 'Helgadottir')...",,,Cancers,"['X-ray computed', 'follow-up studies', 'melan...",,2022-02-26,35205786\n9440735\n27183845\n8433390\n11832252...,,Quality of Life in the First Year of Follow-Up...


#### Irrelevant variables

Filter for relevant variables. Our variables of interest, i.e., the ones to be retained, are:
* hcp_name
* num_articles
* abstract
* authors
* journal
* keywords
* publication_date 
* title

In [18]:
hcp_df_pubmed = hcp_df_pubmed[['hcp_name', 'num_articles', 'abstract', 'authors', 'journal', 'keywords', 'publication_date', 'title']]
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"[('Thomas', 'Hatschek'), ('Theodoros', 'Foukak...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"[('Carina', 'Strell'), ('Dick', 'Folkvaljon'),...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"[('Aglaia', 'Schiza'), ('Sandra', 'Irenaeus'),...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
4099,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"[('Roland', 'Nilsson'), ('Chatarina', 'Björdal...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
4100,Mikael Wallander,1,While recent randomised phase III trials show ...,"[('Mikael', 'Wallander'), ('Bo', 'Rolander'), ...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
4101,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"[('Johan Staby', 'Olsén'), ('Dalia', 'Estefan'...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
4102,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"[('Ylva', 'Naeser'), ('Hildur', 'Helgadottir')...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


#### Duplicates 

In [19]:
print(hcp_df_pubmed.duplicated().sum())
hcp_df_pubmed[hcp_df_pubmed.duplicated()]

0


Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title


There are no duplicates.

In [21]:
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"[('Thomas', 'Hatschek'), ('Theodoros', 'Foukak...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"[('Carina', 'Strell'), ('Dick', 'Folkvaljon'),...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"[('Aglaia', 'Schiza'), ('Sandra', 'Irenaeus'),...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
4099,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"[('Roland', 'Nilsson'), ('Chatarina', 'Björdal...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
4100,Mikael Wallander,1,While recent randomised phase III trials show ...,"[('Mikael', 'Wallander'), ('Bo', 'Rolander'), ...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
4101,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"[('Johan Staby', 'Olsén'), ('Dalia', 'Estefan'...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
4102,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"[('Ylva', 'Naeser'), ('Hildur', 'Helgadottir')...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


## Cleaning of relevant data

This part performs the cleaning of the relevant variables.
The cleaning is supposed to take place in the following order:
* 1. hcp_name
* 2. authors
* 3. abstract
* 4. title
* 5. journal
* 6. keywords
* 7. publication_date
* 8. num_articles
* 9. scraped_from
* 10. save data frame

### `hcp_name`

First, we explore the names in the column `hcp_name`.

In [22]:
hcp_df_pubmed['hcp_name'].unique().tolist()

['Aglaia Schiza',
 'Agneta Danfors',
 'Ahmed Kareem',
 'Alaa Haidar',
 'Ana Campos',
 'Andreas Nearchou',
 'Ulrika Bergqvist',
 'Ann Dreifaldt',
 'Marie Santonsson',
 'Anna Nordenskjöld',
 'Anna Väppling',
 'Anna Tzikas',
 'Anna Wennstig',
 'Antonios Valachis',
 'Elisabet Karlsson',
 'Barbro Linderholm',
 'Kristina Lindblom',
 'Maria Sandström',
 'Cecilia Graffman',
 'Cecilia Nilsson',
 'Chaido Chamalidou',
 'Charlotte Bratthäll',
 'Christina Stragliotto',
 'Claudia Lundgren',
 'Dan Lundstedt',
 'Julia Hallerfelt',
 'Yvonne Wengström',
 'Elisabet Lidbrink',
 'Elzbieta Dziedzic',
 'Eva Trampe',
 'Evangelos Digkas',
 'Fredrika Killander',
 'Gabriel Jonsson',
 'Gilberto Morgan',
 'Git Martenhed',
 'Greger Nilsson',
 'Gunnar Lengstrand',
 'Helena Björneklett',
 'Henrik Lindman',
 'Jamila Adra',
 'Jan Frisell',
 'Jenny Bergqvist',
 'Johan Ahlgren',
 'Johan Hartman',
 'Jonas Bergh',
 'Jonas Holm',
 'Jörn Schneede',
 'Judith Bjöhle',
 'Karolina Larsson',
 'Anna Lind',
 'Kenneth Villman',
 'Ki

The names in the column `hcp_name` do not follow a regular format: some names feature only one first name and once last name, e.g., Aglaia Schiza, while other names feature two first names and one last name, e.g., Ahmed Abbas Albu-Kareem, one first name and two last names, e.g., Ana Bosch Campos. Names connected by a hyphen are considered to be one name, e.g., Albu-Kareem is considered as one surname.

In the following, our convention for names is to only keep the first letter of the first name and the last name. The helper function `truncate_names()` does this for us.

In [23]:
hcp_df_pubmed = truncate_names(hcp_df_pubmed, 'hcp_name')
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"[('Thomas', 'Hatschek'), ('Theodoros', 'Foukak...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"[('Carina', 'Strell'), ('Dick', 'Folkvaljon'),...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"[('Aglaia', 'Schiza'), ('Sandra', 'Irenaeus'),...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
4099,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"[('Roland', 'Nilsson'), ('Chatarina', 'Björdal...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
4100,Mikael Wallander,1,While recent randomised phase III trials show ...,"[('Mikael', 'Wallander'), ('Bo', 'Rolander'), ...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
4101,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"[('Johan Staby', 'Olsén'), ('Dalia', 'Estefan'...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
4102,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"[('Ylva', 'Naeser'), ('Hildur', 'Helgadottir')...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


### `authors`

First of all, explore the names and clean them.

In [24]:
print_index_and_value(hcp_df_pubmed, 'authors')

0
[('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell'), ('Axel', 'Stenmark Tullberg'), ('Helena', 'Olofsson'), ('Amanda', 'Lindberg'), ('Erik', 'Holmberg'), ('Troy', 'Bremer'), ('Patrick', 'Micke'), ('Per', 'Karlsson'), ('Fredrik', 'Wärnberg'), ('Carina', 'Strell')]
1
[('Thomas', 'Hatschek'), ('Theodoros', 'Foukakis'), ('Judith', 'Bjöhle'), ('Tobias', 'Lekberg'), ('Hanna', 'Fredholm'), ('Ellinor', 'Elinder'), ('Ana', 'Bosch'), ('Gyula', 'Pekar'), ('Henrik', 'Lindman'), ('Aglaia', 'Schiza'), ('Zakaria', 'Einbeigi'), ('Jamila', 'Adra'), ('Anne', 'Andersson'), ('Lena', 'Carlsson'), ('Ann Charlotte', 'Dreifaldt'), ('Erika', 'Isaksson-Friman'), ('Susanne', 'Agartz'), ('Edward', 'Azavedo'), ('Per', 'Grybäck'), ('Mats', 'Hellström'), ('Hemming', 'Johansson'), ('Claudia', 'Maes'), ('Ioannis', 'Zerdes'), ('Johan', 'Hartman'), ('Yvonne', 'Brandberg'), ('Jonas', 'Bergh')]
2
[('Carina', 'Strell'), ('Dick', 'Folkvaljon'), ('Erik', 'Holmberg'), ('Aglaia', 'Schiza'), ('Viktoria', 'Thurfjell'), ('Per', 'K

1018
[('Jenny', 'Bergqvist'), ('Staffan', 'Lundström'), ('Yvonne', 'Wengström')]
1019
[('Jenny', 'Bergqvist'), ('Gunnar', 'Ljunggren')]
1020
[('Jenny', 'Bergqvist'), ('Peter', 'Strang')]
1021
[('Jenny', 'Bergqvist'), ('Hanna', 'Iderberg'), ('Johan', 'Mesterton'), ('Roger', 'Henriksson')]
1022
[('Maria', 'Helde-Frankling'), ('Jenny', 'Bergqvist'), ('Caritha', 'Klasson'), ('Marie', 'Nordström'), ('Jonas', 'Höijer'), ('Peter', 'Bergman'), ('Linda', 'Björkhem-Bergman')]
1023
[('Maria', 'Helde-Frankling'), ('Jonas', 'Höijer'), ('Jenny', 'Bergqvist'), ('Linda', 'Björkhem-Bergman')]
1024
[('Peter', 'Strang'), ('Jenny', 'Bergqvist')]
1025
[('Jenny', 'Bergqvist'), ('Peter', 'Strang')]
1026
[('Jenny', 'Bergqvist'), ('Hanna', 'Iderberg'), ('Johan', 'Mesterton'), ('Nils', 'Bengtsson'), ('Björn', 'Wettermark'), ('Roger', 'Henriksson')]
1027
[('Maria', 'Helde-Frankling'), ('Jenny', 'Bergqvist'), ('Peter', 'Bergman'), ('Linda', 'Björkhem-Bergman')]
1028
[('Peter', 'Bergman'), ('Susanne', 'Sperneder')

[('Niclas', 'Olsson'), ('Petter', 'Carlsson'), ('Peter', 'James'), ('Karin', 'Hansson'), ('Sofia', 'Waldemarson'), ('Per', 'Malmström'), ('Mårten', 'Fernö'), ('Lisa', 'Ryden'), ('Christer', 'Wingren'), ('Carl A K', 'Borrebaeck')]
2162
[('Carina', 'Strand'), ('Martin', 'Bak'), ('Signe', 'Borgquist'), ('Gunilla', 'Chebil'), ('Anna-Karin', 'Falck'), ('Marie-Louise', 'Fjällskog'), ('Dorthe', 'Grabau'), ('Ingrid', 'Hedenfalk'), ('Karin', 'Jirström'), ('Marie', 'Klintman'), ('Per', 'Malmström'), ('Hans', 'Olsson'), ('Lisa', 'Rydén'), ('Olle', 'Stål'), ('Pär-Ola', 'Bendahl'), ('Mårten', 'Fernö')]
2163
[('Stefan', 'Broselid'), ('Benxu', 'Cheng'), ('Martin', 'Sjöström'), ('Kristina', 'Lövgren'), ('Heather L P', 'Klug-De Santiago'), ('Mattias', 'Belting'), ('Karin', 'Jirström'), ('Per', 'Malmström'), ('Björn', 'Olde'), ('Pär-Ola', 'Bendahl'), ('Linda', 'Hartman'), ('Mårten', 'Fernö'), ('L M Fredrik', 'Leeb-Lundberg')]
2164
[('Claudia', 'Allemani'), ('Milena', 'Sant'), ('Hannah K', 'Weir'), ('Lis

3161
[('Ankur', 'Pandita'), ('Matias', 'Ekstrand'), ('Sara', 'Bjursten'), ('Zhiyuan', 'Zhao'), ('Per', 'Fogelstrand'), ('Kristell', 'Le Gal'), ('Lars', 'Ny'), ('Martin O', 'Bergo'), ('Joakim', 'Karlsson'), ('Jonas A', 'Nilsson'), ('Levent M', 'Akyürek'), ('Malin C', 'Levin'), ('Jan', 'Borén'), ('Andrew J', 'Ewald'), ('Keith E', 'Mostov'), ('Max', 'Levin')]
3162
[('Sara', 'Bjursten'), ('Ankur', 'Pandita'), ('Zhiyuan', 'Zhao'), ('Charlotta', 'Fröjd'), ('Lars', 'Ny'), ('Christer', 'Jensen'), ('Tobias', 'Ullerstam'), ('Henrik', 'Jespersen'), ('Jan', 'Borén'), ('Malin', 'Levin'), ('Henrik', 'Zetterberg'), ('Anna', 'Rudin'), ('Max', 'Levin')]
3163
[('Sara', 'Bjursten'), ('Christoffer', 'Vannas'), ('Stefan', 'Filges'), ('Florian', 'Puls'), ('Ankur', 'Pandita'), ('Henrik', 'Fagman'), ('Anders', 'Ståhlberg'), ('Max', 'Levin')]
3164
[('Anna', 'Arheden'), ('Joanna', 'Skalenius'), ('Sara', 'Bjursten'), ('Ulrika', 'Stierner'), ('Lars', 'Ny'), ('Max', 'Levin'), ('Henrik', 'Jespersen')]
3165
[('Henri

[('Maria', 'Feldt'), ('Olöf', 'Bjarnadottir'), ('Siker', 'Kimbung'), ('Karin', 'Jirström'), ('Pär-Ola', 'Bendahl'), ('Srinivas', 'Veerla'), ('Dorthe', 'Grabau'), ('Ingrid', 'Hedenfalk'), ('Signe', 'Borgquist')]
3960
[('Olöf', 'Bjarnadottir'), ('Siker', 'Kimbung'), ('Ida', 'Johansson'), ('Srinivas', 'Veerla'), ('Mats', 'Jönsson'), ('Pär-Ola', 'Bendahl'), ('Dorthe', 'Grabau'), ('Ingrid', 'Hedenfalk'), ('Signe', 'Borgquist')]
3961
[('Olöf', 'Bjarnadottir'), ('Quinci', 'Romero'), ('Pär-Ola', 'Bendahl'), ('Karin', 'Jirström'), ('Lisa', 'Rydén'), ('Niklas', 'Loman'), ('Mathias', 'Uhlén'), ('Henrik', 'Johannesson'), ('Carsten', 'Rose'), ('Dorthe', 'Grabau'), ('Signe', 'Borgquist')]
3962
[('Fernanda', 'Costa Svedman'), ('Marie', 'Jalsenius'), ('Veronica', 'Höiom'), ('Vitali', 'Grozman'), ('Mattias', 'Bergqvist'), ('Fabian', 'Söderdahl'), ('Hanna', 'Eriksson'), ('Samuel', 'Rotstein'), ('Lars', 'Ny'), ('Paolo A', 'Ascierto'), ('Suzanne Egyhazi', 'Brage'), ('Hildur', 'Helgadottir')]
3963
[('Chika

Most author strings follow a regular structure: `[('first_name_1', 'last_name_1'), ... , ('first_name_n', 'last_name_n')]`. 
However, there are some exceptions to this:
1. For some papers, some authors' first and/or last names are unknown so that their missing name is represented by `None`, where `None` is not in single quotes. 

Example: Paper with index 3: `[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]`

In [26]:
# Look at above-mentioned example
hcp_df_pubmed['authors'][3]

"[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]"

We take a look at the papers where `None` shows up at least once.

In [27]:
# Take look at all papers that follow this pattern
for i in range(len(hcp_df_pubmed)):
        if 'None' in hcp_df_pubmed['authors'][i]:
            print(i)
            print(hcp_df_pubmed['authors'][i])

3
[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]
17
[(None, None), (None, None)]
20
[(None, None), (None, None)]
30
[('Albaro J', 'Nieto-Calvache'), ('Jose M', 'Palacios-Jaraquemada'), ('Gabriel', 'Osanan'), ('Rafael', 'Cortes-Charry'), ('Rozi A', 'Aryananda'), ('Vidyadhar B', 'Bangal'), ('Aziz', 'Slaoui'), ('Ahmed M', 'Abbas'), ('Godwin O', 'Akaba'), ('Zaman N', 'Joshua'), ('Lina M', 'Vergara Galliadi'), ('Alejandro S', 'Nieto-Calvache'), ('José E', 'Sanín-Blair'), ('Juan M', 'Burgos-Luna'), (None, None)]
32
[(None, None)]
34
[(None, None), (None, None)]
72
[('Claire', 'André'), ('Stéphane', 'Rehel'), ('Elizabeth', 'Kuhn'), ('Brigitte', 'Landeau'), ('Inès', 'Moulinet'), ('Edelweiss', 'Touron'), ('Valentin', 'Ourry'), ('Gwendoline', 'Le Du'), ('Florence', 'Mézenge'), ('Clémence', 'Tomadesso'), ('Robin', 'de Flores'), ('Alexandre', 'Bejanin'), ('Siya', 'Sherif'), ('Nicolas', 'Delcroix'), ('Alain', 'Manrique')

We take a look at the papers where `None` shows up for both first name and last name.

In [28]:
# Take look at all papers that follow this pattern
for i in range(len(hcp_df_pubmed)):
    if '(None, None)' in hcp_df_pubmed['authors'][i]:
            print(i)
            print(hcp_df_pubmed['authors'][i])

17
[(None, None), (None, None)]
20
[(None, None), (None, None)]
30
[('Albaro J', 'Nieto-Calvache'), ('Jose M', 'Palacios-Jaraquemada'), ('Gabriel', 'Osanan'), ('Rafael', 'Cortes-Charry'), ('Rozi A', 'Aryananda'), ('Vidyadhar B', 'Bangal'), ('Aziz', 'Slaoui'), ('Ahmed M', 'Abbas'), ('Godwin O', 'Akaba'), ('Zaman N', 'Joshua'), ('Lina M', 'Vergara Galliadi'), ('Alejandro S', 'Nieto-Calvache'), ('José E', 'Sanín-Blair'), ('Juan M', 'Burgos-Luna'), (None, None)]
32
[(None, None)]
34
[(None, None), (None, None)]
72
[('Claire', 'André'), ('Stéphane', 'Rehel'), ('Elizabeth', 'Kuhn'), ('Brigitte', 'Landeau'), ('Inès', 'Moulinet'), ('Edelweiss', 'Touron'), ('Valentin', 'Ourry'), ('Gwendoline', 'Le Du'), ('Florence', 'Mézenge'), ('Clémence', 'Tomadesso'), ('Robin', 'de Flores'), ('Alexandre', 'Bejanin'), ('Siya', 'Sherif'), ('Nicolas', 'Delcroix'), ('Alain', 'Manrique'), ('Ahmed', 'Abbas'), ('Natalie L', 'Marchant'), ('Antoine', 'Lutz'), ('Olga M', 'Klimecki'), ('Fabienne', 'Collette'), ('Eider 

We take a look at the papers where `None` shows up only for first name but not for last name.

In [29]:
# Take look at all papers that follow this pattern
for i in range(len(hcp_df_pubmed)):
    if '(None' in hcp_df_pubmed['authors'][i]:
            if 'None)' not in hcp_df_pubmed['authors'][i]:
                print(i)
                print(hcp_df_pubmed['authors'][i])

3
[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]
264
[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]
279
[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]
2706
[('Aglaia', 'Schiza'), ('Davide', 'Mauri'), ('Irma', 'Fredriksson'), (None, 'Anna-Karin Wennstig'), ('Antonios', 'Valachis')]
2935
[('Margarita', 'Maurer-Granofszky'), ('Angela', 'Schumich'), ('Barbara', 'Buldini'), ('Giuseppe', 'Gaipa'), ('Janos', 'Kappelmayer'), ('Ester', 'Mejstrikova'), ('Leonid', 'Karawajew'), ('Jorge', 'Rossi'), ('Adın Çınar', 'Suzan'), ('Evangelina', 'Agriello'), ('Theodora', 'Anastasiou-Grenzelia'), ('Virna', 'Barcala'), ('Gábor', 'Barna'), ('Drago', 'Batinić'), ('Jean-Pierre', 'Bourquin'), ('Monika', 'Brüggemann'), ('Karolina', 'Bukowska-Strakova'), ('Hasan', 'Burnusuzov'), (

We take a look at the papers where `None` shows up only for last name but not for first name. We see that this never happens.

In [30]:
# Take look at all papers that follow this pattern
for i in range(len(hcp_df_pubmed)):
        if 'None)' in hcp_df_pubmed['authors'][i]:
            if '(None' not in hcp_df_pubmed['authors'][i]:
                print(i)
                print(hcp_df_pubmed['authors'][i])

2. For some papers, the names of the authors have a special feature. Either the first name and/or the last name contains an apostrophe. The respective name is not in single quotes but in double quotes. 

Example: Paper with index 165: `[('Sofia', "Dall'Orso"), ('Tomoki', 'Arichi'), ('Sean P', 'Fitzgibbon'), ('A David', 'Edwards'), ('Etienne', 'Burdet'), ('Silvia', 'Muceli')]`

In [31]:
# Look at above-mentioned example
hcp_df_pubmed['authors'][165]

"[('Francesca', 'Prati'), ('Sarina J', 'Schaefer'), ('Miles', 'Hewstone'), ('Oliver', 'Christ')]"

We take a look at all the papers where at least one author name contains an apostrophe.

In [32]:
# Take look at all papers that follow this pattern
for i in range(len(hcp_df_pubmed)):
        if '"' in hcp_df_pubmed['authors'][i]:
            print(i)
            print(hcp_df_pubmed['authors'][i])

150
[('Alberto', 'Granzotto'), ('Marco', "d'Aurora"), ('Manuela', 'Bomba'), ('Valentina', 'Gatta'), ('Marco', 'Onofrj'), ('Stefano L', 'Sensi')]
162
[('Marie-Paule Bernadette', "N'Cho-Mottoh"), ('Olivier', 'Huttin'), ('Christine', 'Selton-Suty'), ('Soukaina', 'Scadi'), ('Laura', 'Filippetti'), ('Pierre-Yves', 'Marie')]
225
[('A J', 'Campbell'), ('R', 'Dotel'), ('M', 'Braddick'), ('P N', 'Britton'), ('D P', 'Eisen'), ('J R', 'Francis'), ('S', 'Lynar'), ('B', 'McMullan'), ('N', 'Meagher'), ('J', 'Nelson'), ('M V N', "O'Sullivan"), ('D J', 'Price'), ('J O', 'Robinson'), ('A', 'Whelan'), ('S Y C', 'Tong'), ('A C', 'Bowen'), ('J S', 'Davis')]
298
[('Catherine', 'Weadick'), ('Karolina', 'Larsson'), ('Seamus', "O'Reilly"), ('Eileen', 'McMahon'), ('Deirdre', "O'Mahony"), ('Barbro K', 'Linderholm')]
411
[('Marco', 'Colleoni'), ('Weixiu', 'Luo'), ('Per', 'Karlsson'), ('Jacquie', 'Chirgwin'), ('Stefan', 'Aebi'), ('Guy', 'Jerusalem'), ('Patrick', 'Neven'), ('Erika', 'Hitre'), ('Marie-Pascale', 'Gr

For each paper, we would like to obtain a string where authors are comma-separated, i.e. 'author_1, author_2, ..., author_n'.
A thorough inspection of the above authors outputs yields the conclusion that the following steps are necessary - !in exactly this order! - to achieve this form: 

- replace `[` by empty string
- replace `]` by empty string
- replace `('` by empty string
- replace `')` by empty string
- replace `', '` by one blank space
- replace `(\'` by empty string
- replace `\')` by empty string
- replace `("` by empty string
- replace `")` by empty string
- replace `\', \'` by one blank space
- replace `\', "` by one blank space
- replace `", \'` by one blank space
- replace `\'` by empty string

- remove `None` strings and the corresponding name afterwards

The helper function `clean_authors()` does this for us. Now the names are cleaned, i.e., all apostrophes, brackets etc. are removed and `None` strings are deleted.

In [33]:
hcp_df_pubmed = clean_authors(hcp_df_pubmed, pubmed=True) 
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"Aglaia Schiza, Viktoria Thurfjell, Axel Stenma...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"Aglaia Schiza, Sandra Irenaeus, Francisco Orti...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
4085,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
4086,Mikael Wallander,1,While recent randomised phase III trials show ...,"Mikael Wallander, Bo Rolander, Elisabeth Åvall...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
4087,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"Johan Staby Olsén, Dalia Estefan, Antonios Val...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
4088,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


In [34]:
# Checks for None
for entry in hcp_df_pubmed[hcp_df_pubmed['authors'].str.contains("None", na = False)]['authors']:
    print(entry) # no papers left

In [35]:
# Checks for '
for entry in hcp_df_pubmed[hcp_df_pubmed['authors'].str.contains("'", na = False)]['authors']:
    print(entry) # no papers left

In order for the author names to follow our convention, we now need to truncate the names: keep only first letter of first name and last name. Again, the helper function `truncate_names()` does this for us. 

In [36]:
hcp_df_pubmed = truncate_names(hcp_df_pubmed, 'authors')
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
4085,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
4086,Mikael Wallander,1,While recent randomised phase III trials show ...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
4087,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
4088,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


### Filtering of papers by correspondence of hcp_name and authors

As we saw above, sometimes papers are scraped for certain HCPs even if there are no papers of them on PubMed at all. 

Example: For Ahmed Abbas Albu-Kareem (A Albu-Kareem), there are no papers on PubMed. Nevertheless, PyMed scrapes 100 papers, the maximum amount of papers, for him.

Therefore, we need to remove those papers where the name from `hcp_name` does not show up in the names from `authors`.

The helper function `remove_wrongly_scraped_papers()` does this for us.

In [37]:
hcp_df_pubmed = remove_wrongly_scraped_papers(hcp_df_pubmed)
hcp_df_pubmed.shape

(2664, 8)

In [38]:
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,The immune microenvironment is an important mo...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,Trastuzumab emtansine (T-DM1) is presently app...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,This study analyzes the potential of stromal p...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,There are conflicting results on the potential...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,The purpose was to evaluate the potential of d...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,To provide practicing nurse anaesthetists with...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
2660,Mikael Wallander,1,While recent randomised phase III trials show ...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
2661,Frida Jakobsson,3,Treating localized prostate cancer (PC) with c...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
2662,Frida Jakobsson,3,The benefit of imaging in the follow-up settin...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


### `abstract`

The abstracts of the individual papers contain a lot of stopwords, i.e., words which are necessary to build a sentence but which do not give much meaning to a sentence, e.g., 'is', 'has', 'on', 'to' etc. 

The following example shows this.

In [40]:
hcp_df_pubmed['abstract'][0]

'The immune microenvironment is an important modulator of tumour progression and treatment response. In invasive breast cancer, assessment of tumour-infiltrating lymphocytes (TILs) provides prognostic and predictive information. However, the clinical impact of TILs for ductal carcinoma in situ (DCIS) has not yet been demonstrated.\nPost hoc analysis of the SweDCIS randomised radiotherapy trial including primary DCIS cases following breast-conserving surgery. TILs were assessed on haematoxylin-eosin sections (n\xa0=\xa0711) according to the International Immuno-Oncology Biomarker Working Group guidelines. TILs-scores were analysed as continuous and dichotomised (≤5% versus >5%) variable regarding ipsilateral breast events (IBEs) as the predefined primary endpoint.\nMost women (61.9%) showed a TILs prevalence of ≤5%. High TILs-scores were associated with larger lesion size, human epidermal growth factor receptor 2 (HER2)-positivity, higher nuclear grade, and KI67-score. DCIS cases with h

We would like to remove these stopwords. In addition, we would like to replace special characters and convert the entire abstracts to lower case.

The helper function `clean_abstract()` does this for us.

In [39]:
hcp_df_pubmed = clean_abstract(hcp_df_pubmed)
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,Tumour-infiltrating lymphocytes add prognostic...
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",JAMA oncology,[],2021-06-25,"Neoadjuvant Trastuzumab, Pertuzumab, and Docet..."
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",Clinical cancer research : an official journal...,[],2021-05-07,High PDGFRb Expression Predicts Resistance to ...
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,Predictive role of HER2-status on the effectiv...
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",Scientific reports,[],2019-12-04,Evaluation of Diffusion-Weighted MRI and FDG-P...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",Journal of clinical nursing,[],2005-01-27,Health risks and occupational exposure to vola...
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,Real world aspects of palliative trifluridine ...
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,Predicting toxicity caused by high-dose-rate b...
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,Quality of Life in the First Year of Follow-Up...


We check the abstract again.

In [40]:
hcp_df_pubmed['abstract'][0]

'the immune microenvironment important modulator tumour progression treatment response. in invasive breast cancer, assessment tumour-infiltrating lymphocytes (tils) provides prognostic predictive information. however, clinical impact tils ductal carcinoma situ (dcis) demonstrated. post hoc analysis swedcis randomised radiotherapy trial including primary dcis cases following breast-conserving surgery. tils assessed haematoxylin-eosin sections (n = 711) according international immuno-oncology biomarker working group guidelines. tils-scores analysed continuous dichotomised (≤5% versus >5%) variable ipsilateral breast events (ibes) predefined primary endpoint. most women (61.9%) showed tils prevalence ≤5%. high tils-scores associated larger lesion size, human epidermal growth factor receptor 2 (her2)-positivity, higher nuclear grade, ki67-score. dcis cases high tils prevalence significant increased cumulative ibe incidence years post-surgery (tils high tils associated higher ibe risk 5-yea

### `title`

Let us look at the different titles.

In [41]:
print_index_and_value(hcp_df_pubmed, 'title')

0
Tumour-infiltrating lymphocytes add prognostic information for patients with low-risk DCIS: findings from the SweDCIS randomised radiotherapy trial.
1
Neoadjuvant Trastuzumab, Pertuzumab, and Docetaxel vs Trastuzumab Emtansine in Patients With ERBB2-Positive Breast Cancer: A Phase 2 Randomized Clinical Trial.
2
High PDGFRb Expression Predicts Resistance to Radiotherapy in DCIS within the SweDCIS Randomized Trial.
3
Predictive role of HER2-status on the effectiveness of endocrine adjuvant treatment in postmenopausal breast cancer patients: a population-based cohort study.
4
Evaluation of Diffusion-Weighted MRI and FDG-PET/CT to Assess Response to AdCD40L treatment in Metastatic Melanoma Patients.
5
Local irradiation does not enhance the effect of immunostimulatory AdCD40L gene therapy combined with low dose cyclophosphamide in melanoma patients.
6
Quality of Life in the First Year of Follow-Up in a Randomized Multicenter Trial Assessing the Role of Imaging after Radical Surgery of Sta

Adjuvant chemotherapy with fluorouracil plus folinic acid vs gemcitabine following pancreatic cancer resection: a randomized controlled trial.
1094
Randomized phase II study of sequential docetaxel and irinotecan with 5-fluorouracil/folinic acid (leucovorin) in patients with advanced gastric cancer: the GATAC trial.
1095
Multicentre phase I-II trial of capecitabine and oxaliplatin in combination with radiotherapy for unresectable pancreatic and biliary tract cancer: The CORGI-U study.
1096
Interobserver variation of clinical target volume delineation in gastric cancer.
1097
[Adjuvant therapy of ventricular cancer tested in Sweden].
1098
Efficacy of preoperative radiochemotherapy in patients with locally advanced pancreatic carcinoma.
1099
Efficacy of pegylated liposomal doxorubicin in patients with advanced hepatocellular carcinoma.
1100
Quantitative assessment of lung density changes after 3-D radiotherapy for breast cancer.
1101
Advances in the treatment of patients with gastric aden

2093
The clinical use of N-terminal-pro brain natriuretic peptide in elderly patients with mental illness.
2094
Plasma homocysteine--a marker of vascular disease in elderly patients with mental illness.
2095
Complete compensation in Daphnia fecundity and stage-specific biomass in response to size-independent mortality.
2096
Plasma homocysteine, apolipoprotein E status and vascular disease in elderly patients with mental illness.
2097
Invasion success depends on invader body size in a size-structured mixed predation-competition community.
2098
A Swedish family with de novo alpha-synuclein A53T mutation: evidence for early cortical dysfunction.
2099
Association between plasma homocysteine levels and mortality in elderly patients with mental illness.
2100
Cis-acting resistance peptides reveal dual ribosome inhibitory action of the macrolide josamycin.
2101
Drug efflux pump deficiency and drug target resistance masking in growing bacteria.
2102
Reduced CSF CART in dementia with Lewy bodies

We want to remove punctuation and stopwords from the titles and convert them to lower case.

The helper function `clean_titles()` does this for us.

In [42]:
hcp_df_pubmed = clean_titles(hcp_df_pubmed, 'title', gs = False)
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","European journal of cancer (Oxford, England : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,tumourinfiltrating lymphocytes add prognostic ...
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",JAMA oncology,[],2021-06-25,neoadjuvant trastuzumab pertuzumab docetaxel v...
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",Clinical cancer research : an official journal...,[],2021-05-07,high pdgfrb expression predicts resistance rad...
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",Breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,predictive role her2status effectiveness endoc...
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",Scientific reports,[],2019-12-04,evaluation diffusionweighted mri fdgpetct asse...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",Journal of clinical nursing,[],2005-01-27,health risks occupational exposure volatile an...
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",Journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,real world aspects palliative trifluridine plu...
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",Journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,predicting toxicity caused highdoserate brachy...
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",Cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,quality life first year followup randomized mu...


We check the titles again.

In [43]:
print_index_and_value(hcp_df_pubmed, 'title')

0
tumourinfiltrating lymphocytes add prognostic information patients lowrisk dcis findings swedcis randomised radiotherapy trial
1
neoadjuvant trastuzumab pertuzumab docetaxel vs trastuzumab emtansine patients with erbb2positive breast cancer a phase 2 randomized clinical trial
2
high pdgfrb expression predicts resistance radiotherapy dcis swedcis randomized trial
3
predictive role her2status effectiveness endocrine adjuvant treatment postmenopausal breast cancer patients populationbased cohort study
4
evaluation diffusionweighted mri fdgpetct assess response adcd40l treatment metastatic melanoma patients
5
local irradiation enhance effect immunostimulatory adcd40l gene therapy combined low dose cyclophosphamide melanoma patients
6
quality life first year followup randomized multicenter trial assessing role imaging radical surgery stage iibc iii cutaneous melanoma trim study
7
transcriptional regulatory networks gene ontology expression data
8
transcriptional regulatory network discove

ultrahypofractionated versus conventionally fractionated radiotherapy prostate cancer 5year outcomes hyportpc randomised noninferiority phase 3 trial
867
radiotherapy equipment departments european countries final results estrohero survey
868
clinical outcome following radiotherapy planned neck dissection n head neck cancer patients
869
the quality assurance process artscan head neck study  practical interactive approach qa 3dcrt imrt
870
prospective randomised multicenter trial single fraction radiotherapy 8 gy x 1 versus multiple fractions 3 gy x 10 treatment painful bone metastases
871
significance rna reference tumourrelated gene expression analyses cdna array
872
average duration prior treatment lines predicts clinical benefit eribulin chemotherapy patients metastatic breast cancer
873
tumor coexpression progranulin sortilin prognostic biomarker breast cancer
874
survival breast cancer women brca2 mutation treatment
875
tumourinfiltrating lymphocytes prognostic tamoxifen predictiv

arntdependent hif2 transcriptional activity sufficient regulate downstream target genes neuroblastoma
2172
maintaining multipotent trunk neural crest stem cells selfrenewing crestospheres
2173
promoterassociated proteins epas1 identified enchipms  a putative role hdx negative regulator
2174
combined bet bromodomain cdk2 inhibition mycdriven medulloblastoma
2175
neuroblastoma patientderived xenograft cells cultured stemcell promoting medium retain tumorigenic metastatic capacities differentiate serum
2176
hif2α contributes antiestrogen resistance positive bilateral crosstalk egfr breast cancer cells
2177
the retinoblastoma gene undergoes rearrangements brca1deficient basallike breast cancer
2178
a synthetic polyphosphoinositide headgroup surrogate complex ship2 provides rationale drug discovery
2179
comparative structural analysis lipid binding start domains
2180
structural basis tumor suppressor lung cancer 1 tslc1 binding differentially expressed adenocarcinoma lung dal141b
2181
the c

### `journal`

In [44]:
print_index_and_value(hcp_df_pubmed, 'journal')

0
European journal of cancer (Oxford, England : 1990)
1
JAMA oncology
2
Clinical cancer research : an official journal of the American Association for Cancer Research
3
Breast cancer research and treatment
4
Scientific reports
5
Oncotarget
6
Cancers
7
In silico biology
8
Algorithms for molecular biology : AMB
9
Breast cancer research : BCR
10
Breast cancer research : BCR
11
Science translational medicine
12
Oncoimmunology
13
Breast cancer research and treatment
14
Vector borne and zoonotic diseases (Larchmont, N.Y.)
15
Vector borne and zoonotic diseases (Larchmont, N.Y.)
16
Annals of translational medicine
17
Clinical genitourinary cancer
18
Breast cancer research and treatment
19
International journal of cancer
20
Acta oncologica (Stockholm, Sweden)
21
Acta oncologica (Stockholm, Sweden)
22
Breast cancer research and treatment
23
Journal of clinical oncology : official journal of the American Society of Clinical Oncology
24
Health care for women international
25
Cancer treatment revie

BMJ open
1005
JCO precision oncology
1006
British journal of cancer
1007
Clinical cancer research : an official journal of the American Association for Cancer Research
1008
BMJ open
1009
Genome medicine
1010
Medical oncology (Northwood, London, England)
1011
Breast cancer research and treatment
1012
European urology
1013
Cancers
1014
Supportive care in cancer : official journal of the Multinational Association of Supportive Care in Cancer
1015
Thrombosis research
1016
The pharmacogenomics journal
1017
F1000Research
1018
Nature reviews. Clinical oncology
1019
Molecular oncology
1020
Breast (Edinburgh, Scotland)
1021
European journal of cancer (Oxford, England : 1990)
1022
ESMO open
1023
Lakartidningen
1024
ESMO open
1025
European journal of cancer (Oxford, England : 1990)
1026
Chinese clinical oncology
1027
Acta oncologica (Stockholm, Sweden)
1028
The oncologist
1029
Supportive care in cancer : official journal of the Multinational Association of Supportive Care in Cancer
1030
The oncol

2308
Acta oto-laryngologica
2309
Acta oto-laryngologica
2310
Addiction (Abingdon, England)
2311
JAMA oncology
2312
Case reports in oncology
2313
Lakartidningen
2314
Cancer epidemiology, biomarkers & prevention : a publication of the American Association for Cancer Research, cosponsored by the American Society of Preventive Oncology
2315
Lakartidningen
2316
Cancers
2317
Melanoma research
2318
Acta oncologica (Stockholm, Sweden)
2319
Nature communications
2320
The American journal of pathology
2321
Journal for immunotherapy of cancer
2322
Melanoma research
2323
Cancers
2324
Acta oncologica (Stockholm, Sweden)
2325
Case reports in oncology
2326
Cancers
2327
Journal of clinical oncology : official journal of the American Society of Clinical Oncology
2328
Case reports in oncology
2329
Nature communications
2330
International immunopharmacology
2331
Cell death & disease
2332
World journal of surgical oncology
2333
Acta oncologica (Stockholm, Sweden)
2334
Annals of surgical oncology
2335
BMC 

In order for journals to have the same format, we want to convert the journals to lower case.

In [45]:
# lower case
hcp_df_pubmed['journal'] = hcp_df_pubmed["journal"].str.lower()
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","european journal of cancer (oxford, england : ...","['Ductal carcinoma in situ', 'Radiotherapy', '...",2022-03-04,tumourinfiltrating lymphocytes add prognostic ...
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",jama oncology,[],2021-06-25,neoadjuvant trastuzumab pertuzumab docetaxel v...
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",clinical cancer research : an official journal...,[],2021-05-07,high pdgfrb expression predicts resistance rad...
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",breast cancer research and treatment,"['Adjuvant', 'Breast cancer', 'Endocrine treat...",2020-12-02,predictive role her2status effectiveness endoc...
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",scientific reports,[],2019-12-04,evaluation diffusionweighted mri fdgpetct asse...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",journal of clinical nursing,[],2005-01-27,health risks occupational exposure volatile an...
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",journal of gastrointestinal oncology,"['Trifluridine and tiperacil (TAS-102)', 'chem...",2020-09-22,real world aspects palliative trifluridine plu...
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",journal of contemporary brachytherapy,"['HDR', 'boost', 'brachytherapy', 'hypo-fracti...",2022-03-03,predicting toxicity caused highdoserate brachy...
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",cancers,"['X-ray computed', 'follow-up studies', 'melan...",2022-02-26,quality life first year followup randomized mu...


### `keywords`

Let us take a look at the different keywords.

In [46]:
print_index_and_value(hcp_df_pubmed, 'keywords')

0
['Ductal carcinoma in situ', 'Radiotherapy', 'Ttumour infiltrating lymphocytes']
1
[]
2
[]
3
['Adjuvant', 'Breast cancer', 'Endocrine treatment', 'HER2-status', 'Hormone receptor-positive', 'Postmenopausal']
4
[]
5
['AdCD40L', 'gene therapy', 'immunotherapy', 'irradiation', 'malignant melanoma']
6
['X-ray computed', 'follow-up studies', 'melanoma', 'positron emission tomography computed tomography', 'prospective studies', 'quality of life', 'randomized controlled trial', 'tomography']
7
[]
8
[]
9
[]
10
['Age at diagnosis', 'Gene expression', 'Mutational signatures', 'Mutations', 'PD-L1', 'Patient outcome', 'TILs', 'Triple-negative breast cancer']
11
[]
12
['Checkpoint inhibitor', 'influenza vaccination', 'progression-free survival', 'side effects', 'solid cancer']
13
['Breast cancer', 'Immunogenicity', 'Influenza', 'Trastuzumab', 'Vaccination']
14
['Drama R.U', 'Greece', 'field studies', 'mosquito species']
15
['Aedes albopictus', 'Culex tritaeniorhynchus', 'Greece', 'entomological s

[]
734
[]
735
['biomarker', 'breast cancer', 'initiation', 'microenvironment', 'obesity', 'overweight', 'progression']
736
['attitude of health personnel', 'eHealth', 'general practitioners', 'survey']
737
['Coronary artery bypass surgery', 'Heart failure', 'Natriuretic peptide', 'Postoperative care']
738
['Coronary artery bypass surgery', 'Glutamic acid', 'Heart failure', 'Natriuretic peptide', 'Postoperative care']
739
['MR-proADM', 'MR-proANP', 'cardiac surgery', 'hs-CRP', 'sP-selectin']
740
[]
741
['Advanced heart failure', 'Destination therapy', 'Guideline-directed medical therapy', 'HeartMate 3', 'Left ventricular assist device', 'Mechanical circulatory support', 'Randomized controlled trial']
742
[]
743
['aortic valve disease', 'cardiac function', 'heart failure', 'natriuretic peptide', 'surgery-valve']
744
['aortic valve disease', 'heart failure', 'surgery-valve']
745
[]
746
['cardiac surgery', 'copeptin', 'kinetics', 'perioperative care']
747
[]
748
['Cycloid psychosis', 'acut

['4-1BBL', 'CD40L', 'PD-1', 'PD-L1', 'TIM-3', 'checkpoint inhibitor', 'immune checkpoint blockade', 'oncolytic adenovirus']
1794
[]
1795
[]
1796
[]
1797
['Cancer', 'Mortality', 'Pregnancy', 'Pregnancy-associated cancer', 'Survival']
1798
['Cancer patients', 'Location of death', 'Palliative care', 'Register study', 'Socioeconomy']
1799
['Checkpoint inhibitor', 'influenza vaccination', 'progression-free survival', 'side effects', 'solid cancer']
1800
[]
1801
['Diagnostics', 'Entrectinib', 'Gene fusion', 'Larotrectinib', 'Lung adenocarcinoma', 'NTRK']
1802
['CT', 'Cutaneous malignant melanoma', 'FDG-PET/CT', 'Follow-up']
1803
[]
1804
['BRAF inhibitor', 'MEK inhibitor', 'V600E mutation', 'chemotherapy', 'low-grade serous ovarian cancer', 'next-generation sequencing', 'surgery', 'targeted therapy']
1805
['Combination immunotherapy', 'DC vaccination', 'adoptive tumor-infiltrating lymphocyte therapy', 'immune checkpoint inhibitor resistance', 'tumor lysate']
1806
['Cancer', 'Cessation', 'Onco

We can see from above that some keywords contain an apostrophe and are therefore in double quotes. We take a closer look at keywords strings that contain double quotes.

In [47]:
# Take a look at all keywords where " shows up
for i in range(len(hcp_df_pubmed)):
    if '"' in hcp_df_pubmed['keywords'][i]:
        print(i)
        print(hcp_df_pubmed['keywords'][i])

177
['Maternal mortality and morbidity', 'delivery', 'developing countries', 'education', 'information technology', 'labor', "women's health issues"]
305
['everyday life', 'multifaceted approach to change', 'neurosurgery', 'nursing', "patients' experience", 'stroke']
1543
["RAB6A'", 'WTH3', 'breast neoplasm', 'endocrine therapy', 'hormone receptors']
2293
['bladder', 'paraneurons', 'urethra', 'urothelium', "von Brunn's nest"]


We take a closer look at keywords strings that contain an asterisk.

In [48]:
# Take look at all keywords where * shows up
for i in range(len(hcp_df_pubmed)):
    if '*' in hcp_df_pubmed['keywords'][i]:
        print(i)
        print(hcp_df_pubmed['keywords'][i])

1895
['HLA-A*02', 'HLA-E', 'HLA-G', 'immunohistochemistry', 'ovarian cancer', 'prognosis', 'serous adenocarcinoma']
1902
['HLA-A*02 genotype', 'base of tongue cancer', 'clinical outcome', 'human papillomavirus (HPV) oropharyngeal squamous cell carcinoma', 'tonsillar cancer']


We see that there are no keywords strings that contain square brackets.

In [49]:
# Take a look at all keywords where ][ shows up
for i in range(len(hcp_df_pubmed)):
    if '][' in hcp_df_pubmed['keywords'][i]:
        print(i)
        print(hcp_df_pubmed['keywords'][i]) # no papers with it

We see that the keywords are given as strings, either as `[]` or `['keyword_1', ..., 'keyword_n']`. Sometimes, a keyword itself contains is a word with an apostrophe so that the keyword is in double quotes. We would like to remove the strings `[]` and convert the other keyword strings to a string of the form `keyword_1, ..., keyword_n`, i.e., remove the brackets, single quotes and double quotes.

The helper function `clean_keywords()` does this for us.

In [50]:
hcp_df_pubmed = clean_keywords(hcp_df_pubmed)
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_date,title
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","european journal of cancer (oxford, england : ...","ductal carcinoma in situ, radiotherapy, ttumou...",2022-03-04,tumourinfiltrating lymphocytes add prognostic ...
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",jama oncology,,2021-06-25,neoadjuvant trastuzumab pertuzumab docetaxel v...
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",clinical cancer research : an official journal...,,2021-05-07,high pdgfrb expression predicts resistance rad...
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",breast cancer research and treatment,"adjuvant, breast cancer, endocrine treatment, ...",2020-12-02,predictive role her2status effectiveness endoc...
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",scientific reports,,2019-12-04,evaluation diffusionweighted mri fdgpetct asse...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",journal of clinical nursing,,2005-01-27,health risks occupational exposure volatile an...
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",journal of gastrointestinal oncology,"trifluridine and tiperacil (tas-102), chemothe...",2020-09-22,real world aspects palliative trifluridine plu...
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",journal of contemporary brachytherapy,"hdr, boost, brachytherapy, hypo-fractionation,...",2022-03-03,predicting toxicity caused highdoserate brachy...
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",cancers,"x-ray computed, follow-up studies, melanoma, p...",2022-02-26,quality life first year followup randomized mu...


PubMed is a database entirely dedicated to medicine. So, the articles published on PubMed belong to the field of medicine. Google Scholar, by contrast, is not entirely concerned with medicine. To ensure that a paper scraped from Google Scholar is actually related to medicine and thus can come from an HCP, we would like to use the keywords provided in the PubMed data to later filter Google Scholar papers by these keywords. 

For this reason, we now want to create a list of keywords from PubMed that we can later use when cleaning the Google Scholar data.

The helper function `combine_keywords()` creates this list of keywords for us.

In [51]:
keywords_list = combine_keywords(hcp_df_pubmed)
keywords_list 

['gonadotropins',
 'neuregulins',
 'symptom cluster',
 'cardiovascular outcomes',
 'pdgfr',
 'neurochemistry',
 'structural equation modelling',
 'whole genome sequencing',
 'cultural adaptation',
 'freshwater toxicology',
 'mib1',
 'salivary glands',
 'overall survival',
 'hypoxia inducible factor (hif)',
 'treatment planning',
 'braf',
 'kymriah (tisagenlecleucel',
 'guidelines',
 'sequencing',
 'socioeconomy',
 'surrogate marker',
 'steroid prodrug',
 'randomized trials',
 'male',
 'nursing care',
 'lung cancer',
 'esas',
 'medical treatment',
 'tumour microenvironment',
 'technology education',
 'ct',
 'emt phenotypes',
 'comp',
 'warfarin',
 'tumour size',
 'semi-structured interviews',
 'serum-comp',
 'gmp',
 'thymidine kinase',
 'hormone receptor positive breast cancer',
 'pathogen vector',
 'proliferation',
 'secondary analysis',
 'national expenditures on cancer',
 'smoothing',
 'ontario',
 'oncology education and training',
 'tumour clonality',
 'child healthcare',
 'glycopep

It can be seen that the list contains mainly medicine-related keywords. However, we can also observe the presence of keywords not directly connected to medicine, e.g., education, vocabulary, young etc.

In [52]:
len(keywords_list)

2786

We save the list of keywords now.

In [62]:
# Save the keywords 
with open('keywords_list.txt', 'w') as f:
    f.write(json.dumps(keywords_list))

# Read in the keywords
with open('keywords_list.txt', 'r') as f:
    keywords_list = json.loads(f.read())

### `publication_date`

We want to keep only the publication year and therefore rename `publication_date` to `publication_year`.

In [53]:
hcp_df_pubmed = hcp_df_pubmed.rename(columns={'publication_date': 'publication_year'})

In [54]:
hcp_df_pubmed['publication_year'] = hcp_df_pubmed['publication_year'].str[:4]
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_year,title
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","european journal of cancer (oxford, england : ...","ductal carcinoma in situ, radiotherapy, ttumou...",2022,tumourinfiltrating lymphocytes add prognostic ...
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",jama oncology,,2021,neoadjuvant trastuzumab pertuzumab docetaxel v...
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",clinical cancer research : an official journal...,,2021,high pdgfrb expression predicts resistance rad...
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",breast cancer research and treatment,"adjuvant, breast cancer, endocrine treatment, ...",2020,predictive role her2status effectiveness endoc...
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",scientific reports,,2019,evaluation diffusionweighted mri fdgpetct asse...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",journal of clinical nursing,,2005,health risks occupational exposure volatile an...
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",journal of gastrointestinal oncology,"trifluridine and tiperacil (tas-102), chemothe...",2020,real world aspects palliative trifluridine plu...
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",journal of contemporary brachytherapy,"hdr, boost, brachytherapy, hypo-fractionation,...",2022,predicting toxicity caused highdoserate brachy...
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",cancers,"x-ray computed, follow-up studies, melanoma, p...",2022,quality life first year followup randomized mu...


### `num_articles`

In the above data cleaning steps, some papers were removed, e.g., when the name in `hcp_name` did not match any of the names given in `authors`. 

We now want to update the number of articles for each HCP provided in `num_articles`. 

The helper function `update_num_articles()` does this for us.

In [55]:
hcp_df_pubmed = update_num_articles(hcp_df_pubmed)
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_year,title
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","european journal of cancer (oxford, england : ...","ductal carcinoma in situ, radiotherapy, ttumou...",2022,tumourinfiltrating lymphocytes add prognostic ...
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",jama oncology,,2021,neoadjuvant trastuzumab pertuzumab docetaxel v...
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",clinical cancer research : an official journal...,,2021,high pdgfrb expression predicts resistance rad...
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",breast cancer research and treatment,"adjuvant, breast cancer, endocrine treatment, ...",2020,predictive role her2status effectiveness endoc...
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",scientific reports,,2019,evaluation diffusionweighted mri fdgpetct asse...
...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",journal of clinical nursing,,2005,health risks occupational exposure volatile an...
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",journal of gastrointestinal oncology,"trifluridine and tiperacil (tas-102), chemothe...",2020,real world aspects palliative trifluridine plu...
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",journal of contemporary brachytherapy,"hdr, boost, brachytherapy, hypo-fractionation,...",2022,predicting toxicity caused highdoserate brachy...
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",cancers,"x-ray computed, follow-up studies, melanoma, p...",2022,quality life first year followup randomized mu...


### Create new column `scraped_from`

In order to indicate that the papers in this data frame were scraped from PubMed, we create a new column `scraped_from` with the value `pm` for all the papers.

In [56]:
hcp_df_pubmed['scraped_from'] = 'pm'
hcp_df_pubmed

Unnamed: 0,hcp_name,num_articles,abstract,authors,journal,keywords,publication_year,title,scraped_from
0,Aglaia Schiza,6,the immune microenvironment important modulato...,"Aglaia Schiza, Viktoria Thurfjell, Axel Tullbe...","european journal of cancer (oxford, england : ...","ductal carcinoma in situ, radiotherapy, ttumou...",2022,tumourinfiltrating lymphocytes add prognostic ...,pm
1,Aglaia Schiza,6,trastuzumab emtansine (t-dm1) presently approv...,"Thomas Hatschek, Theodoros Foukakis, Judith Bj...",jama oncology,,2021,neoadjuvant trastuzumab pertuzumab docetaxel v...,pm
2,Aglaia Schiza,6,this study analyzes potential stromal platelet...,"Carina Strell, Dick Folkvaljon, Erik Holmberg,...",clinical cancer research : an official journal...,,2021,high pdgfrb expression predicts resistance rad...,pm
3,Aglaia Schiza,6,there conflicting results potential role her2-...,"Aglaia Schiza, Davide Mauri, Irma Fredriksson,...",breast cancer research and treatment,"adjuvant, breast cancer, endocrine treatment, ...",2020,predictive role her2status effectiveness endoc...,pm
4,Aglaia Schiza,6,the purpose evaluate potential diffusion-weigh...,"Aglaia Schiza, Sandra Irenaeus, Francisco Niet...",scientific reports,,2019,evaluation diffusionweighted mri fdgpetct asse...,pm
...,...,...,...,...,...,...,...,...,...
2659,Anna Nyberg,33,to provide practicing nurse anaesthetists evid...,"Roland Nilsson, Chatarina Björdal, Matts Ander...",journal of clinical nursing,,2005,health risks occupational exposure volatile an...,pm
2660,Mikael Wallander,1,while recent randomised phase iii trials trifl...,"Mikael Wallander, Bo Rolander, Elisabeth Lundq...",journal of gastrointestinal oncology,"trifluridine and tiperacil (tas-102), chemothe...",2020,real world aspects palliative trifluridine plu...,pm
2661,Frida Jakobsson,3,treating localized prostate cancer (pc) combin...,"Johan Olsén, Dalia Estefan, Antonios Valachis,...",journal of contemporary brachytherapy,"hdr, boost, brachytherapy, hypo-fractionation,...",2022,predicting toxicity caused highdoserate brachy...,pm
2662,Frida Jakobsson,3,the benefit imaging follow-up setting high-ris...,"Ylva Naeser, Hildur Helgadottir, Johan Hansson...",cancers,"x-ray computed, follow-up studies, melanoma, p...",2022,quality life first year followup randomized mu...,pm


### Save cleaned PubMed data

In [77]:
# Create results folder
route0 = "../processed_data"

if not os.path.exists(route0):
    os.mkdir(route0)

print(f"saving file corresponding to results_pm.csv")
hcp_df_pubmed.to_csv(f"{route0}/results_pm.csv", index = False)

saving file corresponding to results_pm.csv


In [78]:
# Read in data scraped from PubMed
hcp_df_pubmed = pd.read_csv(f"{route0}/results_pm.csv")
hcp_df_pubmed.shape

(2664, 9)