# Preparation of Google Scholar data

This notebook deals with the Google Scholar data. It first takes a look at the basic structure of the data. Then, it analyzes the Google Scholar data and prepares it.

In [1]:
import numpy as np
import pandas as pd

import requests
import time

import json
import os

import pandas as pd
from bs4 import BeautifulSoup
import lxml

from gensim.parsing.preprocessing import remove_stopwords

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

## Define helper functions

In [2]:
def print_index_and_value(data, variable):
    """
    Print the index and value of a variable in a data frame.
    """
    for i in range(len(data)):
        print(i)
        print(data[variable][i])

In [3]:
def clean_titles(data, variable, gs = False):
    """
    Clean titles.
    """
    for i in range(len(data)):
        entry = data[variable][i]
        if pd.notnull(data.at[i, variable]):
            entry = remove_stopwords(entry)
            
            if gs:
                entry = entry.replace("[BOOK][B] ", "")\
                                .replace("[CITATION][C] ", "")\
                                    .replace("[HTML][HTML] ", "")\
                                        .replace("[PDF][PDF] ", "")\
                                            .replace("\xa0…", "")

            data.at[i, variable] = entry.lower()
                
    data[variable] = data[variable].str.replace('[^\w\s]','')
    
    return data

In [4]:
def filter_by_keywords(data):
    """
    Filter papers by keywords.
    """
    mask = data.title.apply(lambda x: any(item for item in keywords_list if item in x))
    data = data[mask].reset_index(drop=True)
    
    return data

In [5]:
def truncate_names_1(data, column):
    """
    Truncate names.
    """
    for i in range(len(data)):
        names_list = data[column][i].split(", ")
        names_list_truncated = []
        for j in range(len(names_list)):
            names_list_split = names_list[j].lstrip().rstrip().replace("-", " ").split(" ")
            name = names_list_split[0] + ' ' + names_list_split[-1]
            names_list_truncated += [name]
            
        names_truncated = ", ".join(names_list_truncated) 
            
        data.at[i, column] = names_truncated
        
    return data

In [6]:
def clean_authors(data, pubmed=False, gs=False):
    """
    Clean the authors column (remove brackets and quotes).
    """
    if pubmed:
        for i in range(len(data)):
                authors = data['authors'].iloc[i].replace("[", "")\
                                                    .replace("]", "")\
                                                        .replace("('", "")\
                                                            .replace("')", "")\
                                                                .replace("', '", " ")\
                                                                    .replace("(\'", "")\
                                                                        .replace("\')", "")\
                                                                            .replace('("', '')\
                                                                                .replace('")', '')\
                                                                                    .replace("\', \'", " ")\
                                                                                        .replace('\', "', ' ')\
                                                                                            .replace('", \'', ' ')\
                                                                                                .replace("\'", "")
        
                authors = authors.split(", ")
        
                index_list = []
                for j in range(len(authors)):
                    if authors[j] == "(None":
                        index_list += [j] + [j+1]
        
                index_list = list(set(index_list))
                full_index_list = list(range(len(authors)))
                index_list_choose = list(set(full_index_list) - set(index_list))
                authors = [authors[k] for k in index_list_choose]
                
                authors = ", ".join(authors)
                print(authors)
            
                data.at[i, 'authors'] = authors
        
        # remove the rows where no authors are left
        data = data[data['authors'] != ''].reset_index(drop=True)
        
    if gs:
        for i in range(len(data)):
            data.at[i, 'authors'] = data['authors'][i].strip()\
                                                        .replace("\xa0", "")\
                                                            .replace("…", "")\
                                                                . replace("'", "")
    
    return data

In [7]:
def truncate_names_2(data, column):
    """
    Truncate names.
    """
    for i in range(len(data)):
        names_list = data[column][i].split(", ")
        names_list_truncated = []
        for j in range(len(names_list)):
            names_list_split = names_list[j].lstrip().rstrip().replace("-", " ").split(" ")
            name = names_list_split[0][0] + ' ' + names_list_split[-1]
            names_list_truncated += [name]
            
        names_truncated = ", ".join(names_list_truncated) 
            
        data.at[i, column] = names_truncated
        
    return data

In [8]:
def remove_wrongly_scraped_papers(data):
    """
    Remove irrelevant papers.
    """
    index_list = []
    
    for i in range(len(data)):
        if data['hcp_name_short'][i] in data['authors'][i]:
            index_list += [i]
    
    data = data.iloc[index_list, :].reset_index(drop=True)
    
    return data

In [9]:
def split_publication(data):
    
    relevant_years = [str(i) for i in range(1950,2023)]
    
    for i in range(len(data)):
        if data['publications'].iloc[i][-5:-1] in relevant_years:
            data.at[i, 'publication_year'] = data['publications'].iloc[i][-5:-1]
            data.at[i, 'journal'] = data['publications'].iloc[i][:-7]
        else:
            data.at[i, 'publication_year'] = '' 
            data.at[i, 'journal'] = data['publications'].iloc[i]
         
        if data['journal'].iloc[i][-2:] == '\xa0…':
            data.at[i, 'journal'] = data['journal'].iloc[i][:-2]
            
        if data['journal'].iloc[i][:2] == '…\xa0':
            data.at[i, 'journal'] = data['journal'].iloc[i][2:]
            
        if data['journal'].iloc[i] == '…':
            data.at[i, 'journal'] = ''
            
        
    # drop column 'publications'
    data = data.reindex(columns=['hcp_name', 'num_articles', 'title', 'authors', 'journal', 
                                 'publication_year', 'snippet', 'hcp_name_short'])
            
    return data

In [10]:
def clean_abstract(data, gs=False):
    """
    Clean abtracts.
    """
    for i in range(len(data)):
        entry = data['abstract'][i]
        if pd.notnull(data.at[i,'abstract']):
            entry = remove_stopwords(entry)
            if gs:
                entry = entry.replace(" …", "")
            data.at[i, "abstract"] = entry.lower().replace('[^\w\s]','')
            
    return data

In [11]:
def update_num_articles(data):
    """
    Update the number of articles.
    """
    for i in range(len(data)):
        doctor = data['hcp_name'][i]
        num_papers = len(data[data['hcp_name'] == doctor])
        data.at[i, 'num_articles'] = num_papers
        
    return data

## Load data

In [12]:
# Read in data scraped from Google Scholar
hcp_df_gs = pd.read_csv(f"../../0_raw_data/web_scraping_data/results_queries_gs.csv") 
hcp_df_gs.shape

(3778, 7)

In [13]:
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,title_link,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,,
1,Adel Bader Hamdalla,0,,,,,
2,Adel Bader Hamdalla,0,,,,,
3,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...",https://jamanetwork.com/journals/jamaoncology/...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
4,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,https://translational-medicine.biomedcentral.c...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
...,...,...,...,...,...,...,...
3773,Frida Jakobsson,30,What's the big deal?,https://apo.org.au/node/312896,"D Cooper, O Rieger",2021,The dominant mode by which research libraries ...
3774,Frida Jakobsson,30,[PDF][PDF] Hur kan vi veta vad vi gör innan vi...,https://www.diva-portal.org/smash/get/diva2:11...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...
3775,Frida Jakobsson,30,What's the Big Deal? How Researchers Are Navig...,https://digitalcommons.unl.edu/scholcom/192/,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...
3776,Frida Jakobsson,30,[PDF][PDF] Kaffehus 2.0,https://expo.jmg.gu.se/wp-content/uploads/195-...,L Olsson,expo.jmg.gu.se,… Moderaterna kontaktade jag genom att mejla d...


## Initial preparatory steps

The following things should be done in the initial preparatory steps and the cleaning process:
* only keep relevant variables 
* delete rows without title 
* use keywords from pubmed to filter out irrelevant papers
* bring hcp_name into name format 
* num_articles are updated
* title: lower case, remove things like `[PDF][PDF]` 
* authors: clean them to first letter of first name format, which is already the case, and delete middle names
* publications: split into two columns 'journal' and 'publication_year'
* journal: lower case 
* 'publication_year': nothing
* snippet: rename to abstract before merging; lower case, remove stopwords, etc.
* add new column 'scraped_from' with value 'gs'

#### Drop irrelevant variables

In [14]:
hcp_df_gs = hcp_df_gs[['hcp_name', 'num_articles', 'title', 'authors', 'publications', 'snippet']] # snippet: abstract
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,
1,Adel Bader Hamdalla,0,,,,
2,Adel Bader Hamdalla,0,,,,
3,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...","T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
4,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
...,...,...,...,...,...,...
3773,Frida Jakobsson,30,What's the big deal?,"D Cooper, O Rieger",2021,The dominant mode by which research libraries ...
3774,Frida Jakobsson,30,[PDF][PDF] Hur kan vi veta vad vi gör innan vi...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...
3775,Frida Jakobsson,30,What's the Big Deal? How Researchers Are Navig...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...
3776,Frida Jakobsson,30,[PDF][PDF] Kaffehus 2.0,L Olsson,expo.jmg.gu.se,… Moderaterna kontaktade jag genom att mejla d...


#### Drop rows without authors

In [15]:
print("Count total NaN in each column:")
hcp_df_gs.isnull().sum()

Count total NaN in each column:


hcp_name          0
num_articles      0
title           214
authors         214
publications    214
snippet         258
dtype: int64

In [16]:
hcp_df_gs[hcp_df_gs['authors'].isnull()]

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,
1,Adel Bader Hamdalla,0,,,,
2,Adel Bader Hamdalla,0,,,,
20,Aglaia Schiza,17,,,,
22,Agneta Nordin Danfors,1,,,,
...,...,...,...,...,...,...
3704,Ylva Holmgren Stenlund,1,,,,
3743,Göran Carlstedt,8,,,,
3744,Göran Carlstedt,8,,,,
3746,Mikael Wallander,1,,,,


These are all doctors with less than 21 articles. These rows need to be dropped.

In [17]:
hcp_df_gs[hcp_df_gs['snippet'].isnull()]

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Adel Bader Hamdalla,0,,,,
1,Adel Bader Hamdalla,0,,,,
2,Adel Bader Hamdalla,0,,,,
15,Aglaia Schiza,17,[CITATION][C] Evidence on the occurrence of br...,"H Lindman, A Haji, M Jernling…","European Journal of …, 2018",
16,Aglaia Schiza,17,[CITATION][C] Evidence on the occurrence of br...,"H Lindman, A Haji, M Jernling…","European Journal of …, 2018",
...,...,...,...,...,...,...
3743,Göran Carlstedt,8,,,,
3744,Göran Carlstedt,8,,,,
3746,Mikael Wallander,1,,,,
3747,Mikael Wallander,1,,,,


There are 44 papers where the only missing information is the snippet. These rows are retained.

In [18]:
# Drop all rows where authors is NaN 
hcp_df_gs = hcp_df_gs.dropna(subset = ['authors']).reset_index(drop = True)
hcp_df_gs.shape

(3564, 6)

In [19]:
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Aglaia Schiza,10,"Neoadjuvant trastuzumab, pertuzumab, and docet...","T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
1,Aglaia Schiza,10,[HTML][HTML] Adenovirus-mediated CD40L gene tr...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
2,Aglaia Schiza,10,High PDGFRb Expression Predicts Resistance to ...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
3,Aglaia Schiza,10,[HTML][HTML] Evaluation of diffusion-weighted ...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
4,Aglaia Schiza,10,[HTML][HTML] Local irradiation does not enhanc...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
...,...,...,...,...,...,...
3559,Frida Jakobsson,30,What's the big deal?,"D Cooper, O Rieger",2021,The dominant mode by which research libraries ...
3560,Frida Jakobsson,30,[PDF][PDF] Hur kan vi veta vad vi gör innan vi...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...
3561,Frida Jakobsson,30,What's the Big Deal? How Researchers Are Navig...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...
3562,Frida Jakobsson,30,[PDF][PDF] Kaffehus 2.0,L Olsson,expo.jmg.gu.se,… Moderaterna kontaktade jag genom att mejla d...


#### Duplicates

In [20]:
print(hcp_df_gs.duplicated().sum())
hcp_df_gs[hcp_df_gs.duplicated()]

174


Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
13,Aglaia Schiza,17,[CITATION][C] Evidence on the occurrence of br...,"H Lindman, A Haji, M Jernling…","European Journal of …, 2018",
14,Aglaia Schiza,17,[CITATION][C] Evidence on the occurrence of br...,"H Lindman, A Haji, M Jernling…","European Journal of …, 2018",
34,Ana Bosch Campos,13,"Department of Pathology, Memorial Sloan Ketter...","F Pareja, JS Reis-Filho","cancer, 2017",Triple-negative breast cancers — a panoply of ...
84,Ann Charlotte Dreifaldt,20,[PDF][PDF] Environmental Medicine I Article,"L Hardell, B Van Bavel, G Lindström, M Carlberg…",Citeseer,Materials and Methods Incident cases with test...
271,Birgitta Lind,20,Human immunodeficiency virus type 1 biological...,"EM Fenyö, J Esbjörnsson, P Medstrand…","Journal of internal …, 2011",… Eva Maria Fenyö would especially like to tha...
...,...,...,...,...,...,...
3385,Olof Bjarnadottir,20,Safnrými er skapað og í raun skilgreint af hön...,Ó Bjarnadóttir,skemman.is,Viðfangsefni ritgerðarinnar er að skoða safnby...
3387,Olof Bjarnadottir,20,"[BOOK][B] Sir Joseph Banks, Iceland and the No...",A Agnarsdóttir,2017,Sir Joseph Banks was one of the great figures ...
3422,Samuel Rotstein,30,Preliminary In-vitro Study of Surface Alterati...,"MK Fossum, E Strömberg, J Sanchez…","TMS 2015 144th Annual …, 2015",Subcutaneous Venous Access Ports (SVAPs) are c...
3446,Sara Kinhult,30,Radiochemotherapy of Esophageal Cancer,M Albertsson,"Proceedings of SCANNING 99 Chicago, Illinois …...","In clinical work, where esophageal cancer is c..."


In [21]:
# Drop duplicates
hcp_df_gs = hcp_df_gs.drop_duplicates().reset_index(drop=True)
hcp_df_gs.shape

(3390, 6)

## Cleaning of relevant data

This part performs the cleaning of the relevant variables.
The cleaning is supposed to take place in the following order:
* 1. title
* 2. hcp_name
* 3. authors
* 4. publication
* 5. journal
* 6. snippet
* 7. abstract
* 8. num_articles
* 9. scraped_from
* 10. save data frame

###  `title`

Let us take a look at the titles.

In [24]:
print_index_and_value(hcp_df_gs, 'title')

0
Neoadjuvant trastuzumab, pertuzumab, and docetaxel vs trastuzumab emtansine in patients with ERBB2-positive breast cancer: a phase 2 randomized clinical trial
1
[HTML][HTML] Adenovirus-mediated CD40L gene transfer increases Teffector/Tregulatory cell ratio and upregulates death receptors in metastatic melanoma patients
2
High PDGFRb Expression Predicts Resistance to Radiotherapy in DCIS within the SweDCIS Randomized Trial
3
[HTML][HTML] Evaluation of diffusion-weighted MRI and FDG-PET/CT to assess response to AdCD40L treatment in metastatic melanoma patients
4
[HTML][HTML] Local irradiation does not enhance the effect of immunostimulatory AdCD40L gene therapy combined with low dose cyclophosphamide in melanoma patients
5
[HTML][HTML] Predictive role of HER2-status on the effectiveness of endocrine adjuvant treatment in postmenopausal breast cancer patients: a population-based cohort …
6
Abstract PO-018: Inflaming advanced solid tumors including pancreatic cancer using LOAd703, a TMZ-

1038
Privacy threat modeling for emerging BiobankClouds
1039
Supported nanostructured Ir and IrRu electrocatalysts for oxygen evolution in PEM electrolysers
1040
Pore structural characteristics, size exclusion properties and column performance of two mesoporous amorphous silicas and their pseudomorphically transformed …
1041
Effect of adjuvant trastuzumab for a duration of 9 weeks vs 1 year with concomitant chemotherapy for early human epidermal growth factor receptor 2–positive breast …
1042
[PDF][PDF] Adjuvant capecitabine, docetaxel, cyclophosphamide, and epirubicin for early breast cancer: final analysis of the randomized FinXX trial
1043
Expression of Ephb2 and Ephb4 in breast carcinoma
1044
[HTML][HTML] Adjuvant capecitabine in combination with docetaxel and cyclophosphamide plus epirubicin for breast cancer: an open-label, randomised controlled trial
1045
Adjuvant capecitabine in combination with docetaxel, epirubicin, and cyclophosphamide for early breast cancer: the randomized

[CITATION][C] Association between parity, histopathological tumor features and survival in high-grade serous ovarian cancer
2163
[CITATION][C] Association between parity, histopathological tumor features and survival in high-grade serous ovarian cancer
2164
[BOOK][B] Prognostic and treatment predictive factors for radio-and chemotherapy resistance in breast cancer patients-a step towards personlized medicine
2165
Vanishing bile duct-like syndrome in a patient with Hodgkin lymphoma–pathological development and restitution
2166
Studie av pressmaskiner för VS-arbete
2167
Franskt i svensk tappning: Studier over franska laanord i svenska dialekter.
2168
[PDF][PDF] Ett svenskt dialektlexikon i startgroparna
2169
[PDF][PDF] Ergonomisk utvärdering av rollatorprototyp” Walker”.
2170
[PDF][PDF] Ordlös kommunikation: En studie om det ordlösa samspelet mellan de yngsta barnen
2171
[PDF][PDF] Förslag på förbättring av verktygsupphängning vid monteringsarbete
2172
[CITATION][C] Ordbok över Sveriges 

We want to remove the string components `[BOOK][B]`, `[CITACION][C]`, `[HTML][HTML]` and `[PDF][PDF]`. In addition, we want to remove the string `\xa0…`, punctuation and stopwords. All titles need to converted to lower case.

The helper function `clean_titles()` does this for us.

In [25]:
hcp_df_gs = clean_titles(hcp_df_gs, 'title', gs = True)
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Aglaia Schiza,10,neoadjuvant trastuzumab pertuzumab docetaxel v...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
1,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
2,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
3,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
4,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
...,...,...,...,...,...,...
3385,Frida Jakobsson,30,whats big deal,"D Cooper, O Rieger",2021,The dominant mode by which research libraries ...
3386,Frida Jakobsson,30,hur kan vi veta vad vi gör innan vi har gjort ...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...
3387,Frida Jakobsson,30,whats big deal how researchers are navigating ...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...
3388,Frida Jakobsson,30,kaffehus 20,L Olsson,expo.jmg.gu.se,… Moderaterna kontaktade jag genom att mejla d...


We check the titles again.

In [20]:
print_index_and_value(hcp_df_gs, 'title')

0
neoadjuvant trastuzumab pertuzumab docetaxel vs trastuzumab emtansine patients erbb2positive breast cancer phase 2 randomized clinical trial
1
adenovirusmediated cd40l gene transfer increases teffectortregulatory cell ratio upregulates death receptors metastatic melanoma patients
2
high pdgfrb expression predicts resistance radiotherapy dcis swedcis randomized trial
3
evaluation diffusionweighted mri fdgpetct assess response adcd40l treatment metastatic melanoma patients
4
local irradiation enhance effect immunostimulatory adcd40l gene therapy combined low dose cyclophosphamide melanoma patients
5
predictive role her2status effectiveness endocrine adjuvant treatment postmenopausal breast cancer patients populationbased cohort 
6
abstract po018 inflaming advanced solid tumors including pancreatic cancer load703 tmzcd40l41bblarmed oncolytic virus
7
safety intratumoral immunostimulatory load703 gene therapy patients advanced cancer
8
ct301a phase ib study evaluate ro7198457 individualiz

896
prevalence consequences musculoskeletal symptoms symphony orchestra musicians vary gender crosssectional study
897
occurrence coexistence localized musculoskeletal symptoms findings workattending orchestra musiciansan exploratory crosssectional study
898
mixed venous oxygen saturation predicts shortand longterm outcome coronary artery bypass grafting surgery retrospective cohort analysis
899
euroscore ii nterminal probtype natriuretic peptide risk evaluation observational longitudinal study patients undergoing coronary artery bypass graft 
900
increasing rate angiotensinconverting enzyme inhibitorrelated upper airway angiooedema
901
mixed venous oxygen saturation prognostic marker surgery aortic stenosis
902
randomized trial left ventricular assist device destination therapy versus guidelinedirected medical therapy patients advanced heart failure rationale 
903
rise fall ntprobnp aortic valve intervention
904
preoperative ntprobnp independently predicts outcome patients acute coron

den professionella polisen
2031
recycled materials sustainable investments
2032
csr en guide till företagets ansvar
2033
csr och hållbart företagande
2034
unpacking roles shareholder engagement intermediaries a case study engagement process carbon risk
2035
sociala krav som styrmedel offentlig upphandling en forskningsöversikt
2036
empowering youth voters precinct 19 strategies increasing youth voter turnout racial equity framework
2037
surgical treatment skeletal metastases 31 melanoma patients
2038
high expression glycolytic pigment proteins associated worse clinical outcome stage iii melanoma
2039
a biomarker panel predicts recurrencefree survival ulcerated primary cutaneous melanoma
2040
a phase iaib trial dnadependent protein kinase inhibitor dnapki m3814 combination radiotherapy patients advanced solid tumors
2041
a phase iaib trial dnapk inhibitor m3814 combination radiotherapy rt patients pts advanced solid tumors doseescalation results
2042
presence immune cells low tumor prol

3056
her2 cart cells eradicate uveal melanoma tcell therapyresistant human melanoma il2 transgenic nodscid il2 receptor knockout mice
3057
a patientderived xenograft preclinical trial reveals treatment responses resistance mechanism karonudib metastatic melanoma
3058
melanoma patientderived xenografts accurately model disease develop fast guide treatment decisions
3059
risk clinically relevant bleeding warfarintreated patientsinfluence ssri treatment
3060
randomized clinical trial inhibition trpv1 patients nonerosive gastroesophageal reflux disease partial response ppi treatment 
3061
fiveyear outcomes nivolumab patients wildtype braf advanced melanoma
3062
intussusceptive angiogenesis human metastatic malignant melanoma
3063
impaired relaxation stomach smooth muscle mice lacking cyclic gmpdependent protein kinase i
3064
a populationbased comparison ajcc 7th ajcc 8th editions patients diagnosed stage iii cutaneous malignant melanoma sweden
3065
realworld data pd1 inhibitor therapy meta

### Use keywords to filter out irrelevant papers 
We now want to filter out filter out papers that do not have at least one keyword from the `keywords_list` in their title. 

The helper function `filter_by_keywords()` does this for us.

In [26]:
# Read in the keywords
with open('keywords_list.txt', 'r') as f:
    keywords_list = json.loads(f.read())

In [27]:
keywords_list

['cancer genetics',
 'prospective studies',
 'primary brain tumor',
 'implantable cardioverter‐defibrillator',
 'cognitive functions',
 'fractional polynomials',
 'brain neoplasm',
 'mgmt testing',
 'freshwater toxicology',
 'post recurrence mortality',
 'invasive lobular carcinoma',
 'aortic disease',
 'chemoradiotherapy',
 'quality of life',
 'morris water maze',
 'macrophages',
 'secondary lymphedema',
 'prognosis',
 'perioperative care',
 'european medicines agency',
 'blood-brain barrier',
 'phenotypic compensation',
 'breast cancer screening',
 'protein-protein interaction',
 'driver genes',
 'neuregulins',
 'impt',
 'genetic biomarker',
 'sentinel node metastases',
 'cortical bone',
 'multipotency',
 'genetic epidemiology',
 'patient safety',
 'tyrosine kinase inhibitor',
 'purine nucleoside phosphorylase',
 'blood pressure',
 'decoding',
 'castration sensitive',
 'myocardial ischemia',
 'nursing education',
 'workplace violence',
 'brain metastases',
 'microenvironment',
 'neoa

In [28]:
len(keywords_list)

2786

In [31]:
hcp_df_gs = filter_by_keywords(hcp_df_gs) 
hcp_df_gs.shape

(3311, 6)

In [32]:
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Aglaia Schiza,10,neoadjuvant trastuzumab pertuzumab docetaxel v...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
1,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
2,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
3,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
4,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
...,...,...,...,...,...,...
3306,Frida Jakobsson,30,predicting toxicity caused highdoseratebrachyt...,D Estefan,2019,Aim To determine which parameters correlate to...
3307,Frida Jakobsson,30,alternative access vaihtoehtoinen saatavuus mo...,J Leppämäki,"Signum, 2020",UM 1/2020 alussa eikä pääsy artikkeleihin katk...
3308,Frida Jakobsson,30,hur kan vi veta vad vi gör innan vi har gjort ...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...
3309,Frida Jakobsson,30,whats big deal how researchers are navigating ...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...


### `hcp_name`

First, we explore the names in the column `hcp_name`.

In [27]:
hcp_df_gs['hcp_name'].unique().tolist()

['Aglaia Schiza',
 'Agneta Nordin Danfors',
 'Alaa Haidar',
 'Ana Bosch Campos',
 'Andreas Nearchou',
 'Ulrika Bergqvist',
 'Ann Charlotte Dreifaldt',
 'Elisabeth Ryd Ausén',
 'Anna Nordenskjöld',
 'Anna von Wachenfeldt Väppling',
 'Anna-Karin Tzikas',
 'Anna-Karin Wennstig',
 'Anna Maria Hasselgren Häll',
 'Anne-Kristine Andersson',
 'Antonios Valachis',
 'Elisabet Karlsson',
 'Barbro Linderholm',
 'Birgitta Lind',
 'Kristina Lindblom',
 'Maria Sandström',
 'Cecilia Graffman',
 'Cecilia Nilsson',
 'Chaido Chamalidou',
 'Charlotte Bratthäll',
 'Christina Haapaniemi Olsson',
 'Christina Linder Stragliotto',
 'Malin Steenhoff',
 'Therése Widerberg',
 'Claudia Lundgren',
 'Dan Lundstedt',
 'Yvonne Wengström',
 'Elisabet Lidbrink',
 'Elzbieta Wojtyna-Dziedzic',
 'Eva af Trampe',
 'Ann-Britt Nilsson',
 'Eva Tallroth',
 'Evangelos Digkas',
 'Fredrika Killander',
 'Gabriel Jonsson',
 'Gerhard Winblad',
 'Gilberto Morgan',
 'Git Martenhed',
 'Greger Nilsson',
 'Marika Hjelmqvist',
 'Helena Gra

The names in the column `hcp_name` do not follow a regular format: some names feature only one first name and once last name, e.g., Aglaia Schiza, while other names feature two first names and one last name, e.g., Anne Charlotte Dreifaldt, one first name and two last names, e.g., Ana Bosch Campos. Names connected by a hyphen are considered to be one name.

In the following, our convention for names is to only keep the first letter of the first name and the last name. The helper function `truncate_names_1()` does this for us.

In [33]:
hcp_df_gs = truncate_names_1(hcp_df_gs, 'hcp_name')
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet
0,Aglaia Schiza,10,neoadjuvant trastuzumab pertuzumab docetaxel v...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...
1,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...
2,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...
3,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...
4,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...
...,...,...,...,...,...,...
3306,Frida Jakobsson,30,predicting toxicity caused highdoseratebrachyt...,D Estefan,2019,Aim To determine which parameters correlate to...
3307,Frida Jakobsson,30,alternative access vaihtoehtoinen saatavuus mo...,J Leppämäki,"Signum, 2020",UM 1/2020 alussa eikä pääsy artikkeleihin katk...
3308,Frida Jakobsson,30,hur kan vi veta vad vi gör innan vi har gjort ...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...
3309,Frida Jakobsson,30,whats big deal how researchers are navigating ...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...


We add the variable `hcp_name_short`.

In [34]:
names_list = []
for i in range(len(hcp_df_gs)):
    names = hcp_df_gs['hcp_name'][i].split(" ")
    name = names[0][0] + ' ' + names[-1]
    names_list += [name]
names_list 

hcp_df_gs['hcp_name_short'] = names_list
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet,hcp_name_short
0,Aglaia Schiza,10,neoadjuvant trastuzumab pertuzumab docetaxel v...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg…","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...,A Schiza
1,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo…","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...,A Schiza
2,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza…","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...,A Schiza
3,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog…","Scientific reports, 2019",The purpose was to evaluate the potential of d...,A Schiza
4,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe…","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...,A Schiza
...,...,...,...,...,...,...,...
3306,Frida Jakobsson,30,predicting toxicity caused highdoseratebrachyt...,D Estefan,2019,Aim To determine which parameters correlate to...,F Jakobsson
3307,Frida Jakobsson,30,alternative access vaihtoehtoinen saatavuus mo...,J Leppämäki,"Signum, 2020",UM 1/2020 alussa eikä pääsy artikkeleihin katk...,F Jakobsson
3308,Frida Jakobsson,30,hur kan vi veta vad vi gör innan vi har gjort ...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...,F Jakobsson
3309,Frida Jakobsson,30,whats big deal how researchers are navigating ...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...,F Jakobsson


### `authors`

First of all, explore the names and clean them.

In [35]:
print_index_and_value(hcp_df_gs, 'authors')

0
T Hatschek, T Foukakis, J Bjöhle, T Lekberg… 
1
A Schiza, J Wenthe, S Mangsbo… 
2
C Strell, D Folkvaljon, E Holmberg, A Schiza… 
3
A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog… 
4
S Irenaeus, A Schiza, SM Mangsbo, J Wenthe… 
5
A Schiza, D Mauri, I Fredriksson, AK Wennstig… 
6
J Wenthe, E Eriksson, L Sandin, T Lövgren, JL Jarblad… 
7
S Irenaeus, J Wenthe, E Eriksson, A Schiza… 
8
JS Lopez, R Camidge, M Iafolla, S Rottey, M Schuler… 
9
A Schiza 
10
A Schiza 
11
A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog… 
12
H Lindman, A Haji, M Jernling… 
13
A Schiza, Y Naeser, A Sundin… 
14
Y Naeser, H Helgadottir, Y Brandberg, J Hansson… 
15
J Sun, K Tuncay, AA Haidar… 
16
K Tuncay, L Ensman, J Sun, AA Haidar… 
17
M Chbat, M Rakka, A Haidar, N Sabbah… 
18
M Rakka, A Haidar, N Bazzi, AE Safadi… 
19
AH Haidar, M Houseini, M Kshour 
20
M Elkabets, S Vora, D Juric, N Morse… 
21
M Aine, C Boyaci, J Hartman, J Häkkinen, S Mitra… 
22
S Chandarlapaty, M Scaltriti, M Will, Z Li… 
23
J Staaf, D Glodzik, A

1081
E Larsson, J Westberg 
1082
S Rimm 
1083
E Ulff, M Maroti, J Serup, U Falkmer 
1084
E Ulff, M Maroti, J Serup, M Nilsson… 
1085
M Maroti, E Ulff, B Wijma 
1086
E Ulff, M Maroti, J Serup 
1087
M Maroti, E Ulff, J Lyth, U Falkmer 
1088
E Ulff, M Maroti, J Serup, M Nilsson… 
1089
E Ulff, C Melin-Johansson, M Maroti… 
1090
AE Ulff, M Maroti 
1091
AE Ulff, M Maroti 
1092
G Burke, S Faithfull, H Probst 
1093
AC Geller, AJ Sober, Z Zhang, DR Brooks, DR Miller… 
1094
K Lasithiotakis, U Leiter, F Meier… 
1095
GE Chacaliaza Andia, MI Espinoza Berrospi 
1096
L Ring, Å Kettis‐lindblad, KI Kjellgren… 
1097
D Oliva 
1098
K Rosengren 
1099
L Norberg, R Johansson, T Rasmuson 
1100
L Norberg, R Johansson, T Rasmuson 
1101
P Tomani, P Axegård, L Norberg, LE Åkerlund 
1102
R Ziesig, P Tomani, H Schweinebarth… 
1103
S Brodin, K Johansson, L Norberg, K Vuollet 
1104
L Norberg, R Johansson, T Rasmuson 
1105
LJ Norberg 
1106
L Norberg, T Rasmuson 
1107
F Aldaeus, K Larsson, J Stevanic Srndovic… 
1108
L 

J Sandberg, E Sjöström 
1981
E Geijer Madsen, E Sjöström 
1982
E Sjöström 
1983
S Molavi, E Sjöström 
1984
E Sjöström 
1985
M Macquet, E Sjöström 
1986
T Borglund, H De Geer, S Sweet, M Frostenson… 
1987
H Setterberg, E Sjöström 
1988
E Sjöström 
1989
L Engström, E Sjöström Reinius 
1990
E Blomqvist, N Waltré, E Sjöström 
1991
T Borglund, H De Geer, S Sweet, M Frostenson… 
1992
JP Gond, E Sjöström 
1993
M Frostenson, E Sjöström 
1994
E Dunn, S Lee, E Sjostrom 
1995
R WEDin, J FAlKEniUS, RJ WEiSS… 
1996
J Falkenius, J Lundeberg, H Johansson… 
1997
J Falkenius, J Keskitalo, L Kanter, H Johansson… 
1998
B Van Triest, L Damstrup, J Falkenius, V Budach… 
1999
B Van Triest, L Damstrup, J Falkenius, V Budach… 
2000
J Falkenius, H Johansson, R Tuominen, MF Stolt… 
2001
J Falkenius 
2002
R Henriksson, J Falkenius, S Norin, D Öhman… 
2003
H Helgadottir, J Falkenius, H Eriksson, A Girnita… 
2004
L De Petris, S Friesland, D Brodin, H Carstens… 
2005
J Hansson, E Djureen-Mårtenson… 
2006
S Asghari 

J Vallon-Christersson, C Cayanan… 
3115
J Staaf, G Jönsson, M Ringnér… 
3116
K Haraldsson, N Loman, QX Zhang, O Johannsson… 
3117
D Glodzik, A Bosch, J Hartman, M Aine… 
3118
S Kimbung, N Loman, I Hedenfalk 
3119
AE Isern, I Tengrup, N Loman, H Olsson… 
3120
CA Maxwell, J Benítez, L Gómez-Baldó, A Osorio… 
3121
Q Romero, PO Bendahl, M Klintman, N Loman… 
3122
AC Antoniou, KB Kuchenbaecker, P Soucy… 
3123
T Bachelot, E Ciruelos, A Schneeweiss, F Puglisi… 
3124
S Håkansson, O Johannsson… 
3125
NP Tobin, JC Harrell, J Lövrot, SE Brage, MF Stolt… 
3126
Y Naeser, H Helgadottir, Y Brandberg, J Hansson… 
3127
F López-Medrano, O del Val-Muñoz… 
3128
F López-Medrano, O Val-Muñoz… 
3129
O Maller, AP Drain, AS Barrett, S Borgquist, B Ruffell… 
3130
O Bjarnadottir, Q Romero, PO Bendahl… 
3131
M Feldt, O Bjarnadottir… 
3132
O Bjarnadottir, S Kimbung, I Johansson, S Veerla… 
3133
O Bjarnadottir, M Feldt, M Inasu, PO Bendahl… 
3134
O Maller, AP Drain, AS Barrett, S Borgquist, B Ruffell… 
3135
F Hallm

We can see from above that the author strings have the following structure: `author_1, author_2, ..., author_n` and either end with `…\xa0`, `… `, ` ` or `\xa0`. We will now look a little deeper into these cases.

Case 1: author string ends with `…\xa0`

In [36]:
for i in range(len(hcp_df_gs)):
    if "\xa0" in hcp_df_gs['authors'][i]:
        if "…" in hcp_df_gs['authors'][i]:
            print(i)
            print(hcp_df_gs['authors'][i])

0
T Hatschek, T Foukakis, J Bjöhle, T Lekberg… 
1
A Schiza, J Wenthe, S Mangsbo… 
2
C Strell, D Folkvaljon, E Holmberg, A Schiza… 
3
A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog… 
4
S Irenaeus, A Schiza, SM Mangsbo, J Wenthe… 
5
A Schiza, D Mauri, I Fredriksson, AK Wennstig… 
12
H Lindman, A Haji, M Jernling… 
13
A Schiza, Y Naeser, A Sundin… 
14
Y Naeser, H Helgadottir, Y Brandberg, J Hansson… 
15
J Sun, K Tuncay, AA Haidar… 
16
K Tuncay, L Ensman, J Sun, AA Haidar… 
17
M Chbat, M Rakka, A Haidar, N Sabbah… 
18
M Rakka, A Haidar, N Bazzi, AE Safadi… 
20
M Elkabets, S Vora, D Juric, N Morse… 
21
M Aine, C Boyaci, J Hartman, J Häkkinen, S Mitra… 
23
J Staaf, D Glodzik, AB Campos… 
24
M Elkabets, S Vora, D Juric, N Morse… 
27
J Staaf, D Glodzik, A Bosch, J Vallon-Christersson… 
28
D Glodzik, A Bosch, J Hartman, M Aine… 
32
EE Pakos, AD Nearchou, RJ Grimer… 
35
V Golfinopoulos, G Pentheroudakis, G Salanti… 
46
A Valachis, NP Polyzos, A Nearchou, P Lind… 
47
A Valachis, A Nearchou, NP Pol

1759
S Kimbung, A Kovács, PO Bendahl, P Malmström… 
1760
T Hatschek, T Foukakis, J Bjöhle, T Lekberg… 
1761
M Carlsson, M Arman, M Backman, U Flatters… 
1762
MA Alzubi, TH Turner, AL Olex, SS Sohal… 
1763
S Kimbung, A Kovács, A Danielsson, PO Bendahl… 
1764
A Saracco, BK Szabó, E Tánczos, J Bergh… 
1765
T Foukakis, J Lövrot, A Matikas, I Zerdes… 
1766
S Kimbung, I Johansson, A Danielsson, S Veerla… 
1768
J Rosell, B Nordenskjöld, NO Bengtsson… 
1769
LH Zetterlund, J Frisell, A Zouzos, R Axelsson… 
1770
U Kjällquist, R Erlandsson, NP Tobin… 
1771
B Franzén, A Alexeyenko… 
1772
JM Carstensen, G Wingren, T Hatschek… 
1773
A Matikas, J Lövrot, A Ramberg, M Eriksson… 
1774
L Rydén, M Haglund, PO Bendahl, T Hatschek… 
1775
T Foukakis, T Fornander, T Lekberg, H Hellborg… 
1776
F Van Der Leij, SCJ Bosma, MJ Van De Vijver… 
1777
S Kimbung, I Markholm, J Bjöhle… 
1778
T Hatschek, T Foukakis, J Bjöhle, T Lekberg… 
1779
J Engstrand, N Kartalis, C Strömberg, M Broberg… 
1781
SCJ Bosma, F Leij, S Vr

Case 2: author string ends with `\xa0`

In [37]:
for i in range(len(hcp_df_gs)):
    if "\xa0" in hcp_df_gs['authors'][i]:
        if "…" not in hcp_df_gs['authors'][i]:
            print(i)
            print(hcp_df_gs['authors'][i])

19
AH Haidar, M Houseini, M Kshour 
29
F Pareja, JS Reis-Filho 
30
F Pareja, JS Reis-Filho 
33
AC Nearchou 
34
A Valachis, AD Nearchou, P Lind 
36
AC Nearchou 
37
AC Nearchou 
38
AC Nearchou 
39
AC Nearchou 
40
PT Zacharia, AC Nearchou 
41
PT Zacharia, AC Nearchou 
42
AC Nearchou 
43
AC Nearchou 
44
AC Nearchou 
45
AC Nearchou, SL Omirou 
48
A Valachis, A Nearchou, P Lind, D Mauri 
49
AC Nearchou 
50
PT Zacharia, AC Nearchou 
51
A Valachis, A Nearchou 
52
AC Nearchou 
54
DI Petropoulos, AC Nearchou 
55
AC Nearchou, NA Aspragathos 
57
PN Azariadis, AC Nearchou, NA Aspragathos 
58
SL Omirou, AC Nearchou 
59
AC Nearchou 
60
AC Nearchou 
66
AC Dreifaldt, M Carlberg, L Hardell 
67
J Karlsson, AC Dreifaldt, LB Mordhorst, B Sorbe 
70
L Hardell, AC Dreifaldt 
76
AC Dreifaldt, LB Mordhorst, BG Sorbe 
80
O Johansson 
92
KM Eggers, J Oldgren, A Nordenskjöld, B Lindahl 
104
A Nordenskjöld, P Güney, AM Nordenskjöld 
128
AK Tzikas, S Nemes, BK Linderholm 
146
A Arslan, E Aktas, B Sengul, B Tekin 
15

Case 3: author string ends with `… `

In [38]:
for i in range(len(hcp_df_gs)):
    if "\xa0" not in hcp_df_gs['authors'][i]:
        if "…" in hcp_df_gs['authors'][i]:
            print(i)
            print(hcp_df_gs['authors'][i])

6
J Wenthe, E Eriksson, L Sandin, T Lövgren, JL Jarblad… 
7
S Irenaeus, J Wenthe, E Eriksson, A Schiza… 
8
JS Lopez, R Camidge, M Iafolla, S Rottey, M Schuler… 
11
A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog… 
22
S Chandarlapaty, M Scaltriti, M Will, Z Li… 
71
JCS Bergh, A Andersson, J Bjohle, A Bosch… 
73
Y Brandberg, A Andersson, J Bjohle, A Bosch… 
78
L Hardell, B Van Bavel, G Lindström, M Carlberg… 
81
G Lindström, A Kärrman, B van Bavel, L Hardell… 
117
S Pasupathy, B Lindahl, P Litwin, R Tavella… 
120
AE Nordenskjöld, H Fohlin, LG Arnesson, Z Einbeigi… 
123
A Andersson, A Von Wachenfeldt Väppling, A De Jong… 
125
T Hatschek, J Bjöhle, E Lidbrink, T Lekberg, N Loman… 
129
S Janeva, C Zhang, A Kovács, TZ Parris, AK Tzikas… 
144
R Kornalijnslijper-Altena, A Andersson, Y Brandberg… 
145
AK Wennstig, C Wadsten, H Garmo, F Wärnberg… 
168
K Krasagakis, A Valachis, P Maniatakis… 
184
R Kornalijnslijper-Altena, A Andersson, Y Brandberg… 
185
A Valachis, S Autexier, I Grau, L Itu, D Jako

Case 4: author string ends with ` `

In [39]:
for i in range(len(hcp_df_gs)):
    if "\xa0" not in hcp_df_gs['authors'][i]:
        if "…" not in hcp_df_gs['authors'][i]:
            print(i)
            print(hcp_df_gs['authors'][i])

9
A Schiza 
10
A Schiza 
25
A Bosch Campos 
26
AB Campos 
31
M Sjöström 
61
U Bergqvist, A Hermansson 
62
U Bergqvist 
63
KJ Hellgren 
64
M Tränefors, P Stening 
74
AC Dreifaldt, M Carlberg, L Hardell 
75
AC Dreifaldt 
77
AC Dreifaldt, B Werner, M Carlberg, L Hardell 
79
K Olsson 
82
A Missirliu 
83
R Randén 
87
M Cunninghamn 
89
E Björk 
90
S Anttonen 
91
E Lindqvist 
116
A Nordenskjöld 
119
A Nordenskjöld 
124
AM Karumo, L Thorén, A Von Wachenfeldt Väppling 
126
C Wendt 
127
N Jansson Bertheussen, L Welander 
140
AK Wennstig 
156
M Lichinitser, V Jenkins, AS Muñoz, Z Machackova 
159
D Komp 
160
MG Liljefors 
162
AK Petersen, J Andersson, AB Rasmussen 
190
A Valachis 
191
L Breimer, A Valachis, L Olsson 
194
E Karlsson 
197
AME Karlsson 
198
E Karlsson 
199
E Karlsson 
200
CH Daub, YE Karlsson, S Stiller 
201
CH Daub, YE Karlsson 
202
CH Daub, YE Karlsson, S Stiller 
203
YE Karlsson, CH Daub 
206
E Karlsson 
212
E Hallberg 
213
S Karlberg, H Eriksson 
214
H Eriksson, S Karlberg 
215
C

Looking through the author names, we also see that there are some authors whose names feature apostrophes. 

Example: S Na'ara (index 379)

We now take a deeper look at those names.

In [40]:
for i in range(len(hcp_df_gs)):
    if "'" in hcp_df_gs['authors'][i]:
        print(i)
        print(hcp_df_gs['authors'][i])

379
Y Binenbaum, S Na'ara, Z Gil 
859
M Buyse, S Loi, L Van't Veer, G Viale… 
984
C Weadick, K Larsson, S O'Reilly, E McMahon… 
1382
C Parker, D Heinrich, JM O'Sullivan… 
1386
AO Sartor, D Heinrich, JM O'Sullivan… 
1387
O Sartor, D Heinrich, SI Helle, JM O'Sullivan… 
1722
KB Meyer, M O'Reilly, K Michailidou, S Carlebur… 
1965
ED Crawford, AB Barqawi, C O'Donnell… 
2544
R Fabia, A Ar'Rajab, ML Johansson, R Andersson… 
3045
BM Simões, CS O'Brien, R Eyre, A Silva, L Yu… 
3201
F Fata, IG Ron, N Kemeny, E O'Reilly… 


When at least one author name contains an apostrophe as part of the name, the author string is in double quotes.

We see that there are no authors strings that contain double quotes.

In [42]:
for i in range(len(hcp_df_gs)):
    if '"' in hcp_df_gs['authors'][i]:
        print(i)
        print(hcp_df_gs['authors'][i])

We now want to remove the `…\xa0`, `\xa0`, `…` and ` ` at the end of the author strings. Furthermore, we want to replace `'` by ` `.

The helper function `clean_authors()` does this for us.

In [43]:
hcp_df_gs = clean_authors(hcp_df_gs, gs=True)
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet,hcp_name_short
0,Aglaia Schiza,10,neoadjuvant trastuzumab pertuzumab docetaxel v...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...,A Schiza
1,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...,A Schiza
2,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...,A Schiza
3,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Ortiz-Nieto, A Loskog","Scientific reports, 2019",The purpose was to evaluate the potential of d...,A Schiza
4,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, SM Mangsbo, J Wenthe","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...,A Schiza
...,...,...,...,...,...,...,...
3306,Frida Jakobsson,30,predicting toxicity caused highdoseratebrachyt...,D Estefan,2019,Aim To determine which parameters correlate to...,F Jakobsson
3307,Frida Jakobsson,30,alternative access vaihtoehtoinen saatavuus mo...,J Leppämäki,"Signum, 2020",UM 1/2020 alussa eikä pääsy artikkeleihin katk...,F Jakobsson
3308,Frida Jakobsson,30,hur kan vi veta vad vi gör innan vi har gjort ...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...,F Jakobsson
3309,Frida Jakobsson,30,whats big deal how researchers are navigating ...,"D Cooper, OY Rieger",2021,The dominant mode by which research libraries ...,F Jakobsson


It can be seen that the above cleaning steps have worked out.

In order for the author names to following our convention, we now need to truncate the names: keep only first letter of first name and last name. Again, the helper function `truncate_names_2()` does this for us.

In [44]:
hcp_df_gs = truncate_names_2(hcp_df_gs, 'authors')
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet,hcp_name_short
0,Aglaia Schiza,10,neoadjuvant trastuzumab pertuzumab docetaxel v...,"T Hatschek, T Foukakis, J Bjöhle, T Lekberg","JAMA …, 2021",Importance Trastuzumab emtansine (T-DM1) is pr...,A Schiza
1,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...,A Schiza
2,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...,A Schiza
3,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog","Scientific reports, 2019",The purpose was to evaluate the potential of d...,A Schiza
4,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...,A Schiza
...,...,...,...,...,...,...,...
3306,Frida Jakobsson,30,predicting toxicity caused highdoseratebrachyt...,D Estefan,2019,Aim To determine which parameters correlate to...,F Jakobsson
3307,Frida Jakobsson,30,alternative access vaihtoehtoinen saatavuus mo...,J Leppämäki,"Signum, 2020",UM 1/2020 alussa eikä pääsy artikkeleihin katk...,F Jakobsson
3308,Frida Jakobsson,30,hur kan vi veta vad vi gör innan vi har gjort ...,"A Öjelid, D Öjelid",2017,För att hantera den moderna tidens komplexa ho...,F Jakobsson
3309,Frida Jakobsson,30,whats big deal how researchers are navigating ...,"D Cooper, O Rieger",2021,The dominant mode by which research libraries ...,F Jakobsson


We take a final look at the authors.

In [45]:
print_index_and_value(hcp_df_gs, 'authors')

0
T Hatschek, T Foukakis, J Bjöhle, T Lekberg
1
A Schiza, J Wenthe, S Mangsbo
2
C Strell, D Folkvaljon, E Holmberg, A Schiza
3
A Schiza, S Irenaeus, F Nieto, A Loskog
4
S Irenaeus, A Schiza, S Mangsbo, J Wenthe
5
A Schiza, D Mauri, I Fredriksson, A Wennstig
6
J Wenthe, E Eriksson, L Sandin, T Lövgren, J Jarblad
7
S Irenaeus, J Wenthe, E Eriksson, A Schiza
8
J Lopez, R Camidge, M Iafolla, S Rottey, M Schuler
9
A Schiza
10
A Schiza
11
A Schiza, S Irenaeus, F Nieto, A Loskog
12
H Lindman, A Haji, M Jernling
13
A Schiza, Y Naeser, A Sundin
14
Y Naeser, H Helgadottir, Y Brandberg, J Hansson
15
J Sun, K Tuncay, A Haidar
16
K Tuncay, L Ensman, J Sun, A Haidar
17
M Chbat, M Rakka, A Haidar, N Sabbah
18
M Rakka, A Haidar, N Bazzi, A Safadi
19
A Haidar, M Houseini, M Kshour
20
M Elkabets, S Vora, D Juric, N Morse
21
M Aine, C Boyaci, J Hartman, J Häkkinen, S Mitra
22
S Chandarlapaty, M Scaltriti, M Will, Z Li
23
J Staaf, D Glodzik, A Campos
24
M Elkabets, S Vora, D Juric, N Morse
25
A Campos
26


1065
T Pesonen
1066
A Alves
1067
L Näsholm
1068
S Uthby, S Söder
1069
J Gligorov, S Richard
1070
Å Wickberg, G Liljegren, F Killander, H Lindman
1071
T Bachelot, E Ciruelos, A Schneeweiss, F Puglisi
1072
K Engvall, H Gréen, M Fredriksson
1073
L Kobyletzki, A Berner, F Carlstedt, M Hasselgren
1074
C CIPN, C Bolund
1075
E Coleman
1076
E Wu, J Mårtensson, L Desta, A Broström
1077
E Wu, L Desta, A Broström
1078
E Wu, J Mårtensson, L Desta
1079
M Severinson
1080
J Åström
1081
E Larsson, J Westberg
1082
S Rimm
1083
E Ulff, M Maroti, J Serup, U Falkmer
1084
E Ulff, M Maroti, J Serup, M Nilsson
1085
M Maroti, E Ulff, B Wijma
1086
E Ulff, M Maroti, J Serup
1087
M Maroti, E Ulff, J Lyth, U Falkmer
1088
E Ulff, M Maroti, J Serup, M Nilsson
1089
E Ulff, C Johansson, M Maroti
1090
A Ulff, M Maroti
1091
A Ulff, M Maroti
1092
G Burke, S Faithfull, H Probst
1093
A Geller, A Sober, Z Zhang, D Brooks, D Miller
1094
K Lasithiotakis, U Leiter, F Meier
1095
G Andia, M Berrospi
1096
L Ring, Å Kettis‐lindbla

T Rebbeck, N Mitra, F Wan, O Sinilnikova, S Healey
1854
K Kuchenbaecker, S Ramus, J Tyrer, A Lee
1855
E Shubbar, A Kovács, S Hajizadeh, T Parris, S Nemes
1856
N Tobin, J Harrell, J Lövrot, S Brage, M Stolt
1857
D Cox, J Simard, D Sinnett, Y Hamdi
1858
T Hatschek, T Foukakis, J Bjöhle, T Lekberg
1859
A Rohlin, Y Engwall, K Fritzell, K Göransson
1860
S Kimbung, A Kovács, A Danielsson, P Bendahl
1861
T Foukakis, J Lövrot, A Matikas, I Zerdes
1862
A Bergman, Z Einbeigi, U Olofsson, Z Taib
1863
S Kimbung, I Johansson, A Danielsson, S Veerla
1864
J Bergh, A Andersson, J Bjohle, A Bosch
1865
A Bergman, A Flodin, Y Engwall, E Arkblad, K Berg
1866
C Zeng, X Guo, J Long, K Kuchenbaecker
1867
E Shubbar, K Helou, A Kovács, S Nemes
1868
H Svensson, Y Brandberg, Z Einbeigi, T Hatschek
1869
J Nyqvist, T Parris, K Helou, E Sarenmalm
1870
H Svensson, T Hatschek, H Johansson, Z Einbeigi
1871
Y Hamdi, P Soucy, K Kuchenbaeker
1872
J Nyqvist, F Persson, T Parris, K Helou
1873
C Wendt, T Muranen, L Mielikäi

A Jögi, D Brennan, L Rydén, K Magnusson
3065
R Søkilde, H Persson, A Ehinger
3066
C Brueffer, J Christersson, D Grabau
3067
C Larsson
3068
C Larsson, A Syberfeldt, K Säfsten
3069
C Larsson, M Strand, A Persson
3070
A Bång, M Gustavsson, C Larsson, S Holmberg
3071
C Larsson, K Säfsten
3072
C Larsson
3073
C Larsson, K Säfsten, A Syberfeldt
3074
C Larsson
3075
C Larsson, C Rönnberg
3076
C Larsson, K Säfsten, A Syberfeldt
3077
A Bengtsson, C Larsson, R Sinkjaer
3078
C Larsson, H Sundström
3079
A Gustafsson, C Larsson, H Sundström
3080
M Öberg, C Larsson
3081
C Larsson
3082
C Larsson
3083
C Larsson, A Mårtensson
3084
C Larsson, M Mähönen
3085
E Brunbäck, C Larsson
3086
C Larsson
3087
C Larsson
3088
C Larsson
3089
A Jonsson, C Larsson, H Roos
3090
C Larsson
3091
C Larsson, M Nilsson
3092
C Larsson
3093
L Elowson, C Larsson
3094
C Larsson
3095
J Näslund, C Larsson
3096
A Antoniou, P Pharoah, S Narod, H Risch
3097
I Hedenfalk, D Duggan, Y Chen
3098
A Tutt, M Robson, J Garber, S Domchek, M Aude

### Filtering of papers by correspondence of hcp_name and authors

As we saw above, sometimes papers are scraped for certain HCPs even if there are no papers of them on PubMed at all. 

Example: For Ahmed Abbas Albu-Kareem (A Albu-Kareem), there are no papers on PubMed. Nevertheless, PyMed scrapes 100 papers, the maximum amount of papers, for him.

Therefore, we need to remove those papers where the name from `hcp_name` does not show up in the names from `authors`.

The helper function `remove_wrongly_scraped_papers()` does this for us.

In [46]:
hcp_df_gs = remove_wrongly_scraped_papers(hcp_df_gs)
hcp_df_gs.shape

(1449, 7)

In [47]:
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,publications,snippet,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo","Journal of …, 2017",Malignant melanoma is an aggressive tumor sens...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza","Clinical Cancer …, 2021",Purpose: This study analyzes the potential of ...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog","Scientific reports, 2019",The purpose was to evaluate the potential of d...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe","Oncotarget, 2017",Background AdCD40L is an immunostimulatory gen...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig","Breast Cancer Research …, 2021",Purpose There are conflicting results on the p...,A Schiza
...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",2009,Titel: Branschkontoplaner-En studie av BAS lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",2011,Syfte: Att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",2016,Titel: Endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",2006,Ersättning för inkomstförlust vid personskada ...,F Jakobsson


### Split `publications` into `journal` and `publications_year`

Let us take a look at the publications.

In [49]:
print_index_and_value(hcp_df_gs, 'publications')

0
Journal of …, 2017 
1
Clinical Cancer …, 2021 
2
Scientific reports, 2019 
3
Oncotarget, 2017 
4
Breast Cancer Research …, 2021 
5
2021 
6
2021 
7
2017 
8
2017 
9
Clin Oncol Case Rep 4, 2021 
10
Algorithms for …, 2007 
11
In silico …, 2007 
12
Journal of Surgery …, 2021 
13
JOJ Case …, 2019 
14
International Journal of …, 2014 
15
Breast Cancer …, 2020 
16
2013 
17
2013 
18
European journal of …, 2009 
19
Mechanism and machine theory, 1998 
20
Breast cancer research and treatment, 2014 
21
International Journal of Production Economics, 2011 
22
International Journal of Production Economics, 2006 
23
The International Journal of Advanced Manufacturing …, 2007 
24
International Journal of Production Economics, 2004 
25
Journal of Intelligent Manufacturing, 2012 
26
Engineering Applications of Artificial …, 2016 
27
Computers & Operations Research, 2008 
28
International Journal of Production Research, 2008 
29
Robotica, 1998 
30
Journal of Heuristics, 2006 
31
Journal of Clinical …, 20

1046
Oecologia, 2014 
1047
Scandinavian journal of primary health …, 1997 
1048
2007 
1049
2015 
1050
2005 
1051
2008 
1052
1995 
1053
1995 
1054
Onkologidagarna, 19–22 …, 2018 
1055
2014 
1056
The American journal of …, 2000 
1057
The Journal of cell …, 2007 
1058
… American journal of …, 2002 
1059
Applied and …, 1996 
1060
Molecular biology of …, 2005 
1061
Journal of lipid …, 2001 
1062
Digestion, 1993 
1063
Developmental cell, 2009 
1064
Molecular biology of …, 2003 
1065
Journal of medical …, 2004 
1066
Journal of molecular …, 1998 
1067
Critical …, 2005 
1068
Developmental cell, 2019 
1069
Holz als Roh-und …, 2001 
1070
Wood science and …, 2012 
1071
Molecular and General Genetics …, 1993 
1072
BMC clinical pharmacology, 2011 
1073
Holz als Roh-und Werkstoff, 2007 
1074
2007 
1075
Holz als Roh-und …, 2001 
1076
2009 
1077
International journal of …, 2001 
1078
Neurobiology of …, 2017 
1079
Eating Disorders, 2006 
1080
European Eating Disorders …, 2008 
1081
Nature …, 2021 
1082


In the variable `publications`, we can see that for some papers, only either journal or publication year is given, while both journal and publication year are given for other papers. 

We can also see that the journal is often imcomplete, ending with `…`. 

In order to clean the variable `publications`, we want to split it into two separate variables `journal` and `publication_year` and remove the character `…`.

The helper function `split_publication()` does this for us.

In [50]:
hcp_df_gs = split_publication(hcp_df_gs)
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,snippet,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",Journal of,2017,Malignant melanoma is an aggressive tumor sens...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",Clinical Cancer,2021,Purpose: This study analyzes the potential of ...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",Scientific reports,2019,The purpose was to evaluate the potential of d...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",Oncotarget,2017,Background AdCD40L is an immunostimulatory gen...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",Breast Cancer Research,2021,Purpose There are conflicting results on the p...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009,Titel: Branschkontoplaner-En studie av BAS lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011,Syfte: Att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016,Titel: Endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006,Ersättning för inkomstförlust vid personskada ...,F Jakobsson


In [51]:
hcp_df_gs[['journal', 'publication_year']] = hcp_df_gs[['journal', 'publication_year']].replace('', np.NaN)
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,snippet,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",Journal of,2017,Malignant melanoma is an aggressive tumor sens...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",Clinical Cancer,2021,Purpose: This study analyzes the potential of ...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",Scientific reports,2019,The purpose was to evaluate the potential of d...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",Oncotarget,2017,Background AdCD40L is an immunostimulatory gen...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",Breast Cancer Research,2021,Purpose There are conflicting results on the p...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009,Titel: Branschkontoplaner-En studie av BAS lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011,Syfte: Att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016,Titel: Endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006,Ersättning för inkomstförlust vid personskada ...,F Jakobsson


In [52]:
# Change data type of publication_year to int
hcp_df_gs['publication_year'] = hcp_df_gs['publication_year'].astype('float')
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,snippet,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",Journal of,2017.0,Malignant melanoma is an aggressive tumor sens...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",Clinical Cancer,2021.0,Purpose: This study analyzes the potential of ...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",Scientific reports,2019.0,The purpose was to evaluate the potential of d...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",Oncotarget,2017.0,Background AdCD40L is an immunostimulatory gen...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",Breast Cancer Research,2021.0,Purpose There are conflicting results on the p...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,Titel: Branschkontoplaner-En studie av BAS lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,Syfte: Att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,Titel: Endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,Ersättning för inkomstförlust vid personskada ...,F Jakobsson


### `journal`

We look at the journals first.

In [53]:
print_index_and_value(hcp_df_gs, 'journal')

0
Journal of
1
Clinical Cancer
2
Scientific reports
3
Oncotarget
4
Breast Cancer Research
5
nan
6
nan
7
nan
8
nan
9
Clin Oncol Case Rep 4
10
Algorithms for
11
In silico
12
Journal of Surgery
13
JOJ Case
14
International Journal of
15
Breast Cancer
16
nan
17
nan
18
European journal of
19
Mechanism and machine theory
20
Breast cancer research and treatment
21
International Journal of Production Economics
22
International Journal of Production Economics
23
The International Journal of Advanced Manufacturing
24
International Journal of Production Economics
25
Journal of Intelligent Manufacturing
26
Engineering Applications of Artificial
27
Computers & Operations Research
28
International Journal of Production Research
29
Robotica
30
Journal of Heuristics
31
Journal of Clinical
32
International journal of
33
Breast cancer research and
34
Engineering Applications of Artificial Intelligence
35
Computers & Operations Research
36
Acta Oncologica
37
Artificial Intelligence in Engineering
38
Clin

1274
nan
1275
Acta
1276
nan
1277
The EMBO
1278
Nature genetics
1279
Nature
1280
Trends in pharmacological sciences
1281
The Journal of urology
1282
Scandinavian journal
1283
British journal of
1284
Melanoma
1285
Acta
1286
The Lancet Oncology
1287
Annals of surgical
1288
British journal of
1289
British journal of
1290
Trials
1291
International Journal of
1292
International Journal of
1293
nan
1294
International Journal of
1295
Report-Department of Education and Educational
1296
NU, Kalmar, Sweden (2008)
1297
NU, Kalmar, Sweden (2008)
1298
nan
1299
nan
1300
nan
1301
Education and
1302
PloS one
1303
PLoS
1304
Acta
1305
Acta
1306
British journal of
1307
Clinical Cancer
1308
Cancer research
1309
Breast cancer research
1310
European journal of
1311
Oncogene
1312
Clinical cancer
1313
Breast cancer research
1314
Journal of Clinical
1315
journal of cancer
1316
Modern
1317
Life sciences
1318
Measuring Business Excellence
1319
Association Australasia 1
1320
Resuscitation
1321
and management: New 

In order for journals to have the same format, we want to convert the journals to lower case.

In [54]:
hcp_df_gs['journal'] = hcp_df_gs["journal"].str.lower()
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,snippet,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",journal of,2017.0,Malignant melanoma is an aggressive tumor sens...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",clinical cancer,2021.0,Purpose: This study analyzes the potential of ...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",scientific reports,2019.0,The purpose was to evaluate the potential of d...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",oncotarget,2017.0,Background AdCD40L is an immunostimulatory gen...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",breast cancer research,2021.0,Purpose There are conflicting results on the p...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,Titel: Branschkontoplaner-En studie av BAS lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,Syfte: Att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,Titel: Endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,Ersättning för inkomstförlust vid personskada ...,F Jakobsson


### Rename `snippet` to `abstract`

In [55]:
hcp_df_gs = hcp_df_gs.rename(columns={'snippet': 'abstract'})
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,abstract,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",journal of,2017.0,Malignant melanoma is an aggressive tumor sens...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",clinical cancer,2021.0,Purpose: This study analyzes the potential of ...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",scientific reports,2019.0,The purpose was to evaluate the potential of d...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",oncotarget,2017.0,Background AdCD40L is an immunostimulatory gen...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",breast cancer research,2021.0,Purpose There are conflicting results on the p...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,Titel: Branschkontoplaner-En studie av BAS lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,Syfte: Att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,Titel: Endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,Ersättning för inkomstförlust vid personskada ...,F Jakobsson


###  `abstract`

Let us take a look at the abstracts.

In [57]:
print_index_and_value(hcp_df_gs, 'abstract')

0
Malignant melanoma is an aggressive tumor sensitive for immunotherapy such as checkpoint 
blockade antibodies. Still, most patients with late stage disease do not respond, and the side 
effects can be severe. Stimulation of the CD40 pathway to initiate anti-tumor immunity is a …
1
Purpose: This study analyzes the potential of stromal platelet-derived growth factor receptor-beta 
(PDGFRb) expression as biomarker for radiotherapy (RT) benefit on ipsilateral breast 
events (IBE) in ductal carcinoma in situ (DCIS). Improved identification of DCIS patients …
2
The purpose was to evaluate the potential of diffusion-weighted-magnetic resonance imaging 
(DW-MRI) and 18 F-fludeoxy-glucose-positron emission tomography integrated with CT 
(FDG-PET/CT) for prediction of overall survival (OS) following AdCD40L-immunotherapy in …
3
Background AdCD40L is an immunostimulatory gene therapy under evaluation for 
advanced melanoma, including ocular melanoma. Herein, we present the final data of a 
Phas

810
Purpose To assess the association between height and risk of cancer and cancer death. 
Methods The metabolic syndrome and cancer project is a prospective pooled cohort study 
of 585,928 participants from seven cohorts in Austria, Norway, and Sweden. Hazard ratios (HRs) …
811
It has been proposed that folate and polymorphisms of the enzyme methylenetetrahydrofolate 
reductase (MTHFR), which regulates influx of folate from DNA synthesis and repair to methylation 
reactions, are involved in the aetiology of cancer. To relate the MTHFR 677C→ T and …
812
BACKGROUND Androgens have been implicated in prostate tumorigenesis, but prospective 
studies have overall reported no association between circulating levels of androgens and 
risk of prostate cancer. However, some recent studies have shown that a high level of …
813
Previous studies have shown a decreased risk of prostate cancer for childless men; however, 
the cause of the association remains to be elucidated. The aim of our study was

The abstracts of the individual papers contain a lot of stopwords, i.e., words which are necessary to build a sentence but which do not give much meaning to a sentence, e.g., 'is', 'has', 'on', 'to' etc. We would like to remove these stopwords. In addition, we would like to remove the frequently showing `…` string, replace special characters and convert the entire abstracts to lower case.

The helper function `clean_abstract()` does this for us.

In [58]:
hcp_df_gs = clean_abstract(hcp_df_gs, gs=True)
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,abstract,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",journal of,2017.0,malignant melanoma aggressive tumor sensitive ...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",clinical cancer,2021.0,purpose: this study analyzes potential stromal...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",scientific reports,2019.0,the purpose evaluate potential diffusion-weigh...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",oncotarget,2017.0,background adcd40l immunostimulatory gene ther...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",breast cancer research,2021.0,purpose there conflicting results potential ro...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,20,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,titel: branschkontoplaner-en studie av bas lan...,F Jakobsson
1445,Frida Jakobsson,20,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,syfte: att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,20,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,titel: endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,30,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,ersättning för inkomstförlust vid personskada ...,F Jakobsson


We check the abstracts again.

In [59]:
print_index_and_value(hcp_df_gs, 'abstract')

0
malignant melanoma aggressive tumor sensitive immunotherapy checkpoint blockade antibodies. still, patients late stage disease respond, effects severe. stimulation cd40 pathway initiate anti-tumor immunity
1
purpose: this study analyzes potential stromal platelet-derived growth factor receptor-beta (pdgfrb) expression biomarker radiotherapy (rt) benefit ipsilateral breast events (ibe) ductal carcinoma situ (dcis). improved identification dcis patients
2
the purpose evaluate potential diffusion-weighted-magnetic resonance imaging (dw-mri) 18 f-fludeoxy-glucose-positron emission tomography integrated ct (fdg-pet/ct) prediction overall survival (os) following adcd40l-immunotherapy
3
background adcd40l immunostimulatory gene therapy evaluation advanced melanoma, including ocular melanoma. herein, present final data phase i/iia trial adcd40l combination low dose cyclophosphamide+/-radiation
4
purpose there conflicting results potential role her2-status efficacy aromatase inhibitors (ais) 

mutations methyl-cpg-binding protein-2 (mecp2) gene xq28 cause rett syndrome (rs). in previous mutation screening, mecp2 mutations 81% swedish classical rett women. in study, analyzed 22 patients
1104
the effect gene region chromosome 2q33 containing cd28 cytotoxic t‐lymphocyte associated (ctla4) genes investigated diseases chronic inflammatory nature. in addition celiac disease (cd), type i diabetes, grave’s disease,
1105
in study test hypothesis endogenous particles exhaled air (pex), non-invasively sampled lower airways, suited analysis respiratory tract lining fluid (rtlf) proteins, ie, surfactant protein a (sp-a) albumin. ten healthy volunteers
1106
psoriasis known heterogeneous disease far reported major psoriasis susceptibility loci chromosome 4q, 6p 17q. in study investigated reported gene locations nonparametric parametric linkage analysis large family set
1107
when preadipocytes differentiate adipocytes, differentiation-linked genes activated. lipoprotein lipase (lpl) genes i

### `num_articles`

In the above data cleaning steps, some papers were removed, e.g., when the name in `hcp_name` did not match any of the names given in `authors`. 

We now want to update the number of articles for each HCP provided in `num_articles`. 

The helper function `update_num_articles()` does this for us.

In [60]:
hcp_df_gs = update_num_articles(hcp_df_gs)
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,abstract,hcp_name_short
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",journal of,2017.0,malignant melanoma aggressive tumor sensitive ...,A Schiza
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",clinical cancer,2021.0,purpose: this study analyzes potential stromal...,A Schiza
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",scientific reports,2019.0,the purpose evaluate potential diffusion-weigh...,A Schiza
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",oncotarget,2017.0,background adcd40l immunostimulatory gene ther...,A Schiza
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",breast cancer research,2021.0,purpose there conflicting results potential ro...,A Schiza
...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,17,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,titel: branschkontoplaner-en studie av bas lan...,F Jakobsson
1445,Frida Jakobsson,17,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,syfte: att kartlägga utbredningen av hållbarhe...,F Jakobsson
1446,Frida Jakobsson,17,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,titel: endometrios och den långa vägen till di...,F Jakobsson
1447,Frida Jakobsson,17,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,ersättning för inkomstförlust vid personskada ...,F Jakobsson


### Create new column `scraped_from`

In order to indicate that the papers in this data frame were scraped from Google Scholar, we create a new column `scraped_from` with the value `gs` for all the papers.

In [61]:
hcp_df_gs['scraped_from'] = 'gs'
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,abstract,hcp_name_short,scraped_from
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",journal of,2017.0,malignant melanoma aggressive tumor sensitive ...,A Schiza,gs
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",clinical cancer,2021.0,purpose: this study analyzes potential stromal...,A Schiza,gs
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",scientific reports,2019.0,the purpose evaluate potential diffusion-weigh...,A Schiza,gs
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",oncotarget,2017.0,background adcd40l immunostimulatory gene ther...,A Schiza,gs
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",breast cancer research,2021.0,purpose there conflicting results potential ro...,A Schiza,gs
...,...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,17,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,titel: branschkontoplaner-en studie av bas lan...,F Jakobsson,gs
1445,Frida Jakobsson,17,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,syfte: att kartlägga utbredningen av hållbarhe...,F Jakobsson,gs
1446,Frida Jakobsson,17,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,titel: endometrios och den långa vägen till di...,F Jakobsson,gs
1447,Frida Jakobsson,17,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,ersättning för inkomstförlust vid personskada ...,F Jakobsson,gs


### Save cleaned Google Scholar data

In [84]:
# Create results folder
route0 = "../processed_data"

if not os.path.exists(route0):
    os.mkdir(route0)

print(f"saving file corresponding to results_gs.csv")
hcp_df_gs.to_csv(f"{route0}/results_gs.csv", index = False)

saving file corresponding to results_gs.csv


In [85]:
# Read in data scraped from GS
hcp_df_gs = pd.read_csv(f"{route0}/results_gs.csv")
hcp_df_gs.shape

(1449, 9)

In [86]:
hcp_df_gs

Unnamed: 0,hcp_name,num_articles,title,authors,journal,publication_year,abstract,hcp_name_short,scraped_from
0,Aglaia Schiza,10,adenovirusmediated cd40l gene transfer increas...,"A Schiza, J Wenthe, S Mangsbo",journal of,2017.0,malignant melanoma aggressive tumor sensitive ...,A Schiza,gs
1,Aglaia Schiza,10,high pdgfrb expression predicts resistance rad...,"C Strell, D Folkvaljon, E Holmberg, A Schiza",clinical cancer,2021.0,purpose: this study analyzes potential stromal...,A Schiza,gs
2,Aglaia Schiza,10,evaluation diffusionweighted mri fdgpetct asse...,"A Schiza, S Irenaeus, F Nieto, A Loskog",scientific reports,2019.0,the purpose evaluate potential diffusion-weigh...,A Schiza,gs
3,Aglaia Schiza,10,local irradiation enhance effect immunostimula...,"S Irenaeus, A Schiza, S Mangsbo, J Wenthe",oncotarget,2017.0,background adcd40l immunostimulatory gene ther...,A Schiza,gs
4,Aglaia Schiza,10,predictive role her2status effectiveness endoc...,"A Schiza, D Mauri, I Fredriksson, A Wennstig",breast cancer research,2021.0,purpose there conflicting results potential ro...,A Schiza,gs
...,...,...,...,...,...,...,...,...,...
1444,Frida Jakobsson,17,branschkontoplaner en studie av bas lantbruk o...,"S Axelsson, F Jakobsson, L Olsson",,2009.0,titel: branschkontoplaner-en studie av bas lan...,F Jakobsson,gs
1445,Frida Jakobsson,17,hållbarhetsredovisning ett gap mellan utbud oc...,"F Jakobsson, S Axelsson",,2011.0,syfte: att kartlägga utbredningen av hållbarhe...,F Jakobsson,gs
1446,Frida Jakobsson,17,endometrios den långa vägen till diagnos,"M Gullbrand, F Jakobsson",,2016.0,titel: endometrios och den långa vägen till di...,F Jakobsson,gs
1447,Frida Jakobsson,17,beräkning av ersättning för inkomstförlust vid...,"F Jakobsson, J Boo",,2006.0,ersättning för inkomstförlust vid personskada ...,F Jakobsson,gs
