# Import libraries

In [1]:
import numpy as np
import pandas as pd
import re
import json
import requests # https://docs.python-requests.org/en/latest
from bs4 import BeautifulSoup # https://realpython.com/beautiful-soup-web-scraper-python
from googlesearch import search # https://www.geeksforgeeks.org/performing-google-search-using-python-code
import time

# Selenium, Chromedriver
# https://www.geeksforgeeks.org/how-to-install-selenium-in-python
# https://chromedriver.chromium.org
# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# Load Bik data

In [2]:
# Read TSV file in Python: https://www.geeksforgeeks.org/simple-ways-to-read-tsv-files-in-python
# UnicodeDecodeError: https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s

# The explicit codec avoids the UTF-8 decode failure referenced above.
bik_df = pd.read_csv(
    'Bik.tsv',
    sep='\t',
    encoding='unicode-escape',
)
# bik_df

# Clean data

In [3]:
# Drop empty rows: https://www.kite.com/python/answers/how-to-drop-empty-rows-from-a-pandas-dataframe-in-python

# Keep only the rows that have an Authors value (row labels are left untouched).
bik_df = bik_df.dropna(subset=["Authors"])
# bik_df

In [4]:
# Fix DOI character encoding

# Swap every "Ð" in the DOI strings for a hyphen.
bik_df["DOI"] = bik_df["DOI"].map(lambda doi: doi.replace("Ð", "-"))
# bik_df

In [5]:
# Remove leading space from Index 193's DOI
# Update one cell in dataframe: https://re-thought.com/how-to-change-or-update-a-cell-value-in-python-pandas-dataframe

# Trim surrounding whitespace from the DOI stored at row label 193.
doi_193 = bik_df.at[193, 'DOI']
bik_df.at[193, 'DOI'] = doi_193.strip()
# bik_df["DOI"][193]

In [6]:
# Standardize Year as int type

# Coerce every Year entry to int before the column is rebuilt.
years = bik_df["Year"].map(int)

# Drop original Year column: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
# Insert new Year column at specific index: https://stackoverflow.com/questions/18674064/how-do-i-insert-a-column-at-a-specific-column-index-in-pandas

# Swap the old column for the int-typed one, placed at column position 4.
bik_df = bik_df.drop(columns="Year")
bik_df.insert(loc=4, column="Year", value=years)
# bik_df

In [7]:
# Standardize Month as numeric str type if possible, empty otherwise

def _normalize_month(raw_month):
    """Return a month as an unpadded numeric string, or "" when unusable.

    Args:
        raw_month: One raw entry of the Month column. Missing entries arrive
            as float NaN; usable entries are strings.

    Returns:
        "1" .. "12" for a string holding a whole number in that range
        (zero-padded or not, so "01" -> "1" and "10" -> "10"); "" for
        everything else, including non-string values.
    """
    if not isinstance(raw_month, str):
        return ""
    digits = raw_month.strip()
    # isdecimal() guarantees int() succeeds; the range check rejects values
    # such as "0" or "13" that are numeric but not a month.
    if digits.isdecimal() and 1 <= int(digits) <= 12:
        return str(int(digits))
    return ""


# Previously only zero-padded values ("01".."09") survived, which silently
# blanked October-December; every numeric month is kept now.
months = [_normalize_month(month) for month in bik_df["Month"]]

# Drop original Month column
# Insert new Month column at specific index

bik_df = bik_df.drop(columns=["Month"])
bik_df.insert(5, "Month", months)
# bik_df

In [8]:
# Remove incorrect value

# Blank out column "3" at row labels 92 and 94.
for row_label in (92, 94):
    bik_df.at[row_label, '3'] = ""

In [9]:
# Convert booleans from float to str type
# Replace column values in dataframe: https://www.kite.com/python/answers/how-to-replace-column-values-in-a-pandas-dataframe-in-python

def bool_flt_to_str(df,col_label):
    """Rewrite a flag column in place as "1" / "" strings.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col_label: Label of the column to convert.

    Returns:
        None. After the call, every entry that compared equal to 1 (e.g. 1,
        1.0) or to the string "1" is "1"; every other entry, NaN included,
        is "".
    """
    # Assign the whole column back through df[...] in one step. Calling
    # replace(inplace=True) on df[col_label] is a chained in-place write that
    # does not reliably reach df, and repeating it per element while
    # iterating the same column made the conversion quadratic.
    df[col_label] = [
        "1" if value == 1 or value == "1" else ""
        for value in df[col_label]
    ]
            
# Columns "0" through "3" all get the same conversion.
for flag_column in ("0", "1", "2", "3"):
    bool_flt_to_str(bik_df, flag_column)

In [10]:
# Convert NaN from float to str type

def nan_to_str(df,col_label):
    """Blank out float-typed entries of a column, in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col_label: Label of the column to clean.

    Returns:
        None. After the call, every entry whose type is exactly ``float``
        (which is how a missing value, NaN, shows up in a text column) is the
        empty string; all other entries are left as they were.
    """
    # Assign the whole column back through df[...] in one step. Calling
    # replace(inplace=True) on df[col_label] is a chained in-place write that
    # does not reliably reach df, and it was repeated for every element.
    df[col_label] = [
        "" if type(value) is float else value
        for value in df[col_label]
    ]

# Both free-text columns get their missing entries blanked.
for text_column in ("FINDINGS", "Correction Date"):
    nan_to_str(bik_df, text_column)

In [11]:
# Replace incorrect value

# Set Reported to "1" at row labels 56 and 57.
for row_label in (56, 57):
    bik_df.at[row_label, "Reported"] = "1"

In [12]:
# Show the cleaned dataframe as the cell's output.
bik_df

Unnamed: 0,Authors,Title,Citation,DOI,Year,Month,0,1,2,3,FINDINGS,Reported,Correction Date,Retraction,Correction,No Action,SUM \nCompleted
0,"Inka Regine Weingaertner, Sarah Koutnik, Herma...",Chronic Morphine Treatment Attenuates Cell Gro...,"PLOS One, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053510,2013,1,1,,1,,"duplicated blot, shifted microscopy",1,10/09/2015,,1.0,,1.0
1,"Jessica M. Esparza, Eileen OÕToole, Linya Li, ...",Katanin Localization Requires Triplet Microtub...,"PLOS ONE, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053940,2013,1,,1,,,duplicated colony strip,1,01/15/2016,,1.0,,1.0
2,"Sreedevi Avasarala, Fangfang Zhang, Guangliang...",Curcumin Modulates the Inflammatory Response a...,"PLOS ONE, February 2013 | Volume 8 | Issue 2 |...",10.1371/journal.pone.0057285,2013,2,,,1,,reuse of one lane in different blot experiment,1,8/4/2015,,1.0,,1.0
3,"Rounak Nassirpour, Pramod P. Mehta, Sangita M....",miR-221 Promotes Tumorigenesis in Human Triple...,"PLOS ONE, April 2013 | Volume 8 | Issue 4 | e6...",10.1371/journal.pone.0062170,2013,4,1,,,1,Duplicated / mirrored bands in blot,1,4/10/2017,1.0,,,1.0
4,"Jianuo Liu, Peng Xu, Cory Collins, Han Liu, J...",HIV-1 Tat Protein Increases Microglial Outward...,"PLOS ONE, May 2013 | Volume 8 | Issue 5 | e649...",10.1371/journal.pone.0064904,2013,5,,1,,,multiple duplicated microscopy images,1,9/19/2014,,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,"Nathalie Delgehyr, Uta Wieland, He_lne Rangon...","Drosophila Mgr, a Prefoldin subunit cooperatin...","PNAS | April 10, 2012 | vol. 109 | no. 15 | 57...",10.1073/pnas.1108537109,2012,,,1,,,,1,11/14/2016,,1.0,,1.0
210,"Rajesh K. Grover, Julong Cheng, Yingjie Peng, ...",The costimulatory immunogen LPS induces the B-...,"PNAS _ April 17, 2012 _ vol. 109 _ no. 16, 603...",10.1073/pnas.1202214109,2012,,,1,,,,1,10/17/2016,,1.0,,1.0
211,"Kevin Y. Lee, Yuji Yamamoto, Jeremie Boucher, ...",Shox2 is a molecular determinant of depot-spec...,"PNAS | July 9, 2013 | vol. 110 | no. 28 | 1140...",10.1073/pnas.1310331110,2013,,,1,,,,1,4/11/2016,,1.0,,1.0
212,"Shuyu Ren, Bryce G. Johnson, Yujiro Kida, Coli...",LRP-6 is a coreceptor for multiple fibrogenic ...,"PNAS | January22,2013 | vol.110 | no.4 | 1441",10.1073/pnas.1211179110,2013,,,,1,,,1,11/14/2016,,1.0,,1.0


In [13]:
# print(Force error here.)

# Functions

## Get URLs
- List of URLs: `bik_URLS`
- Updated df, URL column: `bik_df["URL"]`

In [14]:
# Get URLs from DOI and PMID number

def get_URLS(df):
    """Build one URL per entry of ``df["DOI"]``, in row order.

    Args:
        df: DataFrame with a "DOI" column of strings.

    Returns:
        A list of URL strings. Entries starting with "10." are appended to the
        doi.org base; any other entry is treated as "PMID: <number>" and its
        first six characters are dropped before it is appended to the PubMed
        base.
    """
    doi_base = "http://doi.org/"
    pubmed_base = "http://pubmed.ncbi.nlm.nih.gov/"
    return [
        (doi_base + identifier)
        if identifier.startswith("10.")
        else (pubmed_base + identifier[6:])  # skip the "PMID: " prefix
        for identifier in df["DOI"]
    ]

In [15]:
# Save URLs to variable (one URL per row of bik_df, in row order)

bik_URLS = get_URLS(bik_df)
# bik_URLS

In [16]:
# Add URLs to dataframe as the "URL" column, then display the result

bik_df["URL"] = bik_URLS
bik_df

Unnamed: 0,Authors,Title,Citation,DOI,Year,Month,0,1,2,3,FINDINGS,Reported,Correction Date,Retraction,Correction,No Action,SUM \nCompleted,URL
0,"Inka Regine Weingaertner, Sarah Koutnik, Herma...",Chronic Morphine Treatment Attenuates Cell Gro...,"PLOS One, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053510,2013,1,1,,1,,"duplicated blot, shifted microscopy",1,10/09/2015,,1.0,,1.0,http://doi.org/10.1371/journal.pone.0053510
1,"Jessica M. Esparza, Eileen OÕToole, Linya Li, ...",Katanin Localization Requires Triplet Microtub...,"PLOS ONE, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053940,2013,1,,1,,,duplicated colony strip,1,01/15/2016,,1.0,,1.0,http://doi.org/10.1371/journal.pone.0053940
2,"Sreedevi Avasarala, Fangfang Zhang, Guangliang...",Curcumin Modulates the Inflammatory Response a...,"PLOS ONE, February 2013 | Volume 8 | Issue 2 |...",10.1371/journal.pone.0057285,2013,2,,,1,,reuse of one lane in different blot experiment,1,8/4/2015,,1.0,,1.0,http://doi.org/10.1371/journal.pone.0057285
3,"Rounak Nassirpour, Pramod P. Mehta, Sangita M....",miR-221 Promotes Tumorigenesis in Human Triple...,"PLOS ONE, April 2013 | Volume 8 | Issue 4 | e6...",10.1371/journal.pone.0062170,2013,4,1,,,1,Duplicated / mirrored bands in blot,1,4/10/2017,1.0,,,1.0,http://doi.org/10.1371/journal.pone.0062170
4,"Jianuo Liu, Peng Xu, Cory Collins, Han Liu, J...",HIV-1 Tat Protein Increases Microglial Outward...,"PLOS ONE, May 2013 | Volume 8 | Issue 5 | e649...",10.1371/journal.pone.0064904,2013,5,,1,,,multiple duplicated microscopy images,1,9/19/2014,,1.0,,1.0,http://doi.org/10.1371/journal.pone.0064904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,"Nathalie Delgehyr, Uta Wieland, He_lne Rangon...","Drosophila Mgr, a Prefoldin subunit cooperatin...","PNAS | April 10, 2012 | vol. 109 | no. 15 | 57...",10.1073/pnas.1108537109,2012,,,1,,,,1,11/14/2016,,1.0,,1.0,http://doi.org/10.1073/pnas.1108537109
210,"Rajesh K. Grover, Julong Cheng, Yingjie Peng, ...",The costimulatory immunogen LPS induces the B-...,"PNAS _ April 17, 2012 _ vol. 109 _ no. 16, 603...",10.1073/pnas.1202214109,2012,,,1,,,,1,10/17/2016,,1.0,,1.0,http://doi.org/10.1073/pnas.1202214109
211,"Kevin Y. Lee, Yuji Yamamoto, Jeremie Boucher, ...",Shox2 is a molecular determinant of depot-spec...,"PNAS | July 9, 2013 | vol. 110 | no. 28 | 1140...",10.1073/pnas.1310331110,2013,,,1,,,,1,4/11/2016,,1.0,,1.0,http://doi.org/10.1073/pnas.1310331110
212,"Shuyu Ren, Bryce G. Johnson, Yujiro Kida, Coli...",LRP-6 is a coreceptor for multiple fibrogenic ...,"PNAS | January22,2013 | vol.110 | no.4 | 1441",10.1073/pnas.1211179110,2013,,,,1,,,1,11/14/2016,,1.0,,1.0,http://doi.org/10.1073/pnas.1211179110


## Classify site structure
- List of site structures: `bik_site_struct`
- Updated df, site structure column: `bik_df["Home Site"]`

In [17]:
# Classify site structure from DOI
# 200 papers, 15 unique websites, 13 unique website structures

# Dictionary of home websites derived from DOI numbers
# Keys are site labels; values are the DOI fragments that identify them.
# Most fragments are the four characters after "10."; the "1016" entries are
# nine characters long because that prefix maps to more than one site.
site_struct_dict = {
    "PLOS": ["1371"],
    "ASM": ["1128"],
    "Hindawi": ["1155"],
    "Wiley": ["1002", "1111"],
    "BioMed": ["1186"],
    "Cancer": ["1016/S016", "1016/j.lu", "1016/j.cc"],
    "SciDirect": ["1016/j.ja", "1016/j.cy"],
    "Science": ["1126"],
    "Nature": ["1038"],
    "RUPress": ["1083"],
    "Spandidos": ["3892"],
    "PNAS": ["1073"],
    "PubMed": ["PMID"],
}

# Get key from value in dict: https://www.geeksforgeeks.org/python-get-key-from-value-in-dictionary
def get_key(dic,val):
    """Return the first key of ``dic`` whose value contains ``val``.

    Args:
        dic: Mapping whose values support the ``in`` operator.
        val: Item to look for inside each value.

    Returns:
        The first matching key in the mapping's iteration order, or None when
        no value contains ``val``.
    """
    return next((key for key, members in dic.items() if val in members), None)

# Get site structures
def get_site_struct(df):
    """Label every entry of ``df["DOI"]`` with its site name.

    Args:
        df: DataFrame with a "DOI" column of strings.

    Returns:
        A list with one label per row: "PubMed" for entries that do not start
        with "10."; otherwise whatever ``get_key`` finds in
        ``site_struct_dict`` (None when the fragment is not listed).
    """
    def classify(doi):
        if not doi.startswith("10."):
            return "PubMed"
        registrant = doi[3:7]
        # "1016" is shared by several sites, so a longer fragment is needed
        # to tell them apart.
        fragment = doi[3:12] if registrant == "1016" else registrant
        return get_key(site_struct_dict, fragment)

    return [classify(doi) for doi in df["DOI"]]

# One site label per row of bik_df (None where the DOI fragment is not in site_struct_dict).
bik_site_struct = get_site_struct(bik_df)
# bik_site_struct

In [18]:
# Add site structures to dataframe (stored in the "Home Site" column)

bik_df["Home Site"] = bik_site_struct
# bik_df

## First authors
- List of first authors: `first_authors`

In [19]:
# List of first authors from each paper

# The text before the first comma of each Authors string is taken as the first author.
first_authors = [author_list.split(",")[0] for author_list in bik_df["Authors"]]
# first_authors

In [20]:
# print(Force error here.)

SyntaxError: invalid syntax (3965129095.py, line 1)

# Add features
- `bik_df["Lab Size"]`: single int - lab size, counted as the number of the paper's authors for whom a ResearchGate search result was scraped
- `bik_df["Pub Rate"]`: list of pub rates for each author
- `bik_df["Other Journals"]`: list of lists of other journals published in for each author
- `bik_df["First Author Affiliation"]`: single str - affiliation university
- `bik_df["First Author Career duration"]`: single float - career duration
- `bik_df["First Author Degree"]`: single str - highest degree obtained
- `bik_df["First Author Degree Area"]`: single str - degree area

## All authors

Lab size, publication rate, other journals published in

In [None]:
# Extract list of journals published in

def get_journal_stats(soup_results):
    """Return the journal names found on a scraped page.

    Args:
        soup_results: Parsed page exposing ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        A list holding the text of every matching ``<div>``, with each
        parenthesised group (one that contains no nested parentheses)
        removed. The list is empty when nothing matches, or when any matching
        text contains 'Already a member?'.
    """
    # Extract list of journal names for both web pages that have journals and those that don't
    journal_raw = soup_results.find_all("div",class_="nova-legacy-e-text nova-legacy-e-text--size-m nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit")

    # Get text from HTML tag
    journal_texts = [tag.text for tag in journal_raw]

    # If 'Already a member?' appears anywhere in the extracted text, the
    # webpage doesn't have 'Top Journals', so the whole page is skipped.
    # A single pass is enough; the flag list used to be rebuilt for every tag.
    if any('Already a member?' in text for text in journal_texts):
        return []

    # Strip the parenthesised suffix from each journal entry.
    return [re.sub(r"\([^()]*\)", "", text) for text in journal_texts]

In [None]:
all_authors = []
all_authors_URLS = []
labs = []
rates = []
journals = []

# Extract lab size, publication rate, other journals published in for all authors.
# Each author costs one Google search, one browser page load and a 60 s pause,
# so a full pass over the dataframe is slow.

# Resolve the chromedriver binary once; the path is the same for every author.
chromedriver_path = ChromeDriverManager().install()

# For each paper
for paper_authors in bik_df["Authors"]:

    authors_in_paper = []
    authors_in_paper_URLS = []
    labs_in_paper = []
    rates_in_paper = []
    jours_in_paper = []

    # For each author of each paper
    for author in paper_authors.split(", "):

        print(author)
        authors_in_paper.append(author)

        # Google Search query for URL to Research Gate profile.
        # The space keeps the author's name and "researchgate" as separate terms.
        query = author + " researchgate"
        for profile_url in search(query, tld="com", num=1, stop=1, pause=2):

            # Save URL and scrape page
            authors_in_paper_URLS.append(profile_url)
            print(profile_url)
            # https://stackoverflow.com/questions/32937590/how-to-fake-javascript-enabled-in-python-requests-beautifulsoup
            driver = webdriver.Chrome(service=Service(chromedriver_path))
            try:
                driver.get(profile_url)
                html = driver.page_source
            finally:
                # Always close the browser, otherwise one Chrome instance per
                # author stays open for the whole run.
                driver.quit()
            # soup must remain the parsed document: everything below queries it.
            soup = BeautifulSoup(html, "html.parser")

            ########## LAB SIZE ##########

            # Author counter
            labs_in_paper.append(1)

            ########## PUBLICATION RATE ##########

            # Extract list of pub dates
            dates_raw = soup.find_all("li", class_="nova-legacy-e-list__item nova-legacy-v-publication-item__meta-data-item")

            # Extract years, convert str to int.
            # Only 8-character entries shaped like "Mon YYYY" are used.
            years = []
            for date_tag in dates_raw:
                date = date_tag.text
                if len(date) == 8 and date[3] == " " and date[4:].isdecimal():
                    years.append(int(date[4:]))

            # Calculate publication rate: span of publication years divided by
            # the number of dated entries; NaN when no dated entry was found.
            if years:
                rate = (max(years)-min(years))/len(years)
            else:
                rate = np.nan

            rates_in_paper.append(rate)
            print(rate)

            ########## OTHER JOURNALS ##########

            # Get other journals by calling previous function
            jour = get_journal_stats(soup)
            jours_in_paper.append(jour)
            print(jour)

            # Pause between authors before the next search and page load.
            time.sleep(60)

    all_authors.append(authors_in_paper) # List of authors
    all_authors_URLS.append(authors_in_paper_URLS) # List of Research Gate URLs

    print("----------")

    # Lab size is the number of authors that produced a search result.
    labs.append(sum(labs_in_paper)) # Labs, as single int per paper
    print(labs,"\n")

    rates.append(rates_in_paper) # Rates
    print(rates,"\n")

    journals.append(jours_in_paper) # Journals
    print(journals,"\n")

    time.sleep(60)

# all_authors_URLS
print(labs)
print(rates)
print(journals)

In [21]:
# Saved features to external Jupyter Notebook for easy access
# Import features

%store -r labs
%store -r rates
%store -r journals

In [22]:
# Add features to dataframe
# Only add when all instances completed, otherwise mismatched rows

# Append the three per-paper feature columns in one step.
bik_df = bik_df.assign(**{
    "Lab Size": labs,
    "Pub Rate": rates,
    "Other Journals": journals,
})
# bik_df

## First author only

Affiliation university, duration of career, highest degree, degree area

In [None]:
# Get list of URLs to first author ResearchGate profiles
# Google Search with Python: https://www.geeksforgeeks.org/performing-google-search-using-python-code

first_author_URLS = []
for author_name in first_authors:
    # Only the top search hit is requested for each author.
    query = author_name + " researchgate"
    for hit in search(query, tld="com", num=1, stop=1, pause=2):
        first_author_URLS.append(hit)
        print(hit) ### FEEDBACK
# first_author_URLS

In [None]:
# Saved feature to external Jupyter Notebook for easy access
# Import feature

# Reloads first_author_URLS from IPython's %store database.
%store -r first_author_URLS

In [None]:
# Extract affiliation university, career duration, highest degree, degree area for first author

def get_first_author_features(RG_url):
    """Scrape one ResearchGate page for first-author features.

    Args:
        RG_url: URL of the author's ResearchGate page. The fourth "/"-separated
            piece of the URL selects the layout: "scientific-contributions"
            or "profile". Any other URL yields blank text fields.

    Returns:
        ``[aff, dur, deg, dept]``:
        aff  - affiliation text, or "" when it could not be read;
        dur  - latest minus earliest publication year found on the page, or
               NaN when no dated entry was found;
        deg  - degree text ("" for the "scientific-contributions" layout or
               when it could not be read);
        dept - department text, or "" when it could not be read.

    Every extraction is best effort: a failing lookup leaves that field blank
    instead of aborting the scrape. The function sleeps for 60 seconds before
    returning.
    """
    # Scrape page
    url = RG_url
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service = s)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always close the browser, otherwise every call leaves a Chrome
        # instance running.
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")

    # Class string shared by the affiliation and department lookups on the
    # "profile" layout; the degree lookup uses it with a " title" suffix.
    grey_text_class = "nova-legacy-e-text nova-legacy-e-text--size-m nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-grey-600"

    aff = ""
    deg = ""
    dept = ""

    # A URL with fewer than four pieces is a wrong format from the automatic
    # Google search; all three fields then stay blank.
    url_parts = url.split('/')
    page_kind = url_parts[3] if len(url_parts) > 3 else ""

    # Extract affiliation university, highest degree, degree area
    if page_kind == "scientific-contributions":
        # Author profile curated by Research Gate
        try:
            aff = soup.find("h1", class_="nova-legacy-e-text nova-legacy-e-text--size-xl nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-grey-600 sci-con__header-title").find_all("span", attrs={"style":"color:#111111"})[1].text
        except Exception:
            aff = ""
        try:
            dept = json.loads(soup.find("script",{"type":"application/ld+json"}).text)["affiliation"]["department"]["name"]
        except Exception:
            dept = ""
    elif page_kind == "profile":
        # Registered Research Gate author profile
        try:
            aff = soup.find("div", class_=grey_text_class).find("span").find("span").text
        except Exception:
            aff = ""
        try:
            deg = soup.find("div", class_=grey_text_class + " title").text.split(",")[0]
        except Exception:
            deg = ""
        try:
            dept = soup.find("div", class_=grey_text_class).find_all("span")[2].find("span",text=re.compile("·")).next_sibling
        except Exception:
            dept = ""

    # Extract list of pub dates
    dates_raw = soup.find_all("li", class_="nova-legacy-e-list__item nova-legacy-v-publication-item__meta-data-item")

    # Extract years, convert str to int.
    # Only 8-character entries shaped like "Mon YYYY" are used.
    years = []
    for date_tag in dates_raw:
        date = date_tag.text
        if len(date) == 8 and date[3] == " " and date[4:].isdecimal():
            years.append(int(date[4:]))

    # Calculate career duration
    dur = (max(years)-min(years)) if years else np.nan

    # Pause before the caller requests the next page.
    time.sleep(60)

    return [aff, dur, deg, dept]

In [None]:
# Get features for each first author

affiliation = []
duration = []
degree = []
department = []

# Scraping starts at position 135 of first_author_URLS; the printed index
# is the position within the full list.
resume_from = 135
for counter, profile_url in enumerate(first_author_URLS[resume_from:], start=resume_from):
    features = get_first_author_features(profile_url)
    print("INDEX: ",counter)
    print(profile_url) ### FEEDBACK
    print(features) ### FEEDBACK
    aff, dur, deg, dept = features
    affiliation.append(aff)
    duration.append(dur)
    degree.append(deg)
    department.append(dept)

In [23]:
# Saved features to external Jupyter Notebook for easy access
# Import features

%store -r affiliation
%store -r duration
%store -r degree
%store -r department

In [24]:
# Add features to dataframe
# Only add when all instances completed, otherwise mismatched rows
    
# Column label -> list of values, added in this order.
first_author_columns = {
    "First Author Affiliation": affiliation,
    "First Author Career duration": duration,
    "First Author Degree": degree,
    "First Author Degree Area": department,
}
for column_label, column_values in first_author_columns.items():
    bik_df[column_label] = column_values

bik_df

Unnamed: 0,Authors,Title,Citation,DOI,Year,Month,0,1,2,3,...,SUM \nCompleted,URL,Home Site,Lab Size,Pub Rate,Other Journals,First Author Affiliation,First Author Career duration,First Author Degree,First Author Degree Area
0,"Inka Regine Weingaertner, Sarah Koutnik, Herma...",Chronic Morphine Treatment Attenuates Cell Gro...,"PLOS One, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053510,2013,1,1,,1,,...,1.0,http://doi.org/10.1371/journal.pone.0053510,PLOS,3,"[0.5, nan, nan]","[[PLoS ONE , Naunyn-Schmiedeberg's Archives of...",Ludwig-Maximilians-University of Munich,2.0,,
1,"Jessica M. Esparza, Eileen OÕToole, Linya Li, ...",Katanin Localization Requires Triplet Microtub...,"PLOS ONE, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053940,2013,1,,1,,,...,1.0,http://doi.org/10.1371/journal.pone.0053940,PLOS,7,"[0.7647058823529411, nan, nan, 0.23, nan, nan,...","[[PLoS ONE , Molecular Biology of the Cell , C...",Washington University in St. Louis,13.0,,Department of Genetics
2,"Sreedevi Avasarala, Fangfang Zhang, Guangliang...",Curcumin Modulates the Inflammatory Response a...,"PLOS ONE, February 2013 | Volume 8 | Issue 2 |...",10.1371/journal.pone.0057285,2013,2,,,1,,...,1.0,http://doi.org/10.1371/journal.pone.0057285,PLOS,6,"[nan, 0.09090909090909091, nan, nan, nan, nan]","[[], [Current Developments in Nutrition , The ...",University of Illinois at Chicago,11.0,,Department of Dental Medicine
3,"Rounak Nassirpour, Pramod P. Mehta, Sangita M....",miR-221 Promotes Tumorigenesis in Human Triple...,"PLOS ONE, April 2013 | Volume 8 | Issue 4 | e6...",10.1371/journal.pone.0062170,2013,4,1,,,1,...,1.0,http://doi.org/10.1371/journal.pone.0062170,PLOS,4,"[nan, nan, nan, nan]","[[], [], [], []]",,10.0,,
4,"Jianuo Liu, Peng Xu, Cory Collins, Han Liu, J...",HIV-1 Tat Protein Increases Microglial Outward...,"PLOS ONE, May 2013 | Volume 8 | Issue 5 | e649...",10.1371/journal.pone.0064904,2013,5,,1,,,...,1.0,http://doi.org/10.1371/journal.pone.0064904,PLOS,7,"[nan, nan, nan, nan, nan, nan, nan]","[[], [], [], [], [], [], []]",University of Nebraska Medical Center,19.0,,Department of Pharmacology and Experimental Ne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,"Nathalie Delgehyr, Uta Wieland, He_lne Rangon...","Drosophila Mgr, a Prefoldin subunit cooperatin...","PNAS | April 10, 2012 | vol. 109 | no. 15 | 57...",10.1073/pnas.1108537109,2012,,,1,,,...,1.0,http://doi.org/10.1073/pnas.1108537109,PNAS,12,"[nan, 0.38095238095238093, nan, nan, nan, nan,...","[[], [British Journal of Dermatology , Journal...",Ecole Normale Supérieure de Paris,20.0,,Département de Biologie
210,"Rajesh K. Grover, Julong Cheng, Yingjie Peng, ...",The costimulatory immunogen LPS induces the B-...,"PNAS _ April 17, 2012 _ vol. 109 _ no. 16, 603...",10.1073/pnas.1202214109,2012,,,1,,,...,1.0,http://doi.org/10.1073/pnas.1202214109,PNAS,10,"[nan, nan, nan, 0.35365853658536583, nan, nan,...","[[], [], [], [], [], [], [Publications, Reads ...",Delhi State Cancer Institute,35.0,Dr,Clinical Oncology
211,"Kevin Y. Lee, Yuji Yamamoto, Jeremie Boucher, ...",Shox2 is a molecular determinant of depot-spec...,"PNAS | July 9, 2013 | vol. 110 | no. 28 | 1140...",10.1073/pnas.1310331110,2013,,,1,,,...,1.0,http://doi.org/10.1073/pnas.1310331110,PNAS,8,"[0.26153846153846155, 0.7142857142857143, nan,...","[[], [Proceedings of the National Academy of S...",Harvard University,17.0,PhD,
212,"Shuyu Ren, Bryce G. Johnson, Yujiro Kida, Coli...",LRP-6 is a coreceptor for multiple fibrogenic ...,"PNAS | January22,2013 | vol.110 | no.4 | 1441",10.1073/pnas.1211179110,2013,,,,1,,...,1.0,http://doi.org/10.1073/pnas.1211179110,PNAS,11,"[0.5714285714285714, nan, 0.16666666666666666,...","[[], [], [Biochemical and Biophysical Research...",University of Washington Seattle,6.0,,


In [None]:
# print(Force error here.)

## Optional: Add missing data

In [None]:
# Extract affiliated university through alternative method if initial method unsuccessful
# Only run for blank values

def _scidirect_affiliation(soup):
    """Read the first author's affiliation from the page's embedded JSON."""
    payload = json.loads(soup.find("script",{"type":"application/json"}).text)
    first_author = payload["authors"]["content"][0]["$$"]
    entry = next(item for item in first_author if item["#name"] == "affiliation")
    fields = entry["$$"]
    # Prefer the second field; fall back to the first when there is only one.
    try:
        return fields[1]["_"]
    except IndexError:
        return fields[0]["_"]


def _affiliation_from_soup(soup, struct):
    """Look up the affiliation using the rule for the given site label."""
    if struct in ["PLOS","Wiley","BioMed","Cancer","RUPress","Spandidos"]:
        return soup.find("meta", attrs={"name":"citation_author_institution"})["content"]
    if struct == "ASM":
        return soup.find("div", {"property":"affiliation"}).find("span").text
    if struct == "Hindawi":
        return soup.find("div", class_="sc-fHxwqH dMuGGA isHide").find("span").text
    if struct == "SciDirect":
        return _scidirect_affiliation(soup)
    if struct == "Science":
        return soup.find("div", {"property":"organization"}).text
    if struct == "Nature":
        return json.loads(soup.find("script",{"type":"application/ld+json"}).text)["mainEntity"]["author"][0]["affiliation"][0]["name"]
    if struct == "PNAS":
        return soup.find("address").text
    if struct == "PubMed":
        return soup.find("a",class_="affiliation-link")["title"]
    return ""


def aff_backup(df, indx):
    """Scrape the paper's own page for an affiliation string.

    Args:
        df: DataFrame with "Home Site" and "URL" columns.
        indx: Row label of the paper to look up.

    Returns:
        The affiliation text found with the rule for the row's "Home Site"
        label, or "" when the label is not handled or the lookup fails for
        any reason.
    """
    # Scrape page
    struct = df.loc[indx, "Home Site"]
    url = df.loc[indx, "URL"]
    # https://stackoverflow.com/questions/32937590/how-to-fake-javascript-enabled-in-python-requests-beautifulsoup
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service = s)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always close the browser, otherwise every call leaves a Chrome
        # instance running.
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")

    # Extract affiliation based on site structure. Any failure, including one
    # inside the SciDirect fallback, gives a blank result instead of an
    # exception escaping to the caller.
    try:
        return _affiliation_from_soup(soup, struct)
    except Exception:
        return ""

In [None]:
# Get list of indices with blanks in First Author Affiliation column: https://www.statology.org/pandas-get-index-of-row
# Row labels whose affiliation is still the empty string.
is_blank = bik_df["First Author Affiliation"] == ""
blank_affs = bik_df.index[is_blank].tolist()
blank_affs

In [None]:
# For every blank First Author Affiliation value, call aff_backup
for row_label in blank_affs:
    raw_address = aff_backup(bik_df, row_label) # Apply alternative method
    # Keep only the comma-separated pieces that mention "Univers"; not perfect, very restrictive.
    # Pieces are used exactly as split, without trimming.
    university_parts = [part for part in raw_address.split(",") if "Univers" in part]
    if university_parts:
        # If university name extracted, replace blank value in dataframe
        bik_df.at[row_label, "First Author Affiliation"] = university_parts[0]
        # print(address[0])

In [25]:
# Show the dataframe with all added feature columns as the cell's output.
bik_df

Unnamed: 0,Authors,Title,Citation,DOI,Year,Month,0,1,2,3,...,SUM \nCompleted,URL,Home Site,Lab Size,Pub Rate,Other Journals,First Author Affiliation,First Author Career duration,First Author Degree,First Author Degree Area
0,"Inka Regine Weingaertner, Sarah Koutnik, Herma...",Chronic Morphine Treatment Attenuates Cell Gro...,"PLOS One, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053510,2013,1,1,,1,,...,1.0,http://doi.org/10.1371/journal.pone.0053510,PLOS,3,"[0.5, nan, nan]","[[PLoS ONE , Naunyn-Schmiedeberg's Archives of...",Ludwig-Maximilians-University of Munich,2.0,,
1,"Jessica M. Esparza, Eileen OÕToole, Linya Li, ...",Katanin Localization Requires Triplet Microtub...,"PLOS ONE, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053940,2013,1,,1,,,...,1.0,http://doi.org/10.1371/journal.pone.0053940,PLOS,7,"[0.7647058823529411, nan, nan, 0.23, nan, nan,...","[[PLoS ONE , Molecular Biology of the Cell , C...",Washington University in St. Louis,13.0,,Department of Genetics
2,"Sreedevi Avasarala, Fangfang Zhang, Guangliang...",Curcumin Modulates the Inflammatory Response a...,"PLOS ONE, February 2013 | Volume 8 | Issue 2 |...",10.1371/journal.pone.0057285,2013,2,,,1,,...,1.0,http://doi.org/10.1371/journal.pone.0057285,PLOS,6,"[nan, 0.09090909090909091, nan, nan, nan, nan]","[[], [Current Developments in Nutrition , The ...",University of Illinois at Chicago,11.0,,Department of Dental Medicine
3,"Rounak Nassirpour, Pramod P. Mehta, Sangita M....",miR-221 Promotes Tumorigenesis in Human Triple...,"PLOS ONE, April 2013 | Volume 8 | Issue 4 | e6...",10.1371/journal.pone.0062170,2013,4,1,,,1,...,1.0,http://doi.org/10.1371/journal.pone.0062170,PLOS,4,"[nan, nan, nan, nan]","[[], [], [], []]",,10.0,,
4,"Jianuo Liu, Peng Xu, Cory Collins, Han Liu, J...",HIV-1 Tat Protein Increases Microglial Outward...,"PLOS ONE, May 2013 | Volume 8 | Issue 5 | e649...",10.1371/journal.pone.0064904,2013,5,,1,,,...,1.0,http://doi.org/10.1371/journal.pone.0064904,PLOS,7,"[nan, nan, nan, nan, nan, nan, nan]","[[], [], [], [], [], [], []]",University of Nebraska Medical Center,19.0,,Department of Pharmacology and Experimental Ne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,"Nathalie Delgehyr, Uta Wieland, He_lne Rangon...","Drosophila Mgr, a Prefoldin subunit cooperatin...","PNAS | April 10, 2012 | vol. 109 | no. 15 | 57...",10.1073/pnas.1108537109,2012,,,1,,,...,1.0,http://doi.org/10.1073/pnas.1108537109,PNAS,12,"[nan, 0.38095238095238093, nan, nan, nan, nan,...","[[], [British Journal of Dermatology , Journal...",Ecole Normale Supérieure de Paris,20.0,,Département de Biologie
210,"Rajesh K. Grover, Julong Cheng, Yingjie Peng, ...",The costimulatory immunogen LPS induces the B-...,"PNAS _ April 17, 2012 _ vol. 109 _ no. 16, 603...",10.1073/pnas.1202214109,2012,,,1,,,...,1.0,http://doi.org/10.1073/pnas.1202214109,PNAS,10,"[nan, nan, nan, 0.35365853658536583, nan, nan,...","[[], [], [], [], [], [], [Publications, Reads ...",Delhi State Cancer Institute,35.0,Dr,Clinical Oncology
211,"Kevin Y. Lee, Yuji Yamamoto, Jeremie Boucher, ...",Shox2 is a molecular determinant of depot-spec...,"PNAS | July 9, 2013 | vol. 110 | no. 28 | 1140...",10.1073/pnas.1310331110,2013,,,1,,,...,1.0,http://doi.org/10.1073/pnas.1310331110,PNAS,8,"[0.26153846153846155, 0.7142857142857143, nan,...","[[], [Proceedings of the National Academy of S...",Harvard University,17.0,PhD,
212,"Shuyu Ren, Bryce G. Johnson, Yujiro Kida, Coli...",LRP-6 is a coreceptor for multiple fibrogenic ...,"PNAS | January22,2013 | vol.110 | no.4 | 1441",10.1073/pnas.1211179110,2013,,,,1,,...,1.0,http://doi.org/10.1073/pnas.1211179110,PNAS,11,"[0.5714285714285714, nan, 0.16666666666666666,...","[[], [], [Biochemical and Biophysical Research...",University of Washington Seattle,6.0,,


In [26]:
# Export dataframe to CSV (written as "Bik_pt4.csv", a path relative to the working directory)

bik_df.to_csv("Bik_pt4.csv")