In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import tika
import requests
import copy
import re

from bs4 import BeautifulSoup

from googlesearch import search
from tika import parser
from tika import detector
from tika import language

%matplotlib inline

def load_data(filename):
    
    df = pd.read_csv(filename, sep ='\t', header=0)
    print("Dataset size", df.shape)
    
    return df

# Classification Label (0-3) 
# 0:  simple duplications
# 1:  duplications with repositioning
# 2:  duplications with alteration 
# 3: Cuts & Beautification 

def explore_classification(dframe):
    
    print("Simple Duplication in ", dframe['0'].sum(), "papers")
    print("Duplications with Repositioning in ", dframe['1'].sum(), "papers")
    print("Duplications with alteration in ", dframe['2'].sum(), "papers")
    print("Cuts & Beautification in ", dframe['3'].sum(), "papers")
    print("How many papers reported? ", dframe['Reported'].sum(), "papers")
    
    return

def convert_datatype(dframe):
    
    # Ref: https://strftime.org/
    # Ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
    
    dframe["Day"] = dframe['Month'].str[0:2]
    dframe["Month"] = dframe['Month'].str[2:]
    dframe['Published Date'] = dframe['Year'].astype('str') + dframe['Month'] + dframe['Day']
    dframe['Published Date'] = pd.to_datetime(dframe['Published Date'], format='%Y%B%d', errors='coerce')
    dframe['3'] = pd.to_numeric(dframe['3'], errors = 'coerce')
    dframe['Reported'] = pd.to_numeric(dframe['Reported'], errors = 'coerce')
    dframe['Correction Date'] = pd.to_datetime(dframe['Correction Date'], errors = 'coerce')
    
    return dframe

# url name from DOI
def extract_url_using_requests(top_doi, dframe): 
    
    # Returns dict with DOI unique key, value is url of the paper
    
    out_frame = {}
    fail_frame = {}
    query_list = dframe['DOI'].to_list()
    
    for qlist in query_list:
        if 'pnas' not in qlist:
            try:           
                response = requests.get(url = top_doi+ qlist)
                soup = BeautifulSoup(response.content,  "html.parser")
                source_url  = soup.find(rel="canonical").get('href')
                out_frame[qlist] = source_url
            except:
                fail_frame[qlist] = ''
        else:
            fail_frame[qlist] = ''
            
    for key in out_frame.keys():
        if 'https' not in out_frame[key]:
            out_frame[key] = 'https:'+out_frame[key] 
            
    return out_frame, fail_frame


# parse content of url using Tika and get author details
def build_author_dictionary(url_frame): 
    
    # Returns dict with DOI unique key, values are author details

    author_dict = {}
    fail_author_dict = {}
    
    for key in url_frame.keys():

        author_detail_list = []
        doi_name = key
        url_name = url_frame[key]

        try:

            parsed = parser.from_file(url_name, xmlContent=True)

            first_author = parsed["metadata"]['citation_author'][0]
            co_authors = parsed["metadata"]['citation_author'][1:]
            if 'citation_author_institution' in parsed["metadata"]:
                author_institution = parsed["metadata"]['citation_author_institution'][0]
            else:
                author_institution = ''
            journal_title = parsed["metadata"]['citation_journal_title']
            publisher = parsed["metadata"]['citation_publisher']
            if 'twitter:image' in parsed["metadata"]:
                image_url = parsed["metadata"]['twitter:image']
            else:
                image_url = parsed["metadata"]['og:image']

            author_detail_list.append(first_author)
            author_detail_list.append(co_authors)
            author_detail_list.append(author_institution)
            author_detail_list.append(journal_title)
            author_detail_list.append(publisher)
            author_detail_list.append(image_url)

            author_dict[doi_name] = author_detail_list

        except:

            fail_author_dict[doi_name] = ''
            
    return author_dict, fail_author_dict

#parse researchgate using first author and extract no of publications

def build_author_pub_url(author_dict): 
    
    # Returns dict with DOI unique key and values with researchgate url and first author name
    
    pub_url_dict = {}
    fail_pub_url_dict = {}

    for key in author_dict.keys():
        author_pub_list = []
        doi_1 = key
        author_1 = author_dict[key][0]
        researchgate_url = 'https://www.researchgate.net/scientific-contributions/'+author_1

        try:        
            for j in search(researchgate_url, num = 1, stop = 1, pause = 2.0, user_agent=None, verify_ssl = True ):
                pub_url = j
            author_pub_list.append(author_1)
            author_pub_list.append(pub_url)
            pub_url_dict[doi_1] = author_pub_list

        except:
            fail_pub_url_dict[doi_1] = author_1
            
    return pub_url_dict, fail_pub_url_dict


def dict_to_df(data_dict):
    
    df = pd.DataFrame(data_dict, index =[0]).T.reset_index()
    
    return df

def getNumbers(str):
    
    array = re.findall(r'[0-9]+', str)
    
    return array

def extract_url_using_search(fail_frame_dict):
    
    fail_frame_search = {}
    fail_frame_repeat = {}
    
    for key in fail_frame_dict.keys():
        try:
            for j in search(key, num = 1, stop = 1, pause = 2.0, user_agent=None, verify_ssl = True ):
                print(j)
                fail_frame_search[key] = j
        except:
            fail_frame_repeat[key] = ''
    
    return fail_frame_search, fail_frame_repeat

In [198]:
data.columns

Index(['Authors', 'Title', 'Citation', 'DOI', 'Year', 'Month', '0', '1', '2',
       '3', 'FINDINGS', 'Reported', 'Correction Date', 'Retraction',
       'Correction', 'No Action', 'SUM  Completed', 'Day', 'Published Date'],
      dtype='object')

In [73]:
#main

data = load_data('Bik_dataset.tsv')
data = convert_datatype(data)
explore_classification(data)
print("------------------------------------------------------------")
print(data.columns)
print("------------------------------------------------------------")
top_doi = "http://www.doi.org/"
out_frame, fail_frame = extract_url_using_requests(top_doi, data)
print("Got url for ", len(out_frame), "DOI")
print("Could not get url for ", len(fail_frame), "DOI")
print("------------------------------------------------------------")
author_dict, fail_author_dict = build_author_dictionary(out_frame)
print("Got author details for ", len(author_dict), "DOI out of ", len(out_frame), "DOI")
print("Could not get author details for ", len(fail_author_dict), "DOI out of ", len(out_frame), "DOI")
print("------------------------------------------------------------")

Dataset size (214, 17)
Simple Duplication in  11.0 papers
Duplications with Repositioning in  72.0 papers
Duplications with alteration in  91.0 papers
Cuts & Beautification in  51.0 papers
How many papers reported?  212.0 papers
Index(['Authors', 'Title', 'Citation', 'DOI', 'Year', 'Month', '0', '1', '2',
       '3', 'FINDINGS', 'Reported', 'Correction Date', 'Retraction',
       'Correction', 'No Action', 'SUM  Completed', 'Day', 'Published Date'],
      dtype='object')


2022-02-26 17:02:21,694 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0053510 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plosone-article.
2022-02-26 17:02:22,220 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0053940 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plosone-article.
2022-02-26 17:02:22,675 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0057285 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plosone-article.
2022-02-26 17:02:23,268 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0062170 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plosone-article.
2022-02-26 17:02:23,794 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064904 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plosone

2022-02-26 17:02:42,934 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1003445 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plospathogens-article.
2022-02-26 17:02:43,432 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1003845 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plospathogens-article.
2022-02-26 17:02:44,032 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1003896 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plospathogens-article.
2022-02-26 17:02:44,557 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1004059 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/plospathogens-article.
2022-02-26 17:02:45,076 [MainThread  ] [INFO ]  Retrieving https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1003981 to /v

2022-02-26 17:03:23,652 [MainThread  ] [INFO ]  Retrieving https://www.nature.com/articles/nature12878 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/articles-nature12878.
2022-02-26 17:03:25,714 [MainThread  ] [INFO ]  Retrieving https://www.nature.com/articles/onc2013184 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/articles-onc2013184.
2022-02-26 17:03:27,699 [MainThread  ] [INFO ]  Retrieving https://www.nature.com/articles/onc2012182 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/articles-onc2012182.
2022-02-26 17:03:29,863 [MainThread  ] [INFO ]  Retrieving https://www.nature.com/articles/onc2013237 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/articles-onc2013237.
2022-02-26 17:03:31,842 [MainThread  ] [INFO ]  Retrieving https://www.nature.com/articles/onc2014404 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/articles-onc2014404.
2022-02-26 17:03:34,245 [MainThread  ] [INFO ]  Retrieving https://www.nature.com/articles/onc201422 to /var/folders/_8/_ck

In [194]:
def dict_to_df(data_dict):
    
    df = pd.DataFrame(data_dict, index =[0]).T.reset_index()
    
    return df

In [201]:
out_frame_df = dict_to_df(out_frame)
out_frame_df.rename( columns = {'index': 'DOI', 0: 'Paper_url'}, inplace = True)
fail_frame_df = dict_to_df(fail_frame)
fail_frame_df.rename( columns = {'index': 'DOI', 0: 'Paper_url'}, inplace = True)
fail_frame_df.to_csv('fail_frame_df.csv', index=False)

In [214]:
author_dict_df = pd.DataFrame(author_dict).T.reset_index()
author_dict_df.rename( columns = {'index': 'DOI', 0: 'First_author', 1: 'Co_authors', \
                                 2: 'First_author_affiliation_univ', \
                                 3: 'Journal_title', 4: 'Publisher', \
                                 5: 'Image_url'}, inplace = True)

#Merge out_frame_df and author_dict_df
out_frame_df = pd.merge(out_frame_df, author_dict_df, how='left', on='DOI')
out_frame_df.to_csv('out_frame_df.csv', index=False)

In [226]:
fail_frame_search, fail_frame_repeat = extract_url_using_search(fail_frame)

https://journals.asm.org/doi/10.1128/mBio.00275-11
https://journals.asm.org/doi/10.1128/mBio.00495-12
https://journals.asm.org/doi/10.1128/CVI.00252-13
https://pubmed.ncbi.nlm.nih.gov/8675338/
https://pubmed.ncbi.nlm.nih.gov/9864199/
https://journals.asm.org/doi/abs/10.1128/iai.68.1.72-79.2000
https://journals.asm.org/doi/10.1128/IAI.69.10.6131-6139.2001
https://journals.asm.org/doi/10.1128/iai.71.2.948-955.2003?permanently=true
https://journals.asm.org/doi/10.1128/IAI.71.2.766-773.2003
https://pubmed.ncbi.nlm.nih.gov/12595434/
https://journals.asm.org/doi/10.1128/IAI.72.3.1223-1229.2004
https://pubmed.ncbi.nlm.nih.gov/15731036/
https://journals.asm.org/doi/10.1128/IAI.73.3.1754-1763.2005
https://pubmed.ncbi.nlm.nih.gov/16428763/
https://journals.asm.org/doi/abs/10.1128/iai.01192-08
https://journals.asm.org/doi/10.1128/IAI.00913-08
https://journals.asm.org/doi/abs/10.1128/iai.00156-09
https://journals.asm.org/doi/abs/10.1128/IAI.00956-09
https://journals.asm.org/doi/10.1128/IAI.01203-1

In [231]:
fail_frame_author_dict, fail_frame_author_repeat_dict = build_author_dictionary(fail_frame_search)

2022-02-26 23:56:04,348 [MainThread  ] [INFO ]  Retrieving https://journals.asm.org/doi/10.1128/mBio.00275-11 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-10.1128-mbio.00275-11.
2022-02-26 23:56:04,530 [MainThread  ] [INFO ]  Retrieving https://journals.asm.org/doi/10.1128/mBio.00495-12 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-10.1128-mbio.00495-12.
2022-02-26 23:56:04,712 [MainThread  ] [INFO ]  Retrieving https://journals.asm.org/doi/10.1128/CVI.00252-13 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-10.1128-cvi.00252-13.
2022-02-26 23:56:04,862 [MainThread  ] [INFO ]  Retrieving https://pubmed.ncbi.nlm.nih.gov/8675338/ to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/8675338.
2022-02-26 23:56:05,367 [MainThread  ] [INFO ]  Retrieving https://pubmed.ncbi.nlm.nih.gov/9864199/ to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/9864199.
2022-02-26 23:56:05,764 [MainThread  ] [INFO ]  Retrieving https://journals.asm.org/doi/abs/10.1128/iai.68.1.72-7

2022-02-26 23:56:14,180 [MainThread  ] [INFO ]  Retrieving https://onlinelibrary.wiley.com/doi/10.1002/ijc.27989 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-10.1002-ijc.27989.
2022-02-26 23:56:14,353 [MainThread  ] [INFO ]  Retrieving https://onlinelibrary.wiley.com/doi/10.1002/ijc.28032 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-10.1002-ijc.28032.
2022-02-26 23:56:14,502 [MainThread  ] [INFO ]  Retrieving https://pubmed.ncbi.nlm.nih.gov/23720015/ to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/23720015.
2022-02-26 23:56:14,995 [MainThread  ] [INFO ]  Retrieving https://onlinelibrary.wiley.com/doi/abs/10.1002/ijc.28001 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-abs-10.1002-ijc.28001.
2022-02-26 23:56:15,175 [MainThread  ] [INFO ]  Retrieving https://pubmed.ncbi.nlm.nih.gov/23712470/ to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/23712470.
2022-02-26 23:56:15,499 [MainThread  ] [INFO ]  Retrieving https://pubmed.ncbi.nlm.nih.gov/22573407/ t

In [240]:
fail_frame_search_df = dict_to_df(fail_frame_search)
fail_frame_search_df.rename( columns = {'index': 'DOI', 0: 'Paper_url'}, inplace = True)
fail_frame_repeat_df = dict_to_df(fail_frame_repeat)
fail_frame_repeat_df.rename( columns = {'index': 'DOI', 0: 'Paper_url'}, inplace = True)
fail_frame_repeat_df.to_csv('fail_frame_repeat_df.csv', index=False)

In [241]:
fail_frame_author_dict_df = pd.DataFrame(fail_frame_author_dict).T.reset_index()
fail_frame_author_dict_df.rename( columns = {'index': 'DOI', 0: 'First_author', 1: 'Co_authors', \
                                 2: 'First_author_affiliation_univ', \
                                 3: 'Journal_title', 4: 'Publisher', \
                                 5: 'Image_url'}, inplace = True)

fail_frame_author_dict_df
#Merge fail_frame_search_df and fail_frame_author_dict_df
# fail_frame_search_df = pd.merge(fail_frame_search_df, fail_frame_author_dict_df, how='left', on='DOI')
# fail_frame_search_df.to_csv('fail_frame_search_df.csv', index=False)

Unnamed: 0,DOI,First_author,Co_authors,First_author_affiliation_univ,Journal_title,Publisher,Image_url
0,PMID: 8675338,C W Cutler,"[P I Eke, C A Genco, T E Van Dyke, R R Arnold]",D,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
1,PMID: 9864199,B A Wilson,"[V G Ponferrada, J E Vallance, M Ho]",D,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
2,10.1128/IAI.71.3.1209�1216.2003,Laura W Schrum,"[Ian Marriott, Betsy R Butler, Elaine K Thomas...",D,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
3,10.1128/IAI.73.3.1386-1398.2005,Peter L W Yun,"[Arthur A Decarlo, Cheryl C Chapple, Neil Hunter]",I,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
4,10.1128/IAI.74.2.1148-1155.2006,Ellen J Beswick,"[Irina V Pinchuk, Kyle Minch, Giovanni Suarez,...",D,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
5,10.1128/IAI.06230-11,Balamayooran Theivanthiran,"[Sanjay Batra, Gayathriy Balamayooran, Shansha...",L,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
6,10.1128/IAI.00063-12,Yun Sun,"[Wen-Jiang Zheng, Yong-Hua Hu, Bo-Guang Sun, L...",K,Infection and immunity,Infect Immun,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
7,PMID: 9620397,N Zhi,"[N Ohashi, Y Rikihisa, H W Horowitz, G P Worms...",D,Journal of clinical microbiology,J Clin Microbiol,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
8,10.1002/ijc.28289,Sujit K Bhutia,"[Swadesh K Das, Belal Azab, Mitchell E Menezes...",D,International journal of cancer,Int J Cancer,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...
9,10.1002/ijc.28280,Teresa Lorenzi,"[Maria Lorenzi, Emma Altobelli, Daniela Marzio...",D,International journal of cancer,Int J Cancer,https://cdn.ncbi.nlm.nih.gov/pubmed/persistent...


In [244]:
fail_frame_search

{'10.1128/mBio.00275-11': 'https://journals.asm.org/doi/10.1128/mBio.00275-11',
 '10.1128/mBio.00495-12': 'https://journals.asm.org/doi/10.1128/mBio.00495-12',
 '10.1128/mBio.00252-13': 'https://journals.asm.org/doi/10.1128/CVI.00252-13',
 'PMID: 8675338': 'https://pubmed.ncbi.nlm.nih.gov/8675338/',
 'PMID: 9864199': 'https://pubmed.ncbi.nlm.nih.gov/9864199/',
 '10.1128/IAI.68.1.72-79.2000': 'https://journals.asm.org/doi/abs/10.1128/iai.68.1.72-79.2000',
 '10.1128/IAI.69.10.6131�6139.2001': 'https://journals.asm.org/doi/10.1128/IAI.69.10.6131-6139.2001',
 '10.1128/IAI.71.2.948-955.2003': 'https://journals.asm.org/doi/10.1128/iai.71.2.948-955.2003?permanently=true',
 '10.1128/IAI.71.2.766�773.2003': 'https://journals.asm.org/doi/10.1128/IAI.71.2.766-773.2003',
 '10.1128/IAI.71.3.1209�1216.2003': 'https://pubmed.ncbi.nlm.nih.gov/12595434/',
 '10.1128/IAI.72.3.1223-1229.2004': 'https://journals.asm.org/doi/10.1128/IAI.72.3.1223-1229.2004',
 '10.1128/IAI.73.3.1386-1398.2005': 'https://pubm

In [None]:
fail_frame_author_dict_df['I']

In [258]:
parsed = parser.from_file('https://journals.asm.org/doi/10.1128/mBio.00275-11', xmlContent=True)
parsed

2022-02-27 00:43:19,580 [MainThread  ] [INFO ]  Retrieving https://journals.asm.org/doi/10.1128/mBio.00275-11 to /var/folders/_8/_ckgy86x1bn51h7fw8g1xtqm0000gn/T/doi-10.1128-mbio.00275-11.


HTTPError: HTTP Error 503: Service Temporarily Unavailable

In [110]:
pub_url = pd.read_csv('author_researchgate_url.csv', header=0)
print("Dataset size", pub_url.shape)
str1='https://www.researchgate.net/scientific-contributions/'
pub_url['author_url'].head(5)

Dataset size (45, 1)


0    https://www.researchgate.net/scientific-contri...
1    https://www.researchgate.net/scientific-contri...
2    https://www.researchgate.net/scientific-contri...
3    https://www.researchgate.net/scientific-contri...
4    https://www.researchgate.net/scientific-contri...
Name: author_url, dtype: object

In [148]:
pub_url.author_url.str[54:]

0     Inka-Regine-Weingaertner-2004447803
1              Jessica-M-Esparza-12186129
2           Sreedevi-Avasarala-2003066677
3            Rounak-Nassirpour-2066194593
4                    Liu-Jianguo-78199223
5                    Yang-Wang-2190596776
6         Magdalene-Papadopoulos-31365377
7              Milo-J-Aukerman-2108843098
8                       Jia-MA-2090248218
9                   Orit-Reish-2089649981
10                   Yan-ZHANG-2152498622
11                Zhongpu-Chen-2007426692
12                    2129773850-Fang-Liu
13               Jagannath-Misra-58954791
14                     N-G-Fowler-3362179
15                  Zhenni-Zhang-71321492
16              Matthias-Groszer-48055760
17                      2046271046-S-Flis
18                 Yume-Nohara-2042875504
19                   J-Stiefel-2044813130
20                    Chuan-Li-2102931660
21              Gail-Burnaford-2012225415
22               Prabuddha-Dey-2121895329
23                  Xiaomei-Ge-211

In [219]:
pub_url.shape

(45, 2)

In [220]:
pub_url['research works'] =''
pub_url['citations'] =''
pub_url['reads'] =''
for i in range(pub_url.shape[0]):
    print(pub_url['author_url'][i])
    try:
        
        response = requests.get(url = pub_url['author_url'][i])
        soup = BeautifulSoup(response.content,  "html.parser")
        str1_desc = soup.find("meta", property="og:description").get("content")
        array = getNumbers(str1_desc)
        pub_url['research works'][i] = array[0]
        pub_url['citations'][i] = array[1]
        pub_url['reads'][i] = array[2]
        
    except:
        
        print('Error with ', pub_url['author_url'][i] )

https://www.researchgate.net/scientific-contributions/Inka-Regine-Weingaertner-2004447803
https://www.researchgate.net/scientific-contributions/Jessica-M-Esparza-12186129
https://www.researchgate.net/scientific-contributions/Sreedevi-Avasarala-2003066677
https://www.researchgate.net/scientific-contributions/Rounak-Nassirpour-2066194593
https://www.researchgate.net/scientific-contributions/Liu-Jianguo-78199223
Error with  https://www.researchgate.net/scientific-contributions/Liu-Jianguo-78199223
https://www.researchgate.net/scientific-contributions/Yang-Wang-2190596776
Error with  https://www.researchgate.net/scientific-contributions/Yang-Wang-2190596776
https://www.researchgate.net/scientific-contributions/Magdalene-Papadopoulos-31365377
https://www.researchgate.net/scientific-contributions/Milo-J-Aukerman-2108843098
https://www.researchgate.net/scientific-contributions/Jia-MA-2090248218
https://www.researchgate.net/scientific-contributions/Orit-Reish-2089649981
Error with  https://www

In [254]:
pub_url

Unnamed: 0,author_url,research works,citations,reads
0,https://www.researchgate.net/scientific-contri...,3.0,12.0,326.0
1,https://www.researchgate.net/scientific-contri...,13.0,189.0,656.0
2,https://www.researchgate.net/scientific-contri...,24.0,759.0,3.0
3,https://www.researchgate.net/scientific-contri...,9.0,349.0,1.0
4,https://www.researchgate.net/scientific-contri...,5.0,4.0,38.0
5,https://www.researchgate.net/scientific-contri...,,,
6,https://www.researchgate.net/scientific-contri...,4.0,149.0,123.0
7,https://www.researchgate.net/scientific-contri...,10.0,2.0,779.0
8,https://www.researchgate.net/scientific-contri...,5.0,2.0,46.0
9,https://www.researchgate.net/scientific-contri...,4.0,38.0,217.0


In [261]:
# parsed = parser.from_file(pub_url['author_url'][0], xmlContent=True)
response = requests.get(url = pub_url['author_url'][12],\
                       headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
                       AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
soup = BeautifulSoup(response.content,  "html.parser")
# print(soup)
str1_desc = soup.find("meta", property="og:description").get("content")
array = getNumbers(str1_desc)
print(str1_desc)
print(array)
pub_url['research works'][12] = array[0]
pub_url['citations'][12] = array[1]
pub_url['reads'][12] = array[2]

AttributeError: 'NoneType' object has no attribute 'get'

In [85]:
print(len(out_frame), len(fail_frame))
print(len(author_dict), len(fail_author_dict))
pub_url_dict, fail_pub_url_dict = build_author_pub_url(author_dict)
print("Got author publication from researchgate for ", len(pub_url_dict), "DOI out of ", len(author_dict), "DOI")
print("Could not get author publication from researchgate for ", \
      len(fail_pub_url_dict), "DOI out of ", len(author_dict), "DOI")
print("------------------------------------------------------------")

89 125
86 3
Got author publication from researchgate for  0 DOI out of  86 DOI
Could not get author publication from researchgate for  86 DOI out of  86 DOI
------------------------------------------------------------
