In [7]:
import numpy as np
import pandas as pd
import scipy

from IPython.display import clear_output

import sys
sys.path.append('../../../../Documents/GitHub/gustav/src/')

from gustav import ebi, ncbi, nlm, biogrid, nih, openalex
from gustav import publications
from gustav import github
from gustav import access_framework
from gustav import mapper

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../general/src/')
from manuscript import export
from manuscript import inout
from manuscript import datasets
from manuscript import tools

from sklearn.metrics import auc
from scipy.stats import fisher_exact
# Notebook-wide pandas display settings: show floats with 3 decimal places,
# do not wrap wide DataFrame reprs across several lines, and display at most
# 20 columns before truncating.
pd.options.display.precision = 3
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = 20

import re

# #allofplos
Downloaded Nov 8 2023
https://plos.org/text-and-data-mining/

In [8]:
from bs4 import BeautifulSoup
import lxml
import os
import gc

In [17]:
import os
# List the corpus through the same relative directory the parsing cell opens
# files from ('../data/allofplos/'), instead of a user-specific absolute path,
# so the notebook runs on any checkout. os.listdir returns entries in arbitrary
# order; sorting makes the processing order (and therefore any resume offset
# into this list) reproducible between runs.
all_of_plos = sorted(os.listdir('../data/allofplos'))

In [19]:
# Number of directory entries returned by os.listdir for the allofplos folder.
len(all_of_plos)

345710

In [36]:
%%time
content_type_array = []
contrib_df_array = []
handling_time_array = []
aff_array = []
subj_array = []

count = 0
for filename in all_of_plos[count:]:
    
    filepath = '../data/allofplos/' + filename
    
    with open(filepath, encoding='utf8') as f:
        text = f.read()
        
    file = BeautifulSoup(text, 'lxml')
    
    # doi
    doi = file.find('article-id', attrs={'pub-id-type':'doi'}).text
    
    # content type
    content_type = file.find('subject').text
    content_type_array.append(pd.DataFrame({'doi':[doi], 'content_type':[content_type]}))
    
    # contrib
    for contrib in file.find_all('contrib'):
        try:
            contrib_type = contrib['contrib-type']
            if contrib.find('name'):
                if contrib.find('name').find('surname'):
                    surname = contrib.find('name').find('surname').text
                else:
                    surname = ''
                if contrib.find('name').find('given-names'):
                    given_name = contrib.find('name').find('given-names').text
                else:
                    given_name = ''
            if contrib.find('contrib-id', {'contrib-id-type':'orcid'}):
                orcid = contrib.find('contrib-id', {'contrib-id-type':'orcid'}).text
            else:
                orcid = ''
                
            contrib_df_array.append(pd.DataFrame({'doi':[doi], 'contrib':[given_name + ' ' + surname], 
                                                  'contrib_type':[contrib_type], 'orcid':[orcid]}))
            for aff_id in contrib.find_all('xref', {'ref-type':'aff'}):
                rid = aff_id['rid']
                contrib_df_array.append(pd.DataFrame({'doi':[doi], 'contrib':[given_name + ' ' + surname], 
                                                      'contrib_type':[contrib_type], 'orcid':[orcid], 'aff_id':[rid]}))
        except:  
            pass
    
    # affiliations
    for aff in file.find_all('aff'):
        try:
            aff_id = aff['id']
            address = aff.find('addr-line').text
            aff_array.append(pd.DataFrame({'doi':[doi], 'aff_id':[aff_id], 'address':[address]}))
        except:
            pass
    
    # handling dates   
    for date in file.find_all('date'):
        try:
            date_type = date['date-type']
            day = date.find('day').text
            month = date.find('month').text
            year = date.find('year').text
            handling_time_array.append(pd.DataFrame({'doi':[doi], 'date_type':[date_type], 'day':[day], 'month':[month], 'year':[year]}))
        except:
            pass
        
    for date in file.find_all('pub-date', {'pub-type':'epub'}):
        try:
            date_type = 'Published'
            day = date.find('day').text
            month = date.find('month').text
            year = date.find('year').text
            handling_time_array.append(pd.DataFrame({'doi':[doi], 'date_type':[date_type], 'day':[day], 'month':[month], 'year':[year]}))
        except:
            pass
    
    # subjects
    for subj in file.find_all('subject'):
        try:
            subject = subj.text
            subj_array.append(pd.DataFrame({'doi':[doi], 'subject':[subject]}))
        except:
            pass
        
    file.decompose()
    del file
    count += 1
    if count % 1000 == 0:
        clear_output()
        print(count)
        gc.collect()
        with open('../data/230325_plos_datetime.txt', 'a+') as f:
            now = datetime.now()
            f.write(now.strftime("%H:%M:%S") + '\n')
            
        if len(handling_time_array) > 0:
            handling_time_df = pd.concat(handling_time_array)
            handling_time_df.to_csv('../data/231109_plos_handling_time_' + str(count) + '.csv', index=False)
        if len(contrib_df_array) > 0:
            contrib_df = pd.concat(contrib_df_array)
            contrib_df.to_csv('../data/231109_plos_contributors_' + str(count) + '.csv', index=False)
        if len(content_type_array) > 0:
            content_type_df = pd.concat(content_type_array)
            content_type_df.to_csv('../data/231109_plos_content_type_' + str(count) + '.csv', index=False)
        if len(aff_array) > 0:
            aff_df = pd.concat(aff_array)
            aff_df.to_csv('../data/231109_plos_aff_' + str(count) + '.csv', index=False)
        if len(subj_df) > 0:
            subj_df = pd.concat(subj_array)
            subj_df.to_csv('../data/231109_plos_subj_' + str(count) + '.csv', index=False)
            
        content_type_array = []
        contrib_df_array = []
        handling_time_array = []
        aff_array = []
        subj_array = []
        
        
            
if len(handling_time_array) > 0:
    handling_time_df = pd.concat(handling_time_array)
    handling_time_df.to_csv('../data/231109_plos_handling_time_' + str(count) + '.csv', index=False)
if len(contrib_df_array) > 0:
    contrib_df = pd.concat(contrib_df_array)
    contrib_df.to_csv('../data/231109_plos_contributors_' + str(count) + '.csv', index=False)
if len(content_type_array) > 0:
    content_type_df = pd.concat(content_type_array)
    content_type_df.to_csv('../data/231109_plos_content_type_' + str(count) + '.csv', index=False)
if len(aff_array) > 0:
        aff_df = pd.concat(aff_array)
        aff_df.to_csv('../data/231109_plos_aff_' + str(count) + '.csv', index=False)
if len(subj_df) > 0:
    subj_df = pd.concat(subj_array)
    subj_df.to_csv('../data/231109_plos_subj_' + str(count) + '.csv', index=False)
            
content_type_array = []
contrib_df_array = []
handling_time_array = []
aff_array = []
subj_array = []

345000
CPU times: total: 13.5 s
Wall time: 15.7 s
