In [1]:
import numpy as np
import pandas as pd
import scipy

from IPython.display import clear_output

import sys
sys.path.append('../../../../Documents/GitHub/gustav/src/')

from gustav import ebi, ncbi, nlm, biogrid, nih, openalex
from gustav import publications
from gustav import github
from gustav import access_framework
from gustav import mapper

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../general/src/')
from manuscript import export
from manuscript import inout
from manuscript import datasets
from manuscript import tools

from sklearn.metrics import auc
from scipy.stats import fisher_exact
pd.options.display.precision = 3
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = 20

import gc
import re
import json

In [2]:
import requests
import random
from bs4 import BeautifulSoup
import json
import re
import time

In [61]:
from bs4 import BeautifulSoup
import lxml
import os
import gc
from datetime import datetime

# All of Hindawi download on April 4, 2024

In [4]:
with open('../data/allofhindawi/2024/HINDAWI.ACISC/6671359-2024-01-31.xml', encoding='utf8') as f:
    text = f.read()

In [21]:
import os
all_of_hindawi = os.listdir(r"C:/Users/richa/Documents/GitHub/paper_mill_screen/data/allofhindawi/2008")

In [57]:
allofhindawi = []
for year in range(2008, 2025):
    year = str(year)
    journals = os.listdir(r"C:/Users/richa/Documents/GitHub/paper_mill_screen/data/allofhindawi/" + year)
    for journal in journals:
        items = os.listdir(r"C:/Users/richa/Documents/GitHub/paper_mill_screen/data/allofhindawi/" + year +'/'+ journal  + '/')
        for item in items:
            full_filename = year +'/'+ journal  + '/' + item
            allofhindawi.append(full_filename)

In [68]:
%%time
content_type_array = []
contrib_df_array = []
handling_time_array = []
aff_array = []
subj_array = []
filename_array = []

count = 0
for filename in allofhindawi[count:]:
    
    filepath = '../data/allofhindawi/' + filename
    
    with open(filepath, encoding='utf8') as f:
        text = f.read()
        
    file = BeautifulSoup(text, 'lxml')
    
    # doi
    try:
        doi = file.find('article-id', attrs={'pub-id-type':'doi'}).text
    except:
        doi = ''
    
    try:
        filename_array.append(pd.DataFrame({'doi':[doi], 'filename':[filename]}))
    except:
        pass
    
    # content type
    try:
        content_type = file.find('subject').text
        content_type_array.append(pd.DataFrame({'doi':[doi], 'content_type':[content_type]}))
    except:
        pass
    
    # contrib
    for contrib in file.find_all('contrib'):
        try:
            contrib_type = contrib['contrib-type']
            if contrib.find('name'):
                if contrib.find('name').find('surname'):
                    surname = contrib.find('name').find('surname').text
                else:
                    surname = ''
                if contrib.find('name').find('given-names'):
                    given_name = contrib.find('name').find('given-names').text
                else:
                    given_name = ''
            if contrib.find('contrib-id', {'contrib-id-type':'orcid'}):
                orcid = contrib.find('contrib-id', {'contrib-id-type':'orcid'}).text
            else:
                orcid = ''
                
            contrib_df_array.append(pd.DataFrame({'doi':[doi], 'contrib':[given_name + ' ' + surname], 
                                                  'contrib_type':[contrib_type], 'orcid':[orcid]}))
            for aff_id in contrib.find_all('xref', {'ref-type':'aff'}):
                rid = aff_id['rid']
                contrib_df_array.append(pd.DataFrame({'doi':[doi], 'contrib':[given_name + ' ' + surname], 
                                                      'contrib_type':[contrib_type], 'orcid':[orcid], 'aff_id':[rid]}))
        except:  
            pass
    
    # affiliations
    for aff in file.find_all('aff'):
        try:
            aff_id = aff['id']
            #address = aff.find('addr-line').text
            address_lines = aff.find_all('addr-line', text=True)
            address = ''
            for address_line in address_lines:
                address = address + address_line.text + ', '
            if aff.find('country'):
                address = address + aff.find('country').text
            aff_array.append(pd.DataFrame({'doi':[doi], 'aff_id':[aff_id], 'address':[address]}))
        except:
            pass
    
    # handling dates   
    for date in file.find_all('date'):
        try:
            date_type = date['date-type']
            day = date.find('day').text
            month = date.find('month').text
            year = date.find('year').text
            handling_time_array.append(pd.DataFrame({'doi':[doi], 'date_type':[date_type], 'day':[day], 'month':[month], 'year':[year]}))
        except:
            pass
        
    for date in file.find_all('pub-date', {'pub-type':'epub'}):
        try:
            date_type = 'Published'
            day = date.find('day').text
            month = date.find('month').text
            year = date.find('year').text
            handling_time_array.append(pd.DataFrame({'doi':[doi], 'date_type':[date_type], 'day':[day], 'month':[month], 'year':[year]}))
        except:
            pass
    
    # subjects
    for subj in file.find_all('subject'):
        try:
            subject = subj.text
            subj_array.append(pd.DataFrame({'doi':[doi], 'subject':[subject]}))
        except:
            pass
        
    file.decompose()
    del file
    count += 1
    if count % 1000 == 0:
        clear_output()
        print(count)
        gc.collect()
        with open('../data/230404_hindawi_datetime.txt', 'a+') as f:
            now = datetime.now()
            f.write(now.strftime("%H:%M:%S") + '\n')
            
        if len(handling_time_array) > 0:
            handling_time_df = pd.concat(handling_time_array)
            handling_time_df.to_csv('../data/240404_hindawi_handling_time_' + str(count) + '.csv', index=False)
        if len(contrib_df_array) > 0:
            contrib_df = pd.concat(contrib_df_array)
            contrib_df.to_csv('../data/240404_hindawi_contributors_' + str(count) + '.csv', index=False)
        if len(content_type_array) > 0:
            content_type_df = pd.concat(content_type_array)
            content_type_df.to_csv('../data/240404_hindawi_content_type_' + str(count) + '.csv', index=False)
        if len(aff_array) > 0:
            aff_df = pd.concat(aff_array)
            aff_df.to_csv('../data/240404_hindawi_aff_' + str(count) + '.csv', index=False)
        if len(subj_array) > 0:
            subj_df = pd.concat(subj_array)
            subj_df.to_csv('../data/240404_hindawi_subj_' + str(count) + '.csv', index=False)
        if len(filename_array) > 0:
            filename_df = pd.concat(filename_array)
            filename_df.to_csv('../data/240404_hindawi_filenames_' + str(count) + '.csv', index=False)
            
        content_type_array = []
        contrib_df_array = []
        handling_time_array = []
        aff_array = []
        subj_array = []
        filename_array = []  
            
if len(handling_time_array) > 0:
    handling_time_df = pd.concat(handling_time_array)
    handling_time_df.to_csv('../data/240404_hindawi_handling_time_' + str(count) + '.csv', index=False)
if len(contrib_df_array) > 0:
    contrib_df = pd.concat(contrib_df_array)
    contrib_df.to_csv('../data/240404_hindawi_contributors_' + str(count) + '.csv', index=False)
if len(content_type_array) > 0:
    content_type_df = pd.concat(content_type_array)
    content_type_df.to_csv('../data/240404_hindawi_content_type_' + str(count) + '.csv', index=False)
if len(aff_array) > 0:
        aff_df = pd.concat(aff_array)
        aff_df.to_csv('../data/240404_hindawi_aff_' + str(count) + '.csv', index=False)
if len(subj_array) > 0:
    subj_df = pd.concat(subj_array)
    subj_df.to_csv('../data/240404_hindawi_subj_' + str(count) + '.csv', index=False)
if len(filename_array) > 0:
    filename_df = pd.concat(filename_array)
    filename_df.to_csv('../data/240404_hindawi_filenames_' + str(count) + '.csv', index=False)
            
content_type_array = []
contrib_df_array = []
handling_time_array = []
aff_array = []
subj_array = []
filename_array = []

334000
CPU times: total: 4h 39min 4s
Wall time: 5h 2min 44s


In [63]:
file.find('article-id', attrs={'pub-id-type':'doi'})

<article-id pub-id-type="doi">10.1155/2012/102423</article-id>

In [76]:
len(allofhindawi)

334000

In [77]:
count

334000