In [1]:
from collections import defaultdict
from inspect import getsourcefile
import litstudy #Use pip install git+https://github.com/NLeSC/litstudy to download dev version. Other encoding problem when loading ris files (load_ris_file needs to use robust_open instead of open)
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import re
import shutil
from pyzotero import zotero

#Info: tags from WoS https://images.webofknowledge.com/images/help/WOS/hs_wos_fieldtags.html

#Project layout: every path below hangs off the parent of the current working directory.
#NOTE(review): this assumes the notebook is launched from a direct subfolder of the project root -- confirm.
rootdir = os.path.dirname(os.getcwd())
srcdir = Path(rootdir) / 'src'
resdir = Path(rootdir) / "results"
scopdir = Path(rootdir) / "data" / "scoping"

#Subdirectory where the files exported from the WoS search were saved
scoping_3_datdir = scopdir / 'scoping_3_wos'

#Pickle where the loaded document set will be saved
scoping_3_docset_pickle = scoping_3_datdir / 'scoping_3_docset.pickle'

#csv table where each reference's title, journal, year and DOI are written
reflist_csv = scoping_3_datdir / 'scoping_3_shortmetadata.csv'

ModuleNotFoundError: No module named 'pyzotero'

In [157]:
#~~~~~~~~~~~~~ Read and compile references from the WoS search ~~~~~~~~~~~~~~~~~~
#into a single document set (lit_study format)

if not scoping_3_docset_pickle.exists():
    #Get list of every bib file (savedrecs.bib, savedrecs(1).bib, ...).
    #The pattern is a raw string (no invalid "\(" escape), compiled once, and matched against
    #the whole file name so that e.g. "savedrecs.bib.bak" is not picked up.
    bib_pattern = re.compile(r".*savedrecs(\([0-9]{1,2}\))*[.]bib")
    #Sort so that the order of the references (and hence the row index later written to csv)
    #does not depend on the order in which the file system lists the files
    bib_initlist = sorted(p for p in scoping_3_datdir.glob('*')
                          if bib_pattern.fullmatch(p.name))
    if not bib_initlist:
        #Fail loudly rather than caching an empty document set that would be silently reused
        raise FileNotFoundError(f"No savedrecs*.bib file found in {scoping_3_datdir}")

    #Read the bib files of this scoping and join them (takes ~15-20 sec/1000 refs)
    reflist = []
    for bib in bib_initlist:
        reflist += litstudy.load_bibtex(bib)

    #Pickle them (save the full document set as a binary file on disk that can be easily retrieved)
    with open(scoping_3_docset_pickle, 'wb') as f:
        pickle.dump(reflist, f)
else:
    #Read pre-saved document set
    #NOTE: pickle.load can execute arbitrary code -- only load a pickle written by this notebook
    with open(scoping_3_docset_pickle, 'rb') as f:
        reflist = pickle.load(f)

1000

In [158]:
#~~~~~~~~~~~~~ Get titles and DOIs from test list ~~~~~~~~~~~~~~~
#The API key is read from a local text file so that it is not hard-coded in the notebook
api_key = Path(srcdir, 'zotero_key.txt').read_text().strip()
zot = zotero.Zotero(library_id = '4842799', library_type = 'group', api_key = api_key)

#Key of the top-level collection named 'test list'; stop with an explicit message if it is absent
testlist_colID = next((str(col['key']) for col in zot.collections_top()
                       if col['data']['name'] == 'test list'), None)
if testlist_colID is None:
    raise LookupError("No top-level Zotero collection named 'test list' was found")
testlist_items = zot.everything(zot.collection_items_top(testlist_colID))

#Map each Zotero item key to [title, DOI]; the DOI is np.nan when it is missing or blank
testlist_title_dois = defaultdict(list)
for ref in testlist_items:
    ref_data = ref['data']
    #Some item types may come without a 'title' field; store '' so that later .lower() calls still work
    title = ref_data.get('title') or ''
    doi = (ref_data.get('DOI') or '').strip()
    #Assign (rather than append) so that a key seen twice still maps to exactly [title, DOI]
    testlist_title_dois[ref['key']] = [title, doi if doi else np.nan]

In [159]:
#~~~~~~~~~~~~~~ Write basic metadata from reference list to csv ~~~~~~~~~~~~~~~~~~

#Get all dois and titles in references returned from search
if not reflist_csv.exists():
    #One record per reference: title (lower-cased, line breaks replaced by spaces), source,
    #year and DOI (np.nan when the bib entry has no 'doi' field)
    reflist_records = [
        [ref.title.replace('\n', ' ').lower(),
         ref.publication_source,
         ref.publication_year,
         ref.entry['doi'] if 'doi' in ref.entry else np.nan]
        for ref in reflist
    ]
    #Passing the column names here also yields a well-formed (empty) table when reflist is empty
    reflist_pd = pd.DataFrame(reflist_records, columns=['title', 'source', 'year', 'doi'])

    reflist_pd.to_csv(reflist_csv)
else:
    #index_col=0: the first csv column is the row index written by to_csv above. Reading it back
    #as the index gives the same columns as the freshly built table (no extra 'Unnamed: 0' column)
    reflist_pd = pd.read_csv(reflist_csv, index_col=0)

In [160]:
#~~~~~~~~~~~~~~ Check which items in test list were retrieved through the search ~~~~~~~
returned = {}
not_returned = {}

#Build the lookup sets once instead of once per test-list item.
#Missing values are dropped: a NaN placed in a set can be "found" by identity, which would flag
#every test-list item without DOI as returned as soon as one reference also lacks a DOI.
#DOIs are compared in lower case because they are case-insensitive.
reflist_titles = set(reflist_pd.title.dropna())
reflist_dois = set(reflist_pd.doi.dropna().astype(str).str.strip().str.lower())

for k, v in testlist_title_dois.items():
    title, doi = v[0], v[1]
    #Only a non-empty title / a real DOI string can count as a match
    title_found = bool(title) and title.lower() in reflist_titles
    doi_key = doi.strip().lower() if isinstance(doi, str) else ''
    doi_found = bool(doi_key) and doi_key in reflist_dois

    if title_found or doi_found:
        returned[k] = title
    else:
        not_returned[k] = title

In [163]:
#Display the test-list references (Zotero key -> title) that were not found among the search results
not_returned

{'NQE4IR8E': 'Development of recommended flow targets to support biological integrity based on regional flow-ecology relationships for benthic macroinvertebrates in Southern California streams',
 '6D7DKZTE': 'Flow recommendations for the tributaries of the Great Lakes in New York and Pennsylvania',
 'BD4MS6ET': 'Application of the Instream Flow Incremental Methodology to conservation flow for freshwater fishes in Japan',
 'MRZD6NI7': 'Environmental flow requirements of the Brisbane River downstream from Wivenhoe Dam',
 'VA65CY5L': 'Responses of biofilms to cyclic releases during a low flow period in the Mitta Mitta River, Victoria, Australia',
 'FZ78KPH2': 'A preliminary assessment of the relationship between angling quality and flow on the Lower Malmesbury Avon',
 'E5GD6SBW': 'Evaluation of instream flow methodologies for fisheries in Nebraska',
 '5ABFNPLW': 'Effects of experimental ramping rate of invertebrate community of a regulated river',
 '78R3U4TV': 'Evaluating fine sediment mo

In [None]:
# - Compare results to test list 
# - Sample X% of returns to assess specificity and sensitivity with preliminary eligibility criteria 
# - (for Scopus and OpenAlex): for each retrieved article, identify the search terms that retrieved it. This will allow us to remove search terms that are 100% redundant and to identify those that lead to low specificity 
# - Individually investigate why each article from the test list was not retrieved so that the search strategy can be adjusted 
# - Go through 100 articles for each database simply to test sensitivity of each database (because OpenAlex, WoS, Scopus and GS rely on such different algorithms and searching strategies) and refine strings if needed for each. Each one of us will screen 75 of these articles for each database with 50 articles in common for each database. 

In [None]:
"""
#Extra stuff for ris
#Remove total times cited and cited reference count lines, as well as line breaks within tags
#for each citation because it crashes litstudy
ris_initlist_edit = []
for ris in ris_initlist:
    ris_edit = Path(f"{ris.with_suffix('')}_edit.ris")
    ris_initlist_edit.append(ris_edit)
    if not ris_edit.exists():
        with open(ris, 'r', encoding='utf-8') as f_in:
            ris_txt = f_in.read()
        ris_txt_edit = re.sub('(Total Times Cited:\\s\\s[0-9]*\nCited Reference Count:\\s\\s[0-9]*\n)|(\n\\s)',
                              '', ris_txt)   
        
        with open(ris_edit, 'w', encoding='utf-8') as f_out:
            f_out.write(ris_txt_edit)
"""