In [1]:
from inspect import getsourcefile
import litstudy #Use pip install git+https://github.com/NLeSC/litstudy to download dev version. Other encoding problem when loading ris files (load_ris_file needs to use robust_open instead of open)
import os
import pandas as pd
from pathlib import Path
import re
import shutil

#Info: tags from WoS https://images.webofknowledge.com/images/help/WOS/hs_wos_fieldtags.html

#Resolve the project folders relative to the parent of the current working directory
rootdir = os.path.split(os.getcwd())[0]  #kept as a plain string
resdir = Path(rootdir) / "results"
scopdir = Path(rootdir) / "data" / "scoping"

#Folder holding the tab-delimited files saved from the wos outputs
scoping_2_datdir = scopdir / 'scoping_2_wos'


In [2]:
#Get list of every tab-delimited file whose path looks like ".../savedrecs(<number>).txt"
#The pattern is compiled once instead of once per candidate path
savedrecs_pattern = re.compile(".*savedrecs\\([0-9]+\\)[.]txt")
#Sort the paths: glob order depends on the file system, and the concatenated row index follows file order.
#The sort is lexicographic ("savedrecs(10)" comes before "savedrecs(2)"), which is still deterministic.
tab_initlist = sorted(p for p in scoping_2_datdir.glob('*')
                      if savedrecs_pattern.match(str(p)))
#Fail with an explicit message rather than pandas' "No objects to concatenate"
if not tab_initlist:
    raise FileNotFoundError(f"No 'savedrecs(<number>).txt' file found in {scoping_2_datdir}")

#Read every file as a pandas DataFrame and stack them into a single table with a fresh 0..n-1 index
wos_initlist = pd.concat((pd.read_csv(f, sep='\t') for f in tab_initlist), ignore_index=True)


In [3]:
#Divide wos categories (which are all initially contained in a single column, WC) into multiple columns.
#The WC string is split on ";" and "&"; the resulting columns are named wos_cat0, wos_cat1, ...
wos_cat_split = wos_initlist.WC.str.split(pat=re.compile("[;]|[&]"), expand=True)
wos_cat_split.columns = [f'wos_cat{i}' for i in range(wos_cat_split.shape[1])]
wos_initlist_cats = pd.concat([wos_initlist, wos_cat_split], axis=1)

#Randomly sample 10 articles in list for each wos category (each document can have multiple wos category)
#https://stackoverflow.com/questions/67871215/groupby-sample-pandas-with-keeping-the-groups-lower-than-n-if-applicable
catsamplecsv_path = Path(scopdir, 'wos_search1_catsample.csv')
#The sample is only drawn when the csv does not exist yet, so a manually annotated file is never overwritten
if not catsamplecsv_path.exists():
    #Fixed seed so that the same sample is drawn again if the csv ever has to be regenerated
    catsample_seed = 42
    #Shuffle all rows, then keep the first 10 of each group: groups with fewer than 10 rows are kept whole
    #NOTE(review): columns are numbered from 0, so 'wos_cat1' is the second category slot; rows with no value
    #in that column presumably fall in no group (groupby drops missing keys by default) -- confirm whether
    #'wos_cat0' was intended
    wos_initlist_catsample = (
        wos_initlist_cats
        .sample(frac=1, random_state=catsample_seed)
        .groupby('wos_cat1')
        .head(10)
    )
    #Write to the path checked above; to_csv() without a path only returns the csv text and saves nothing
    wos_initlist_catsample.to_csv(catsamplecsv_path)

#######Based on 'wos_search1_catsample.csv', go through the title and abstract of every reference and include or exclude
#based on preliminary eligibility criteria.

In [4]:
#Compute the number of included and excluded references for every wos category, a document being counted
#once under each of its categories
#Format as a table to be examined
wos_initlist_catsample_format = pd.read_csv(catsamplecsv_path)

#Long format: one row per document and category column
#'Unnamed: 0' is the name pandas gives to a header-less first csv column (presumably the saved row index);
#'Inclusion' is presumably the column filled in during the manual screening -- confirm against the csv
wos_catsample_long = pd.melt(
    frame=wos_initlist_catsample_format,
    id_vars=['Unnamed: 0', 'Inclusion'],
    value_vars=list(wos_cat_split.columns),
    value_name='wos_catnum',
)

#Remove leading/trailing spaces left around category names by the split on ";" and "&"
wos_catsample_long['wos_catnum'] = wos_catsample_long['wos_catnum'].replace(r"^ +| +$", r"", regex=True)

#Count documents per (category, inclusion value); one row per category and one column per inclusion value.
#fill_value=0 keeps the counts as integers when a category lacks one of the values.
inclusion_counts = (
    wos_catsample_long
    .groupby(['wos_catnum', 'Inclusion'])
    .size()
    .unstack('Inclusion', fill_value=0)
)
#Guarantee both outcome columns, otherwise 'excluded' or 'included' is missing (KeyError below) when every
#sampled reference received the same decision
for outcome in (0, 1):
    if outcome not in inclusion_counts.columns:
        inclusion_counts[outcome] = 0

#Inclusion is coded 0 = excluded, 1 = included
wos_catsample_stats = (
    inclusion_counts
    .rename(columns={0: "excluded", 1: "included"})
    .rename_axis(columns=None)
    .reset_index()
)
wos_catsample_stats['N'] = wos_catsample_stats['excluded'] + wos_catsample_stats['included']

#Categories with the most included references first, ties broken by total number of references
wos_catsample_stats.sort_values(['included', 'N'], ascending=[False, False]).\
    to_csv(Path(scopdir, 'wos_search1_catsample_stats.csv'))