# Configure

In [35]:
# Import libraries for loading data, analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from os.path import join
import os

In [32]:
# Directory the notebook is executed from (the code directory)
path = Path.cwd()
# The project directory is the parent of the code directory
ROOT_DIR = path.parent.absolute()
# Data directories under <project>/data: raw, interim and processed
r_fr, r_fi, r_fp = (
    join(ROOT_DIR, 'data', stage)
    for stage in ('raw', 'interim', 'processed')
)


# Get list of journals for review

In [23]:
# Read the filtered journal list (50 journals) from the interim data directory
jcr_filtered_csv = join(r_fi, 'JCR_SCIE_Filtered.csv')
j_df = pd.read_csv(jcr_filtered_csv)

In [24]:
# Step 1: remove journals that are mostly reviews, perspectives,
# comments or opinions. The last column of the table holds that flag;
# every row whose flag is not True (including missing values) is kept.
drop_col = j_df.columns[-1]
not_flagged = j_df[drop_col].ne(True)
j_keep = j_df.loc[not_flagged]

In [25]:
# Step 2: keep a single journal per publisher. Only the first row for
# each publisher is retained; this relies on the table being sorted from
# top to bottom by 2022 JIF, so the first row is the highest-JIF journal.
j_final = (
    j_keep
    .drop_duplicates(subset=['Publisher'], keep='first')
    .reset_index(drop=True)
)

In [26]:
# Report how many journals remain after both filtering steps
print(f'Number of Journals to Search: {len(j_final)}')

Number of Journals to Search: 18


In [27]:
# Build a single identifier column for the search: use the ISSN when it
# is present and fall back to the eISSN when the ISSN is missing
issn_missing = j_final['ISSN'].isna()
j_final['IS'] = np.where(issn_missing, j_final['eISSN'], j_final['ISSN'])

In [28]:
# Show the identifiers used in the searches (header, blank line, values)
print('List of IS to use in searches:\n', j_final['IS'], sep='\n')

List of IS to use in searches:

0     1754-5692
1     1758-678X
2     2542-5196
3     2590-3330
4     1610-3653
5     0034-4257
6     0043-1354
7     2524-7972
8     1354-1013
9     0013-936X
10    0091-6765
11    0301-4797
12    0013-9351
13    2328-4277
14    2662-4435
15    1674-9278
16    1726-2135
17    1001-0742
Name: IS, dtype: object


In [29]:
# Number of articles returned for each journal by the Web of Science search
#   (IS=IS) AND (TS=clim*) AND ((PY=2021) OR (PY=2022))
# after subsetting to article types. The counts were entered by hand and
# are listed in the same order as the rows of j_final (one count per row).
# We want to keep journals that have published at least 30 articles
# according to this search criteria.
# NOTE(review): this cell only records the counts; it does not apply the
# 30-article threshold itself -- confirm where that filter is applied.
article_list = [27, 244, 64, 71, 10, 321, 125, 5, 770,
                298, 15, 808, 310, 332, 270, 119, 14, 34]

# The counts are matched to journals purely by position, so a change in the
# journal list upstream would otherwise silently misassign counts (or leave
# NaN / drop values). Fail loudly instead.
if len(article_list) != len(j_final):
    raise ValueError(
        'article_list has ' + str(len(article_list)) + ' counts but j_final has '
        + str(len(j_final)) + ' journals; update the counts to match the journal list'
    )

# Use j_final's own index so the assignment is positional even if the
# index is not the default 0..n-1 range
num_articles = pd.Series(article_list, index=j_final.index, name='n_articles')
j_final = j_final.assign(n_articles=num_articles)

In [31]:
# Remove the review/perspective/comment flag column, then save the journal
# table to the processed data directory
journals_out = j_final.drop(columns=drop_col)
journals_out.to_csv(join(r_fp, 'journals_to_search.csv'))

# Process articles from journal searches

In [69]:
# Build one dataframe of all articles we need to check for open data and
# code by reading every export file in the raw/articles/ directory
art_dir = join(r_fr, 'articles')

# os.listdir returns names in arbitrary order, so sort them to make the row
# order of the combined dataframe (and the files written from it)
# reproducible across runs and machines. Dot files are skipped.
article_files = sorted(
    name for name in os.listdir(art_dir) if not name.startswith('.')
)
if not article_files:
    # pd.concat would raise a less informative ValueError on an empty list
    raise ValueError('No article export files found in ' + str(art_dir))

# Read each tab-delimited file, collect the dataframes in a list and
# concatenate once at the end
df_list = []
for file in article_files:
    filepath = join(art_dir, file)
    temp = pd.read_csv(filepath, sep='\t')
    df_list.append(temp)
    print(file)
# ignore_index gives every row a unique label; without it each file's
# 0..n-1 index is repeated in the combined dataframe
articles = pd.concat(df_list, axis=0, ignore_index=True)

savedrecs_0013-936X.txt
savedrecs_0034-4257.txt
savedrecs_2662-4435.txt
savedrecs_1001-0742.txt
savedrecs_1354-1013.txt
savedrecs_1758-678X.txt
savedrecs_0013-9351.txt
savedrecs_0043-1354.txt
savedrecs_1674-9278.txt
savedrecs_2328-4277.txt
savedrecs_0301-4797.txt
savedrecs_2590-3330.txt
savedrecs_2542-5196.txt


In [79]:
# Keep only the export fields needed for the review and give them readable
# names. The Web of Science field tags are described under
# "All Export Field Tags" at
# http://webofscience.help.clarivate.com/en-us/Content/export-records.htm
# Each key is a field tag and each value is the column name used from here on.
wos_fields = {
    'AU': 'authors',      # Authors or Inventors
    'TI': 'title',        # Article Title
    'SO': 'journal',      # Source Title (Journal title)
    'DT': 'doc_type',     # Document Type
    'TC': 'total_cited',  # Times Cited Count
    'DI': 'doi',          # DOI
    'DL': 'doi_link',     # DOI Link
    'PY': 'pub_year',     # Publication year
}
col_keep = list(wos_fields.keys())
col_names = list(wos_fields.values())
articles_f = articles[col_keep].rename(columns=wos_fields)

# Keep articles cited at least 20 times (the comparison is inclusive)
cited_enough = articles_f['total_cited'].ge(20)
articles_f = articles_f[cited_enough]

# Save the filtered article table to the interim data directory
articles_f.to_csv(join(r_fi, 'articles.csv'))