In [1]:
import os
import pandas as pd

In [2]:
# read in files in literature folder
literature_files = os.listdir('literature')
# keep only csv files and those that start with 'lit_'
literature_files = [file for file in literature_files if file.endswith('.csv') and file.startswith('lit_')]

# ask for user input to select csv files to read
print('Select csv files to read')
for i, file in enumerate(literature_files):
    print(f'{i}: {file}')
selected_files = input('Enter the number of the files separated by commas: ')
selected_files = selected_files.split(',')
selected_files = [int(i) for i in selected_files]

# read in selected files, have a new column that indicates the source of the data
data = []
for i in selected_files:
    file = literature_files[i]
    df = pd.read_csv(f'literature/{file}')
    # the source is the second part of the file name
    source = file.split('_')[1].split('.')[0]
    df['source'] = source
    data.append(df)

data = pd.concat(data)
print('Data read successfully')

Select csv files to read
0: lit_WOK_2021-07-01_2024-05-31.csv
1: lit_pubmed_2021-07-01_2024-05-31.csv
2: lit_WOS_2021-07-01_2024-05-31.csv
3: lit_acm_2021-07-01_2024-05-31.csv
Data read successfully


In [3]:
# print number of rows for each source
print('Number of rows for each source')
print(data['source'].value_counts())

Number of rows for each source
source
acm       3651
WOK       1262
pubmed     336
Name: count, dtype: int64


In [4]:
print(data.shape)   
data.head()

(5249, 13)


Unnamed: 0,Title,Authors,Publication Type,SourceTitle,Publication Year,Keywords,UT,DOI,ISSN,source,Abstract,PMID,URL
0,Unravelling the molecular dimensions of atmosp...,"Nazeer, Nazim; Bhargava, Arpit; Soni, Nikita; ...",Article,PHYSICS AND CHEMISTRY OF THE EARTH,2024.0,"Air pollution, Artificial intelligence, Enviro...",WOS,10.1016/j.pce.2024.103604,1474-7065,WOK,,,
1,Simultaneous thermal zoning and demand control...,"Rodriguez, Jose; Fumo, Nelson",Article,ENERGY REPORTS,2024.0,"Thermal zoning, Demand control ventilation, Es...",WOS,10.1016/j.egyr.2024.04.025,2352-4847,WOK,,,
2,Exploring the influence of indoor environment ...,"Ma, Chuan; Guerra-Santin, Olivia; Mohammadi, Masi",Article,BUILDING AND ENVIRONMENT,2024.0,"Nursing home, Indoor environment, Spatial layo...",WOS,10.1016/j.buildenv.2024.111452,0360-1323,WOK,,,
3,Radon Exposure Assessment in Occupational and ...,"Kholopo, Mota; Rathebe, Phoka Caiphus",Article,SENSORS,2024.0,"radon, environmental exposure, occupational se...",WOS,10.3390/s24102966,,WOK,,,
4,Semi-supervised ensemble learning for human ac...,"Patricia, Ariza-Colpas Paola; Rosberg, Pacheco...",Article,HELIYON,2024.0,"Human activity recognition, Activities of dail...",WOS,10.1016/j.heliyon.2024.e29398,,WOK,,,


In [5]:
# in Publication Type, get unique values
data['Publication Type'].unique()

array(['Article', 'Proceedings Paper', 'Review', 'Journal Article', nan,
       'Editorial Material', 'Meeting Abstract', 'Letter', 'Case Reports',
       'Patent', 'research-article', 'Journal Article, Review',
       "Journal Article, Research Support, Non-U.S. Gov't",
       'Journal Article, Multicenter Study, Observational Study',
       'Journal Article, Review, Systematic Review',
       'Journal Article, Randomized Controlled Trial',
       'Clinical Trial Protocol, Journal Article',
       'Case Reports, Journal Article',
       "Journal Article, Research Support, N.I.H., Extramural, Research Support, Non-U.S. Gov't, Research Support, U.S. Gov't, Non-P.H.S., Research Support, U.S. Gov't, P.H.S.",
       "Journal Article, Randomized Controlled Trial, Research Support, Non-U.S. Gov't",
       "Journal Article, Research Support, N.I.H., Extramural, Research Support, U.S. Gov't, Non-P.H.S.",
       "Journal Article, Observational Study, Research Support, N.I.H., Extramural, Resear

Rename publication types to consolidate.

In [6]:
# 'article', 'Article' and 'Journal Article' are the same
data['Publication Type'] = data['Publication Type'].replace('Journal Article', 'Article')
data['Publication Type'] = data['Publication Type'].replace('article', 'Article')
data['Publication Type'] = data['Publication Type'].replace('research-article', 'Article')

# proceedings, inproceedings, Proceedings Paper are the same
data['Publication Type'] = data['Publication Type'].replace('Proceedings Paper', 'Proceedings')
data['Publication Type'] = data['Publication Type'].replace('inproceedings', 'Proceedings')
data['Publication Type'] = data['Publication Type'].replace('proceedings', 'Proceedings')

# book and inbook are the same
data['Publication Type'] = data['Publication Type'].replace('inbook', 'Book')
data['Publication Type'] = data['Publication Type'].replace('book', 'Book')

data['Publication Type Other'] = data['Publication Type'].apply(lambda x: ','.join(x.split(',')[1:]) if isinstance(x, str) else '')
data['Publication Type'] = data['Publication Type'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else '')

The entries where the Publication Type is blank are found in Web of Science - DIIDW database, which is the Derwent Innovations Index -- patents. Also remove types that are obviously not a study/paper -- Clinical Trial Protocol, Research Support (a review paper), Patent.

In [7]:
# remove rows that have the UT = DIIDW
data = data[data['UT'] != 'DIIDW']
# remove rows with Clinical Trial Protocol, Research Support, Patent in Publication Type
data = data[~data['Publication Type'].isin(['Clinical Trial Protocol', 'Research Support', 'Patent'])]
print(data['Publication Type'].unique())
print(data.shape)

['Article' 'Proceedings' 'Review' 'Editorial Material' 'Meeting Abstract'
 'Letter' 'Case Reports' 'Journal Article' 'Systematic Review' 'Book']
(5028, 14)


EC1: Works that are survey/review papers.

In [8]:
# remove rows that are Review, Systematic Review
data = data[~data['Publication Type'].isin(['Review', 'Systematic Review'])]
print(data.shape)

(4938, 14)


In [9]:
# search the string in Title whther it contains 'review' but not 'reviews'
data['is_review'] = data['Title'].apply(lambda x: 'review' in x.lower() and 'reviews' not in x.lower())
print(data['is_review'].value_counts())

is_review
False    4879
True       59
Name: count, dtype: int64


Checked through those marked True and they were all literature papers so they can be removed.

In [10]:
# drop rows that are reviews
data = data[~data['is_review']]
# drop the is_review column
data = data.drop(columns=['is_review'])
print(data.shape)

(4879, 14)


EC5: Works relating specifically to COVID since these works might not be representative of the normal (non-COVID) situations.

In [11]:
# search through title for the word covid or coronavirus
data['is_covid'] = data['Title'].apply(lambda x: 'covid' in x.lower() or 'coronavirus' in x.lower())
print(data['is_covid'].value_counts())
# drop rows that are covid related
data = data[~data['is_covid']]
# drop the is_covid column
data = data.drop(columns=['is_covid'])
print(data.shape)

is_covid
False    4718
True      161
Name: count, dtype: int64
(4718, 14)


In [13]:
# are there duplicate titles?
print(data['Title'].duplicated().sum())
# mark duplicates and the original
data['is_duplicate'] = data['Title'].duplicated(keep=False)
print(data['is_duplicate'].value_counts())

17
is_duplicate
False    4684
True       34
Name: count, dtype: int64


In [14]:
# if the duplicate is from the same source, keep the first one
data = data[~(data['is_duplicate'] & data.duplicated(subset=['Title', 'source']))]
# if the duplicate is from different sources, keep the one that is not from WOK
data = data[~(data['is_duplicate'] & (data['source'] == 'WOK'))]
data['is_duplicate'] = data['Title'].duplicated(keep=False)
print(data['is_duplicate'].value_counts())
# if there are no more duplicates, drop the is_duplicate column
if not data['is_duplicate'].any():
    data = data.drop(columns=['is_duplicate'])
print(data.shape)

is_duplicate
False    4701
Name: count, dtype: int64
(4701, 14)


In [16]:
# print the number of rows for each source
print(data['source'].value_counts())

source
acm       3567
WOK        827
pubmed     307
Name: count, dtype: int64


In [15]:
data.to_csv('literature/wb2_set.csv', index=False)