In [12]:
import os
import pandas as pd
from risparser import convert_ris_to_csv
# Run from the dataset directory: the cells below address the "raw" and "csv"
# folders with relative paths.
os.chdir("/automated-systematic-review-datasets/datasets/Van_de_Schoot_PTSD")

## Convert RIS files into CSV

In [2]:
# Collect every RIS export found in raw/. os.listdir() returns entries in
# arbitrary order, so the names are sorted to make the conversion order (and
# the log it prints) reproducible between runs and machines.
raw_files = sorted(name for name in os.listdir("raw") if name.endswith(".ris"))

# Converted files go to csv/, one CSV per RIS file with the same base name.
os.makedirs("csv", exist_ok=True)
for ris_file in raw_files:
    ris_fp = os.path.join("raw", ris_file)
    csv_fp = os.path.join("csv", os.path.splitext(ris_file)[0] + ".csv")
    convert_ris_to_csv(ris_fp, csv_fp)

Input file: raw/schoot-lgmm-ptsd-initial.ris
Number of articles: 6187
Export file: csv/schoot-lgmm-ptsd-initial.csv
Input file: raw/schoot-lgmm-ptsd-included-1.ris
Number of articles: 363
Export file: csv/schoot-lgmm-ptsd-included-1.csv
Input file: raw/schoot-lgmm-ptsd-included-3.ris
Number of articles: 8
Export file: csv/schoot-lgmm-ptsd-included-3.csv
Input file: raw/schoot-lgmm-ptsd-included-2.ris
Number of articles: 38
Export file: csv/schoot-lgmm-ptsd-included-2.csv


## Clean the data 

(The code below is adapted from the R file.)

In [3]:
# Switch into the folder holding the converted CSV files; every read below
# uses a bare file name. NOTE: because this path is relative, running the cell
# a second time without restarting would look for a nested csv/ folder.
os.chdir("csv")

# Original papers (initial search result).
all_data = pd.read_csv("schoot-lgmm-ptsd-initial.csv")
# Papers remaining after abstract screening.
aas_data = pd.read_csv("schoot-lgmm-ptsd-included-1.csv")
# Included papers.
inc_data = pd.read_csv("schoot-lgmm-ptsd-included-2.csv")
# Papers directly included after reading the abstract.
dir_data = pd.read_csv("schoot-lgmm-ptsd-included-3.csv")

# "Clean" titles: drop every character that is not an ASCII letter or digit,
# so that titles can be compared regardless of punctuation and spacing.
all_title, inc_title, aas_title, dir_title = (
    frame["title"].str.replace("[^A-Za-z0-9]", "", regex=True)
    for frame in (all_data, inc_data, aas_data, dir_data)
)

In [4]:
# Lower-cased cleaned titles of the initial dataset, used as the lookup set.
known_titles = all_title.str.lower()

# True for every screened paper whose title does not occur in the initial data.
inc_missing = ~inc_title.str.lower().isin(known_titles)
aas_missing = ~aas_title.str.lower().isin(known_titles)

for caption, mask, trailer in (
    ("Papers included, missing from initial data:              ", inc_missing, "\n"),
    ("Papers in abstract screening, missing from initial data: ", aas_missing, "\n\n"),
):
    print(caption, mask.sum(), trailer)

Papers included, missing from initial data:               0 

Papers in abstract screening, missing from initial data:  2 




In [5]:
# Add missing papers to original dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# the rows are stacked with pd.concat instead. sort=True keeps the column
# ordering behaviour of the former append(..., sort=True) calls, and
# ignore_index=True replaces the reset_index(drop=True) that followed them.
all_data = pd.concat(
    [all_data, aas_data[aas_missing], inc_data[inc_missing]],
    sort=True,
    ignore_index=True,
)

# Update "cleaned" titles with new additions.
all_title = all_data["title"].str.replace("[^A-Za-z0-9]", "", regex=True)

In [6]:
# Training labels: match every cleaned, lower-cased title of the full dataset
# against the three screening lists.
titles_lower = all_title.str.lower()
label_inc = titles_lower.isin(inc_title.str.lower())
label_aas = titles_lower.isin(aas_title.str.lower())
label_dir = titles_lower.isin(dir_title.str.lower())

CODE_EXCLUDE = 0      # Exclude paper.
CODE_AFT_EXCLUDE = 1  # Exclude after reading full text.
CODE_AFT_INCLUDE = 2  # Include after reading full text.
CODE_AAS_INCLUDE = 3  # Include after reading abstract.

# Each boolean mask contributes its code; the three weighted masks are summed
# into a single integer column.
inclusion_code = (
    (label_aas & ~label_inc) * CODE_AFT_EXCLUDE
    + (label_inc & ~label_dir) * CODE_AFT_INCLUDE
    + label_dir * CODE_AAS_INCLUDE
)

# Strip the list punctuation ("[", "'" and "]") from the authors column.
authors_clean = (
    all_data['authors']
    .str.replace("[\\[']", "", regex=True)
    .str.replace("\\]", "", regex=True)
)
train_data = all_data.assign(
    included=label_inc.astype(int),
    inclusion_code=inclusion_code,
    authors=authors_clean,
)

# Report how many records lack a title and/or an abstract.
title_na = all_title.isna()
abstract_na = all_data['abstract'].isna()

print("Number of papers with missing title:              ", title_na.sum(), "\n")
print("Number of papers with missing abstract:           ", abstract_na.sum(), "\n")
print("Number of papers with missing title AND abstract: ", (title_na & abstract_na).sum(), "\n")
print("Number of papers with missing title OR abstract:  ", (title_na | abstract_na).sum(), "\n\n")

Number of papers with missing title:               64 

Number of papers with missing abstract:            764 

Number of papers with missing title AND abstract:  62 

Number of papers with missing title OR abstract:   766 




In [7]:
# Remove duplicates based on just the (cleaned, lower-cased) titles. Records
# without a title cannot be compared, so all of them are kept.
titles_lower = all_title.str.lower()
unique_train_data = train_data[(~titles_lower.duplicated()) | titles_lower.isnull()]

n_train = len(unique_train_data['included'])
n_inc = unique_train_data['included'].sum()
n_exc = n_train - n_inc

# Share of inclusions relative to the whole training set. The denominator was
# previously the number of exclusions, which overstates the share and divides
# by zero when nothing is excluded.
pct_inc = round(100 * n_inc / n_train, 2) if n_train else 0.0

# Break-down per inclusion code, using the named constants throughout.
n_aas_exc = (unique_train_data['inclusion_code'] == CODE_EXCLUDE).sum()
n_aft_exc = (unique_train_data['inclusion_code'] == CODE_AFT_EXCLUDE).sum()
n_aft_inc = (unique_train_data['inclusion_code'] == CODE_AFT_INCLUDE).sum()
n_aas_inc = (unique_train_data['inclusion_code'] == CODE_AAS_INCLUDE).sum()

print("Total remaining papers in training set:     ", n_train, "\n")
print("Total number of INCLUSIONS:                 ", n_inc, " (", pct_inc, "% )\n")
print("Total number of EXCLUSIONS:                 ", n_exc, "\n\n")
print("Total EXCLUSIONS after abstract screening:  ", n_aas_exc, "\n")
print("Total EXCLUSIONS after full text screening: ", n_aft_exc, "\n")
print("Total INCLUSIONS after full text screening: ", n_aft_inc, "\n")
print("Total INCLUSIONS after abstract screening:  ", n_aas_inc, "\n\n")

Total remaining papers in training set:      5782 

Total number of INCLUSIONS:                  38  ( 0.66 % )

Total number of EXCLUSIONS:                  5744 


Total EXCLUSIONS after abstract screening:   5426 

Total EXCLUSIONS after full text screening:  318 

Total INCLUSIONS after full text screening:  30 

Total INCLUSIONS after abstract screening:   8 




In [9]:
# Write the de-duplicated training set straight into the dataset's output
# folder. The file used to be written to the current working directory and
# then moved with os.rename, which only worked when the working directory was
# the csv folder and the output folder already existed.
output_dir = "/automated-systematic-review-datasets/datasets/Van_de_Schoot_PTSD/output"
os.makedirs(output_dir, exist_ok=True)
unique_train_data.to_csv(os.path.join(output_dir, "PTSD_VandeSchoot_18.csv"), index=False)

## Statistics