In [1]:
from pathlib import Path

import pandas as pd
import requests
from RISparser import read, TAG_KEY_MAPPING, LIST_TYPE_TAGS

## Read files from Zenodo

In [2]:
# RIS exports hosted in Zenodo record 3625931, listed from the broadest set
# to the narrowest one.
# NOTE(review): "TiAb" / "FT" presumably stand for title-abstract and
# full-text screening -- confirm against the dataset description.
url_all = "https://zenodo.org/record/3625931/files/DOKU_All%20TiAb-Screening_20200116_cap.txt"
url_abstract_screening = "https://zenodo.org/record/3625931/files/DOKU_All%20FT-Screening_20200116_cap.txt"
url_included = "https://zenodo.org/record/3625931/files/DOKU_All%20Included_20200116_cap.txt"

In [3]:
# Record keys whose values are joined into a single string below
# (presumably the fields RISparser returns as lists, e.g. authors).
list_keys = [TAG_KEY_MAPPING[k] for k in LIST_TYPE_TAGS]

def read_ris_to_df(url, timeout=60):
    """Download a RIS file and return its records as a pandas DataFrame.

    Parameters
    ----------
    url : str
        Location of the RIS file.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Defaults to 60.

    Returns
    -------
    pandas.DataFrame
        One row per RIS record. Values of the keys listed in ``list_keys``
        are joined into a single ``';'``-separated string.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """

    # download data; fail loudly on an HTTP error instead of handing an
    # error page to the RIS parser (which would silently yield no records)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark
    r.encoding = 'utf-8-sig'
    # NOTE(review): assumes CRLF line endings; a file with bare LF endings
    # would come through as a single line -- confirm for new sources.
    lines = r.text.split('\r\n')

    # merge the fields with multiple values
    items = []
    for item in read(lines):
        # Only existing keys are reassigned, so the dict size is unchanged
        # and updating it while iterating is safe.
        for k, v in item.items():
            if k in list_keys and v is not None:
                item[k] = ';'.join(v)
        items.append(item)

    return pd.DataFrame(items)

In [4]:
# Download and parse the three exports (same order as before: all records,
# abstract-screening set, included set), one DataFrame each.
df_all, df_abstract_screening, df_included = map(
    read_ris_to_df,
    (url_all, url_abstract_screening, url_included),
)

## Merge datasets

In [5]:
# Add the two label columns to every frame (in place).
# Each row of the table is: (frame, label_included, label_abstract_screening).
for frame, included_flag, screening_flag in (
    (df_included, 1, 1),
    (df_abstract_screening, 0, 1),
    (df_all, 0, 0),
):
    frame['label_included'] = included_flag
    frame['label_abstract_screening'] = screening_flag

In [6]:
# Stack the three frames and keep one row per (title, authors) pair.
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`.
# Order matters: `keep='first'` retains the first occurrence, so a record
# present in several exports keeps the labels of the earliest frame listed
# here (included -> abstract screening -> all).
df_merged = pd.concat(
    [df_included, df_abstract_screening, df_all],
    sort=False,
).drop_duplicates(subset=['title', 'authors'], keep='first')

In [7]:
# reorder columns (nothing special)
def order_columns(df):
    """Return ``df`` with the two label columns moved to the end.

    All other columns keep their original relative order, so the resulting
    layout is the same on every run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``label_included`` and ``label_abstract_screening``.

    Returns
    -------
    pandas.DataFrame
        New frame with the reordered columns; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the label columns is missing from ``df``.
    """
    label_cols = ['label_included', 'label_abstract_screening']
    # Walk the columns in order rather than using set arithmetic: set
    # iteration order is arbitrary and made the CSV column order vary
    # between runs.
    other_cols = [col for col in df.columns if col not in label_cols]
    return df[other_cols + label_cols]

# Move the label columns to the end of the merged frame.
df_merged = df_merged.pipe(order_columns)

## Export datasets

In [8]:
# Make sure the output folder exists, then write the merged dataset
# without the DataFrame index.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True, parents=True)
df_merged.to_csv("output/output_csv_wilson.csv", index=False)

## Dataset statistics

In [9]:
# Column sums of the two label columns in the merged dataset.
label_columns = ['label_abstract_screening', 'label_included']
df_merged[label_columns].sum()

label_abstract_screening    174
label_included               26
dtype: int64