In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Read and clean datasets

In [2]:
def clean_Cohen_datasets(path):
    """Load one raw dataset from a CSV file and return it cleaned.

    The cleaning consists of three steps:

    1. rename ``abstracts`` to ``abstract``, ``label1`` to
       ``label_abstract_screening`` and ``label2`` to ``label_included``;
    2. recode both label columns to integers, where ``"I"`` becomes 1 and
       every other value (missing values included) becomes 0;
    3. add a ``record_id`` column in first position, equal to the row
       index plus one.

    Parameters
    ----------
    path
        Location of the raw CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned records.
    """
    renamed_columns = {
        "abstracts": "abstract",
        "label1": "label_abstract_screening",
        "label2": "label_included",
    }
    records = pd.read_csv(path).rename(columns=renamed_columns)

    # recode inclusion indicators: "I" -> 1, anything else -> 0
    for label_column in ("label_abstract_screening", "label_included"):
        # attribute-style lookup raises AttributeError when the column is absent
        marked_included = getattr(records, label_column) == "I"
        records[label_column] = np.where(marked_included, 1, 0)

    # record ids are the row index shifted by one, stored as the first column
    records.insert(0, "record_id", records.index + 1)

    return records

In [3]:
# folder holding the raw CSV files (one file per dataset); change it here only
RAW_DIR = Path("raw")

# every raw file is read and cleaned by clean_Cohen_datasets; the individual
# variable names are kept because the export cell refers to them
df_ACEInhibitors = clean_Cohen_datasets(RAW_DIR / "ACEInhibitors.csv")
df_ADHD = clean_Cohen_datasets(RAW_DIR / "ADHD.csv")
df_Antihistamines = clean_Cohen_datasets(RAW_DIR / "Antihistamines.csv")
df_AtypicalAntipsychotics = clean_Cohen_datasets(RAW_DIR / "AtypicalAntipsychotics.csv")
df_BetaBlockers = clean_Cohen_datasets(RAW_DIR / "BetaBlockers.csv")
df_CalciumChannelBlockers = clean_Cohen_datasets(RAW_DIR / "CalciumChannelBlockers.csv")
df_Estrogens = clean_Cohen_datasets(RAW_DIR / "Estrogens.csv")
df_NSAIDS = clean_Cohen_datasets(RAW_DIR / "NSAIDS.csv")
# NOTE(review): the "Opiods" spelling presumably mirrors the raw file name — confirm before renaming
df_Opiods = clean_Cohen_datasets(RAW_DIR / "Opiods.csv")
df_OralHypoglycemics = clean_Cohen_datasets(RAW_DIR / "OralHypoglycemics.csv")
df_ProtonPumpInhibitors = clean_Cohen_datasets(RAW_DIR / "ProtonPumpInhibitors.csv")
df_SkeletalMuscleRelaxants = clean_Cohen_datasets(RAW_DIR / "SkeletalMuscleRelaxants.csv")
df_Statins = clean_Cohen_datasets(RAW_DIR / "Statins.csv")
df_Triptans = clean_Cohen_datasets(RAW_DIR / "Triptans.csv")
df_UrinaryIncontinence = clean_Cohen_datasets(RAW_DIR / "UrinaryIncontinence.csv")

## Export datasets

In [4]:
# single place to change where the cleaned files are written
OUTPUT_DIR = Path("output/local")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# file stem -> cleaned frame; each frame is written to OUTPUT_DIR/<stem>.csv
cleaned_datasets = {
    "ACEInhibitors": df_ACEInhibitors,
    "ADHD": df_ADHD,
    "Antihistamines": df_Antihistamines,
    "AtypicalAntipsychotics": df_AtypicalAntipsychotics,
    "BetaBlockers": df_BetaBlockers,
    "CalciumChannelBlockers": df_CalciumChannelBlockers,
    "Estrogens": df_Estrogens,
    "NSAIDS": df_NSAIDS,
    "Opiods": df_Opiods,
    "OralHypoglycemics": df_OralHypoglycemics,
    "ProtonPumpInhibitors": df_ProtonPumpInhibitors,
    "SkeletalMuscleRelaxants": df_SkeletalMuscleRelaxants,
    "Statins": df_Statins,
    "Triptans": df_Triptans,
    "UrinaryIncontinence": df_UrinaryIncontinence,
}

for file_stem, cleaned in cleaned_datasets.items():
    # index=False leaves the row index out of the exported file
    cleaned.to_csv(OUTPUT_DIR / f"{file_stem}.csv", index=False)

## Dataset statistics

See `process_Cohen_datasets_online.ipynb`.