In [1]:
import pandas as pd

In [2]:
# Read the uid-annotated CSV in 10k-row chunks, then stitch the chunks into
# one frame. The chunk reader gets its own name so `df` only ever refers to
# the assembled DataFrame.
data_path = "data/RoB-data-w-uids.csv"
uid_chunks = pd.read_csv(data_path, chunksize=10000)
df = pd.concat(uid_chunks, ignore_index=True)

In [3]:
def get_duplicate_ids(data=None):
    '''
    Assemble data for testing.

    A uid is a "duplicate" when the same uid occurs under more than one
    distinct review id (`cdno`), i.e. the study shows up in *different*
    reviews.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with `uid` and `cdno` columns. Defaults to the notebook-level
        `df` when omitted, so existing zero-argument calls keep working.

    Returns
    -------
    list
        Each duplicated uid exactly once, in ascending uid order.

    Notes
    -----
    Rows with a missing `uid` are never reported. A missing `cdno` is not
    counted as a review of its own.
    '''
    if data is None:
        data = df

    # Count distinct reviews per uid in a single pass, instead of
    # re-filtering the whole frame once per row.
    reviews_per_uid = data.groupby('uid')['cdno'].nunique()
    return reviews_per_uid[reviews_per_uid > 1].index.tolist()


In [4]:
# uids that occur under more than one review (cdno); used below to select
# the held-out test rows.
test_ids = get_duplicate_ids()

In [5]:
len(test_ids)

3112

In [6]:
# Same chunked-read pattern as for the raw data: read 10k rows at a time and
# concatenate, keeping the chunk reader under its own name.
formatted_chunks = pd.read_csv("data/RoB-data-formatted.csv", chunksize=10000)
formatted_df = pd.concat(formatted_chunks, ignore_index=True)

In [7]:
formatted_df.head()

Unnamed: 0.1,Unnamed: 0,ac-doc-judgment,ac-rationale,boa-doc-judgment-all,boa-doc-judgment-mortality,boa-doc-judgment-objective,boa-doc-judgment-subjective,boa-rationale-all,boa-rationale-mortality,boa-rationale-objective,...,bpp-rationale-all,bpp-rationale-mortality,bpp-rationale-objective,bpp-rationale-subjective,doc_id,doi,pmid,rsg-doc-judgment,rsg-rationale,sentence
0,0,low,0.0,unk,unk,low,low,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17665470,10.1002/art.22892,17665470.0,low,0.0,Arthritis & Rheumatism (Arthritis Care & Resea...
1,1,low,0.0,unk,unk,low,low,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17665470,10.1002/art.22892,17665470.0,low,0.0,57
2,2,low,0.0,unk,unk,low,low,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17665470,10.1002/art.22892,17665470.0,low,0.0,"No. 6, August 15, 2007, pp 1027–1037 DOI 10.10..."
3,3,low,0.0,unk,unk,low,low,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17665470,10.1002/art.22892,17665470.0,low,0.0,"© 2007, American College of Rheumatology\nORIG..."
4,4,low,0.0,unk,unk,low,low,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17665470,10.1002/art.22892,17665470.0,low,0.0,A Randomized Trial\nRACHELLE


In [8]:
formatted_df[formatted_df["doc_id"].isnull()]

Unnamed: 0.1,Unnamed: 0,ac-doc-judgment,ac-rationale,boa-doc-judgment-all,boa-doc-judgment-mortality,boa-doc-judgment-objective,boa-doc-judgment-subjective,boa-rationale-all,boa-rationale-mortality,boa-rationale-objective,...,bpp-rationale-all,bpp-rationale-mortality,bpp-rationale-objective,bpp-rationale-subjective,doc_id,doi,pmid,rsg-doc-judgment,rsg-rationale,sentence
409051,/),,,,,,,,,,...,,,,,,,,,,
3182181,/),,,,,,,,,,...,,,,,,,,,,


In [12]:
# Drop the rows that have no doc_id, then count the distinct documents left.
has_doc_id = formatted_df["doc_id"].notnull()
formatted_df_no_nulls = formatted_df[has_doc_id]
formatted_df_no_nulls['doc_id'].unique().shape

(24089,)

In [13]:
formatted_df['doc_id'].unique().shape

(24090,)

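Finally, pull out the train and test instances: studies whose uid appears in more than one review become the test set, and everything else is training data.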

In [15]:
# Test rows: every row whose doc_id is one of the duplicated uids.
in_test_ids = formatted_df_no_nulls['doc_id'].isin(test_ids)
test_df = formatted_df_no_nulls[in_test_ids]
test_df.shape

(945568, 25)

In [67]:
test_df['doc_id'].unique().shape

(3112,)

In [66]:
test_df.to_csv("data/splits/test-df.csv", index=False)

In [18]:
# Training rows: every row whose doc_id is NOT one of the duplicated uids.
not_in_test_ids = ~formatted_df_no_nulls['doc_id'].isin(test_ids)
train_df = formatted_df_no_nulls[not_in_test_ids]
train_df.shape

(3081845, 25)

In [19]:
train_df['doc_id'].unique().shape

(20977,)

Now we deal with the edge cases in which a study has multiple judgments for a given domain (despite being within the same Cochrane review, by construction).

In [38]:
# Document-level judgment columns that are checked for per-study consistency.
doc_judgments = [
    # single-column domains
    'rsg-doc-judgment',
    'ac-doc-judgment',
    # bpp, split by outcome type
    'bpp-doc-judgment-all',
    'bpp-doc-judgment-mortality',
    'bpp-doc-judgment-objective',
    'bpp-doc-judgment-subjective',
    # boa, split by outcome type
    'boa-doc-judgment-all',
    'boa-doc-judgment-objective',
    'boa-doc-judgment-subjective',
    'boa-doc-judgment-mortality',
]
             
def find_edge_case_UIDs(frame=None, judgment_cols=None):
    '''
    Find documents whose rows disagree on a document-level judgment.

    A doc_id is an edge case when, for at least one judgment column, its
    rows do not all carry the same single value.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a `doc_id` column plus the judgment columns. Defaults to
        the notebook-level `train_df`.
    judgment_cols : sequence of str, optional
        Columns to check. Defaults to the notebook-level `doc_judgments`.

    Returns
    -------
    list
        Each inconsistent doc_id exactly once.
    '''
    if frame is None:
        frame = train_df
    if judgment_cols is None:
        judgment_cols = doc_judgments

    judgment_cols = list(judgment_cols)
    if not judgment_cols:
        # Nothing to compare, so nothing can be inconsistent.
        return []

    # dropna=False mirrors Series.unique(), which reports a missing value as
    # a distinct entry; a doc mixing a label with NaN is therefore flagged.
    values_per_doc = frame.groupby("doc_id")[judgment_cols].nunique(dropna=False)
    is_inconsistent = (values_per_doc != 1).any(axis=1)
    return is_inconsistent[is_inconsistent].index.tolist()

In [39]:
edge_cases = find_edge_case_UIDs()

In [40]:
len(edge_cases)

62

In [54]:
train_df["doc_id"].unique().shape

(20977,)

In [53]:
# Show the distinct values of every judgment column for one flagged document.
# The document's rows are selected once and reused for each column.
edge_case_rows = train_df[train_df["doc_id"] == edge_cases[1]]
[(otype, edge_case_rows[otype].unique()) for otype in doc_judgments]

[('rsg-doc-judgment', array(['unclear', 'low'], dtype=object)),
 ('ac-doc-judgment', array(['unclear', 'low'], dtype=object)),
 ('bpp-doc-judgment-all', array(['unclear', 'low'], dtype=object)),
 ('bpp-doc-judgment-mortality', array(['unk'], dtype=object)),
 ('bpp-doc-judgment-objective', array(['unk'], dtype=object)),
 ('bpp-doc-judgment-subjective', array(['unk'], dtype=object)),
 ('boa-doc-judgment-all', array(['unclear', 'low'], dtype=object)),
 ('boa-doc-judgment-objective', array(['unk'], dtype=object)),
 ('boa-doc-judgment-subjective', array(['unk'], dtype=object)),
 ('boa-doc-judgment-mortality', array(['unk'], dtype=object))]

Per consultation with Iain, we'll just drop these inconsistent studies.

In [55]:
# Keep only the rows whose doc_id was not flagged as an edge case.
train_df_clean = train_df[~train_df['doc_id'].isin(edge_cases)]

In [56]:
train_df_clean["doc_id"].unique().shape

(20915,)

In [57]:
train_df["doc_id"].unique().shape

(20977,)

In [58]:
train_df = train_df_clean # just overwrite the noisier version

Randomly sample a development set.

In [60]:
train_uids = train_df["doc_id"].unique()

In [61]:
import numpy as np

# Sample the dev documents with a fixed seed so that the dev/train split
# written to disk is the same on every Restart & Run All.
DEV_SET_SIZE = 5000
DEV_SET_SEED = 1337
dev_rng = np.random.RandomState(DEV_SET_SEED)
dev_set_ids = dev_rng.choice(train_uids, DEV_SET_SIZE, replace=False)

In [62]:
# Split on one membership mask so the two frames are exact complements.
in_dev_set = train_df['doc_id'].isin(dev_set_ids)
dev_df = train_df[in_dev_set]
train_sans_dev_df = train_df[~in_dev_set]

In [64]:
dev_df["doc_id"].unique().shape

(5000,)

In [65]:
train_sans_dev_df["doc_id"].unique().shape

(15915,)

In [68]:
train_sans_dev_df.to_csv("data/splits/train-df.csv", index=False)

In [69]:
dev_df.to_csv("data/splits/dev-df.csv", index=False)

In [71]:
pwd

'/home/byron/RoB-2.0'