# Extract Composite Labels

- Pull out labels and save them for loading later
- Filter away classes for which we have fewer than 80 training examples (the case with the linear models)

In [11]:
%store -r abstracts_targets

dataset = abstracts_targets

columns = ['allocation',
           'endpoint_classification',
           'intervention_model',
           'masking',
           'primary_purpose',
           'gender',
           'healthy_volunteers',
           'phase']

dataset = dataset[columns]

dataset = dataset.reset_index(drop=True)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2093 entries, 0 to 2092
Data columns (total 8 columns):
allocation                 1746 non-null object
endpoint_classification    1882 non-null object
intervention_model         2085 non-null object
masking                    2092 non-null object
primary_purpose            2015 non-null object
gender                     2093 non-null object
healthy_volunteers         2092 non-null object
phase                      2093 non-null object
dtypes: object(8)
memory usage: 147.2+ KB


### Fixup Masking

In [12]:
# Inspect the raw masking values and how many rows carry each one.
dataset.groupby('masking').size()

masking
Double Blind (Caregiver Investigator Outcomes Assessor)               2
Double Blind (Caregiver Investigator)                                 2
Double Blind (Investigator Outcomes Assessor)                         6
Double Blind (Subject Caregiver Investigator Outcomes Assessor)     378
Double Blind (Subject Caregiver Investigator)                        84
Double Blind (Subject Caregiver Outcomes Assessor)                    5
Double Blind (Subject Caregiver)                                      7
Double Blind (Subject Investigator Outcomes Assessor)                91
Double Blind (Subject Investigator)                                 268
Double Blind (Subject Outcomes Assessor)                             35
Double-Blind                                                          7
Open Label                                                         1013
Single Blind                                                          1
Single Blind (Caregiver)                                

In [14]:
# Collapse the detailed masking descriptions (for example
# "Double Blind (Subject Investigator)") into their coarse category.
# Each entry is (prefix to look for, label to store).
masking_prefixes = [
    ('Double Blind', 'Double Blind'),
    ('Double-Blind', 'Double Blind'),
    ('Single Blind', 'Single Blind'),
]

for prefix, label in masking_prefixes:
    # fillna('') keeps the mask strictly boolean, so rows with a missing
    # masking value are never selected and stay missing.
    matches = dataset['masking'].fillna('').str.startswith(prefix)
    # Assign through .loc on the frame itself: the chained form
    # `dataset.masking[mask] = ...` may write to a temporary copy and
    # leave `dataset` unchanged.
    dataset.loc[matches, 'masking'] = label

dataset.groupby('masking').size()

masking
Double Blind     885
Open Label      1013
Single Blind     194
dtype: int64

### Filter Away Classes With $<$ 80 Examples

In [15]:
def filter_sparse_classes(df, columns, threshold=80):
    """Blank out labels belonging to classes with too few examples.

    For every column named in `columns`, the number of rows per class is
    counted; any row whose value is not one of the classes with at least
    `threshold` rows has that column set to None.

    `df` is modified in place and the same object is returned.
    """
    for name in columns:
        class_sizes = df.groupby(name).size()
        large_enough = class_sizes >= threshold
        kept_classes = class_sizes[large_enough].index

        # Everything that is not a kept class (including already-missing
        # values) is overwritten with None.
        to_blank = ~df[name].isin(kept_classes)
        df.loc[to_blank, name] = None

    return df

# filter_sparse_classes writes into the frame it is given and returns that same
# object, so `dataset` itself is filtered here and `df` is a second name for it.
df = filter_sparse_classes(dataset, columns)

### Categorize Data

In [16]:
# Convert every label column to the pandas categorical dtype.
for column in columns:
    dataset[column] = dataset[column].astype('category')

# Show the number of rows per class for each label column.
for column in columns:
    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, whereas the bare `print x` statement is Python 2 only.
    print(dataset.groupby(column).size())
    # Blank separator line; print('') rather than print() because the latter
    # would print "()" on Python 2.
    print('')

allocation
Non-Randomized     228
Randomized        1518
dtype: int64

endpoint_classification
Efficacy Study            642
Safety Study              106
Safety/Efficacy Study    1031
dtype: int64

intervention_model
Crossover Assignment        157
Parallel Assignment        1380
Single Group Assignment     501
dtype: int64

masking
Double Blind     885
Open Label      1013
Single Blind     194
dtype: int64

primary_purpose
Prevention     263
Treatment     1563
dtype: int64

gender
Both      1821
Female     186
Male        86
dtype: int64

healthy_volunteers
Accepts Healthy Volunteers     411
No                            1681
dtype: int64

phase
N/A                410
Phase 1            104
Phase 1/Phase 2     92
Phase 2            530
Phase 3            614
Phase 4            275
dtype: int64



### Binarize the Data

In [17]:
# Build a copy of the labels in which each categorical column is replaced by
# its integer category codes; `dataset` itself is left untouched.
# pandas encodes a missing label as -1.
binarized_labels = dataset.assign(
    **{name: dataset[name].cat.codes for name in columns}
)

binarized_labels

Unnamed: 0,allocation,endpoint_classification,intervention_model,masking,primary_purpose,gender,healthy_volunteers,phase
0,1,0,1,0,1,0,1,4
1,1,0,-1,1,0,0,1,4
2,-1,2,2,1,1,0,1,3
3,1,0,1,0,1,0,1,5
4,1,2,1,0,1,0,1,3
5,1,0,1,1,1,0,1,3
6,1,2,1,1,1,0,1,1
7,-1,-1,2,1,1,1,1,1
8,1,2,1,1,1,1,1,4
9,-1,-1,2,1,1,1,1,3


In [18]:
# Summarise the dtypes and non-null counts of the integer-coded labels.
binarized_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2093 entries, 0 to 2092
Data columns (total 8 columns):
allocation                 2093 non-null int8
endpoint_classification    2093 non-null int8
intervention_model         2093 non-null int8
masking                    2093 non-null int8
primary_purpose            2093 non-null int8
gender                     2093 non-null int8
healthy_volunteers         2093 non-null int8
phase                      2093 non-null int8
dtypes: int8(8)
memory usage: 32.7 KB


### Save It!

In [19]:
composite_labels = dataset
composite_binarized = binarized_labels

%store composite_labels
%store composite_binarized

import pickle
pickle.dump(composite_labels, open('composite_labels.p', 'wb'))
pickle.dump(composite_binarized, open('composite_binarized.p', 'wb'))

Stored 'composite_labels' (DataFrame)
Stored 'composite_binarized' (DataFrame)


In [21]:
# NOTE(review): opens an interactive Qt console attached to this kernel; it looks
# like a leftover debugging aid. Confirm whether it should stay in the saved notebook.
%qtconsole