In [1]:
import pandas as pd 
import pickle

# Load the full training set; later cells expect a 'label' column in it.
train = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [2]:
def downsample(df, pct_pos):
    '''
    Downsample the negative class so positives make up ~pct_pos % of the rows.

    Borrowed from earlier project: https://github.com/kelseymarkey/
    cook-county-mental-health-prediction/blob/master/Final_Data_Prep.py

    Keeps every label == 1 row, then draws a reproducible random sample
    (random_state=22, without replacement) of label == 0 rows so that the
    combined set is as close as possible to pct_pos % positive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'label' column; 1 marks positives, 0 marks negatives.
        Rows with any other label value are ignored.
    pct_pos : int or float
        Target percentage of positive rows, with 0 < pct_pos <= 100.

    Returns
    -------
    list
        Index labels of df for the selected rows: all positives first,
        followed by the sampled negatives.

    Raises
    ------
    ValueError
        If pct_pos is outside (0, 100], or if df does not hold enough
        negative rows to reach the requested balance.
    '''
    if not 0 < pct_pos <= 100:
        raise ValueError(
            'pct_pos must be in the range (0, 100], got {!r}'.format(pct_pos))

    # split into df by label
    label_1 = df[df['label'] == 1]
    label_0 = df[df['label'] == 0]

    # count number of pos
    count_label_1 = len(label_1)

    # compute number of negative cases to sample.
    # Multiply first and round last: rounding the ratio itself (e.g.
    # 70/30 -> 2) would silently produce the wrong class balance for any
    # pct_pos whose negative/positive ratio is not a whole number.
    num_label_0 = int(round(count_label_1 * (100 - pct_pos) / pct_pos))

    # Sampling is without replacement, so fail early with a clear message
    # instead of pandas' generic "larger sample than population" error.
    if num_label_0 > len(label_0):
        raise ValueError(
            'need {} negative rows for pct_pos={}, but only {} are available'
            .format(num_label_0, pct_pos, len(label_0)))

    # sample from negative cases (fixed seed keeps the sample reproducible)
    label_0_sample = label_0.sample(n=num_label_0, random_state=22)

    # combine sampled negative cases with all positive cases
    # (DataFrame.append was removed in pandas 2.0; pd.concat is equivalent)
    downsampled_df = pd.concat([label_1, label_0_sample])

    return list(downsampled_df.index)

In [3]:
# Balance the training set 50/50 and keep only the selected row indices.
downsampled_idx_train = downsample(df=train, pct_pos=50)

In [4]:
# Sanity check: select the rows named by the saved indices and confirm
# that the label counts match the requested class balance.
downsampled_df = train.loc[train.index.isin(downsampled_idx_train)]
downsampled_df['label'].value_counts()

1    25819
0    25819
Name: label, dtype: int64

In [5]:
# Persist the selected row indices so other notebooks can reuse the sample.
with open('../data/downsampled_idx_train.pckl', 'wb') as idx_file:
    pickle.dump(downsampled_idx_train, idx_file)