In [1]:
import pandas as pd 
import pickle
import numpy as np

# Load the training table from CSV; later cells read its 'label' column.
train = pd.read_csv('../data/train.csv')

In [2]:
labels = train['label'].values

# Positional indices (row positions within `labels`) of each class's observations
i_class0 = np.where(labels == 0)[0]
i_class1 = np.where(labels == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)
print('Original number of samples in class 0:', n_class0)
print('Original number of samples in class 1:', n_class1)

# Balance the classes: for every observation in class 0, draw one class 1
# index at random with replacement. The seed is fixed so the draw is repeatable.
np.random.seed(0)
i_class1_upsampled = np.random.choice(i_class1, size=n_class0, replace=True)

# This is the size of the *upsampled class 1*; class 0 is left untouched.
print('New number of samples in class 1:', len(i_class1_upsampled))
print(i_class1_upsampled)

# Join class 1's upsampled indices with class 0's original indices
upsampled_idx_train = np.concatenate((i_class1_upsampled, i_class0))
print('Total number of samples (upsampled class 1 + class 0):', len(upsampled_idx_train))

Original number of samples in class 0: 225055
Original number of samples in class 1: 25819
New number of samples in class 0: 225055
[ 27761 107310  97713 ...  68168 242169 108471]
Total number of samples (upsampled class 0 + class 1): 450110


In [3]:
# Sanity check: count the distinct class 1 indices that were drawn.
# With 225055 draws (with replacement) from 25819 candidates, each index is
# missed with probability ~exp(-225055 / 25819) ~= 1.6e-4, so only a handful
# (~4) of the 25819 are expected to be absent.
np.unique(i_class1_upsampled).size

25815

In [4]:
# Check that the indices actually return the right distribution of labels.
# The indices come from np.where, so they are row *positions*; select with
# .iloc (positional) rather than .loc (label-based), which would only agree
# while `train` keeps its default 0..n-1 index.
train_up = train.iloc[upsampled_idx_train]
train_up['label'].value_counts()

1    225055
0    225055
Name: label, dtype: int64

In [5]:
# Persist the balanced (upsampled class 1 + class 0) index array to disk.
with open('../data/idx_train.pckl', 'wb') as idx_file:
    pickle.dump(upsampled_idx_train, idx_file)