<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Read-data" data-toc-modified-id="Read-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read data</a></span></li><li><span><a href="#Balance-data-(equalize-priors)" data-toc-modified-id="Balance-data-(equalize-priors)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Balance data (equalize priors)</a></span></li><li><span><a href="#Standardize-input" data-toc-modified-id="Standardize-input-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Standardize input</a></span></li><li><span><a href="#Shuffle-data" data-toc-modified-id="Shuffle-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Shuffle data</a></span></li><li><span><a href="#Split-data-(train,-validation,-test)" data-toc-modified-id="Split-data-(train,-validation,-test)-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Split data (train, validation, test)</a></span></li><li><span><a href="#Save-as-.npz" data-toc-modified-id="Save-as-.npz-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Save as .npz</a></span></li></ul></div>

# Imports

In [1]:
import numpy as np
from sklearn import preprocessing

# Read data

In [2]:
fname = '../data/csv/Audiobooks_data.csv'
data = np.loadtxt(fname, delimiter=',')

X = data[:, 1:-1]
y = data[:, -1]

# Balance data (equalize priors)

In [3]:
num_ones_y = int(np.sum(y))

zero_counts_y = 0
idx_remove = []

for i in range(y.shape[0]):
    if y[i] == 0:
        zero_counts_y += 1
        if zero_counts_y > num_ones_y:
            idx_remove.append(i)
            
X = np.delete(X, idx_remove, axis=0)
y = np.delete(y, idx_remove, axis=0)

# Standardize input

In [4]:
X = preprocessing.scale(X)

# Shuffle data

In [5]:
idx = np.arange(X.shape[0])
np.random.shuffle(idx)

X = X[idx]
y = y[idx]

# Split data (train, validation, test)

In [6]:
N = X.shape[0]

n_train = int(0.8 * N)
n_valid = int(0.1 * N)
n_test = N - n_train - n_valid

X_train = X[:n_train]
y_train = y[:n_train]

X_valid = X[n_train: n_train+n_valid]
y_valid = y[n_train: n_train+n_valid]

X_test = X[n_train+n_valid:]
y_test = y[n_train+n_valid:]

print('Targets (y)')
print('Counts  sum  ratio')
print(n_train, np.sum(y_train), np.sum(y_train/n_train))
print(n_valid, np.sum(y_valid), np.sum(y_valid/n_valid))
print(n_test, np.sum(y_valid), np.sum(y_test/n_test))

Targets (y)
Counts  sum  ratio
3579 1783.0 0.49818385023749645
447 218.0 0.4876957494407158
448 218.0 0.5267857142857142


# Save as .npz

In [7]:
np.savez('../data/outputs/Audiobooks_data_train',      inputs=X_train, targets=y_train)
np.savez('../data/outputs/Audiobooks_data_validation', inputs=X_valid, targets=y_valid)
np.savez('../data/outputs/Audiobooks_data_test',       inputs=X_test,  targets=y_test)