## Import relevant libraries

In [5]:
import numpy as np
import pandas as pd 
from sklearn import preprocessing

## Extract the data from the csv   

In [6]:
# i'll use the sklearn capabilites for stndardizing the inputs
data = pd.read_csv('Audiobooks_data.csv')

raw_csv_data = np.loadtxt('Audiobooks_data.csv',delimiter=',')
# excludng the ID and the targets 
unscaled_inputs_all = raw_csv_data[:,1:-1]
target_all = raw_csv_data[:,-1]

data.columns = ['id','book length avg','book length','price avg','price overall','review', 'review 10/10','Minutes listend','completion','support_request', 'last_visited minus purchase date', 'targets']
data.to_csv("datasets.csv", index=False)
data

Unnamed: 0,id,book length avg,book length,price avg,price overall,review,review 10/10,Minutes listend,completion,support_request,last_visited minus purchase date,targets
0,1143,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0,0
1,2059,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388,0
2,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
3,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0
4,3416,2160.0,2160,4.61,4.61,0,8.91,0.00,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14078,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4,0
14079,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29,0
14080,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0
14081,32832,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90,0


## Balancing the dataset 

In [7]:

# 1. we will count the number of targets that are 1
# we will keep as many 0 as 1 ( we will delete the others )

# if we sum all the targets we willget the number of targets that are 1 
num_one_targets = int(np.sum(target_all))


zero_targets_counter = 0
indices_to_remove = []

# iterate over the dataset and balance it 
# if the target at position i is 0, and the number of zeros is bigger than the number of 1, 
# i'll know the indices of all data points to be removed
for i in range( target_all.shape[0] ):
    if target_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter >  num_one_targets:
        # append() is a method (adds) an object to a list
            indices_to_remove.append(i)

# np.delete(array, obj to delete, axis) is a method that deletes an object along an axis
# delete indices from input 
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
# delete indices from output
targets_equal_priors = np.delete(target_all, indices_to_remove, axis=0)


## Standardize the inputs 

In [9]:

# standardize or scale the inputs  
# preprocessing,scale(X) is a method that standardizes an array along an axis 
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

## Shuffle the data  

In [10]:
# A little trick is to shuffle the inputs and the targets. 
# we keep the same information but in a random order 

# pb data set was collected in the order of date
# Since we will be batching, we actually must shuffle the data

# np.arange([start],stop) is a method that returns a evenly spaced values within a given interval   
# np.shuffle 

shuffled_indices = np.arange(scaled_inputs.shape[0])

print(" before shuffle :", shuffled_indices)
np.random.shuffle(shuffled_indices)

print(" after shuffle  :" , shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

 before shuffle : [   0    1    2 ... 4471 4472 4473]
 after shuffle  : [1081 2959 2774 ... 2746  254 2778]


## split the dataset into train, validation and test  

In [11]:
samples_count = shuffled_inputs.shape[0]

print(samples_count)

train_sample_count = int(0.8*samples_count)
validation_sample_count = int(0.1*samples_count)
test_sample_count = samples_count - train_sample_count -validation_sample_count

# let's extract them from the dataset 
# train 
train_inputs = shuffled_inputs[:train_sample_count]
train_targets = shuffled_targets[:train_sample_count]

# validation
validation_inputs = shuffled_inputs[ train_sample_count:train_sample_count + validation_sample_count ]
validation_targets = shuffled_targets[ train_sample_count:train_sample_count + validation_sample_count ]

# test 
test_inputs = shuffled_inputs[ train_sample_count+validation_sample_count:]
test_targets = shuffled_targets[ train_sample_count+validation_sample_count:]

# All three sets are balanced

# print(np.sum(train_targets),train_sample_count,np.sum(train_targets)/train_sample_count)
# print(np.sum(validation_targets),validation_sample_count,np.sum(validation_targets)/validation_sample_count)
# print(np.sum(test_targets),test_sample_count,np.sum(test_targets)/test_sample_count)


4474


## Save the three datasets *.npz 

In [6]:

np.savez('Audiobooks_data_train', input= train_inputs, target=train_targets)
np.savez('Audiobooks_data_validation', input= validation_inputs, target=validation_targets)
np.savez('Audiobooks_data_test', input= test_inputs, target=test_targets)

array([[ 0.21053387, -0.18888517, -0.39082475, ...,  4.3345311 ,
        -0.20183481,  0.55225174],
       [-0.64419501, -0.67316726, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [-0.64419501, -0.67316726, -0.26579083, ..., -0.41569922,
        -0.20183481, -0.80255852],
       ...,
       [-0.64419501, -0.67316726, -0.39082475, ..., -0.37699364,
        -0.20183481, -0.75382433],
       [ 0.31737498,  1.7482432 , -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [-0.55872213,  4.89607679, -0.09140143, ..., -0.41569922,
        -0.20183481, -0.78306484]])