# AudioBook Business Case (Predicting purchases) using TensorFlow

Predicting purchases of customers based on past data

### Importing Libraries

In [0]:
import numpy as np
from sklearn import preprocessing

### Importing the data

In [0]:
# Read the whole CSV into a single numeric matrix.
raw_csv_data = np.loadtxt('data/Audiobooks_data.csv', delimiter=',')

# Column 0 is the Customer-ID and the last column is the target, so the model
# inputs are every column in between.
first_input_column, target_column = 1, -1
unscaled_inputs_all = raw_csv_data[:, first_input_column:target_column]
targets_all = raw_csv_data[:, target_column]

## Preprocessing

### Balancing the dataset

Targets are the last column (binary): 1 means the customer bought a product within a specified amount of time, and 0 means they haven't.  
This file contains far more 0s than 1s in the target, so we keep only as many 0 rows as there are 1 rows in order to balance the data.

In [0]:
# Number of positive rows; summing works as a count because the targets are 0/1.
number_one_targets = int(np.sum(targets_all))

# Row positions of every 0-target, in file order (vectorized instead of a
# per-row Python loop).
zero_row_indices = np.flatnonzero(targets_all == 0)
# The original loop left the total number of 0-rows in this name; keep it
# available with the same final value.
zero_targets_counter = int(zero_row_indices.size)

# Keep the first `number_one_targets` zeros and mark every later zero for
# removal. If there are not more zeros than ones the slice is empty and
# nothing is removed.
indices_to_remove = zero_row_indices[number_one_targets:]

# Drop the surplus 0-rows from inputs and targets together so they stay aligned.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

### Standardize the inputs

In [0]:
# Standardize the balanced inputs with sklearn's preprocessing.scale (with its
# default arguments each column is centred and scaled to unit variance).
# NOTE(review): this runs before the train/validation/test split, so the
# scaling statistics are computed from all rows, including the ones that later
# land in validation and test — confirm this is acceptable for the evaluation.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

### Shuffle the data

In [0]:
# Fixed seed so the shuffle, and therefore the train/validation/test split, is
# identical on every Restart & Run All. A local RandomState is used so the
# global NumPy random state is left untouched. This only pins the split; it
# does not make model training deterministic.
SHUFFLE_SEED = 42

shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.RandomState(SHUFFLE_SEED).shuffle(shuffled_indices)

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

### Split data into train, validation and test

In [0]:
samples_count = shuffled_inputs.shape[0]

# 80% of the rows go to training and 10% to validation (both truncated to
# whole rows); the test set takes whatever remains.
train_samples_count = int(samples_count*0.8)
validation_samples_count = int(samples_count*0.1)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Consecutive, non-overlapping row ranges over the shuffled data.
validation_end = train_samples_count + validation_samples_count
train_rows = slice(None, train_samples_count)
validation_rows = slice(train_samples_count, validation_end)
test_rows = slice(validation_end, None)

train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]


In [0]:
# Report, for each split, the row count, the number of 1-targets and the
# fraction of rows that are 1s.
split_reports = (
    ("Training: Total rows - {0: .2f}, Total ones - {1:.2f}, Split Percent - {2:.2f}", train_samples_count, train_targets),
    ("Validation: Total rows - {0: .2f}, Total ones - {1:.2f}, Split Percent - {2:.2f}", validation_samples_count, validation_targets),
    ("Testing: Total rows - {0: .2f}, Total ones - {1:.2f}, Split Percent - {2:.2f}", test_samples_count, test_targets),
)

for report_template, row_count, split_targets in split_reports:
    ones_in_split = np.sum(split_targets)
    print(report_template.format(row_count, ones_in_split, ones_in_split/row_count))

Training: Total rows -  3579.00, Total ones - 1794.00, Split Percent - 0.50
Validation: Total rows -  447.00, Total ones - 220.00, Split Percent - 0.49
Testing: Total rows -  448.00, Total ones - 223.00, Split Percent - 0.50


### Saving in .npz format

In [0]:
# Persist each split as its own archive with an `inputs` and a `targets` array.
# np.savez appends the .npz extension to these names.
for archive_name, split_inputs, split_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)

## Creating the Model

### Importing TensorFlow for the machine learning algorithm

In [0]:
import tensorflow as tf

### Importing data from .npz files

In [0]:
def _load_split(npz_path):
    """Load one saved split and return it as ``(inputs, targets)``.

    Inputs are cast to float and targets to int. The builtin ``float`` and
    ``int`` are used because the ``np.float`` / ``np.int`` aliases (which
    simply pointed at those builtins) were deprecated in NumPy 1.20 and
    removed in NumPy 1.24, where they raise AttributeError.

    The archive is opened in a ``with`` block so the underlying file handle
    is closed as soon as both arrays have been read.
    """
    with np.load(npz_path) as npz:
        return npz['inputs'].astype(float), npz['targets'].astype(int)


# Relative paths: these are the files written by the np.savez calls in the
# "Saving in .npz format" cell into the current working directory, so the
# notebook no longer depends on running from /content.
train_inputs, train_targets = _load_split('Audiobooks_data_train.npz')
validation_inputs, validation_targets = _load_split('Audiobooks_data_validation.npz')
test_inputs, test_targets = _load_split('Audiobooks_data_test.npz')


### Customizing the Model

In [0]:
# Layer sizes. `input_size` is declared but not referenced anywhere in this
# cell: no input shape is passed to the model below.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Two ReLU hidden layers followed by a softmax over `output_size` classes.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

# The sparse variant of categorical cross-entropy takes integer class labels
# rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

BATCH_SIZE = 100
MAX_EPOCHS = 100

# Stop once the validation loss has failed to improve for 3 consecutive
# epochs. restore_best_weights=True rolls the model back to the epoch with the
# best validation loss; without it the model keeps the weights of the last
# epoch run, which is up to `patience` epochs past the best one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

model.fit(train_inputs,
          train_targets,
          batch_size=BATCH_SIZE,
          epochs=MAX_EPOCHS,
          callbacks=[early_stopping],
          validation_data=(validation_inputs, validation_targets),
          verbose=2
          )

Epoch 1/100
36/36 - 0s - loss: 0.5400 - accuracy: 0.7823 - val_loss: 0.4394 - val_accuracy: 0.8523
Epoch 2/100
36/36 - 0s - loss: 0.3593 - accuracy: 0.8762 - val_loss: 0.3453 - val_accuracy: 0.8725
Epoch 3/100
36/36 - 0s - loss: 0.3075 - accuracy: 0.8857 - val_loss: 0.3278 - val_accuracy: 0.8814
Epoch 4/100
36/36 - 0s - loss: 0.2895 - accuracy: 0.8910 - val_loss: 0.3115 - val_accuracy: 0.8814
Epoch 5/100
36/36 - 0s - loss: 0.2790 - accuracy: 0.8938 - val_loss: 0.3049 - val_accuracy: 0.8837
Epoch 6/100
36/36 - 0s - loss: 0.2702 - accuracy: 0.9000 - val_loss: 0.2878 - val_accuracy: 0.8881
Epoch 7/100
36/36 - 0s - loss: 0.2619 - accuracy: 0.9011 - val_loss: 0.2892 - val_accuracy: 0.8881
Epoch 8/100
36/36 - 0s - loss: 0.2561 - accuracy: 0.9036 - val_loss: 0.2858 - val_accuracy: 0.8926
Epoch 9/100
36/36 - 0s - loss: 0.2542 - accuracy: 0.9028 - val_loss: 0.2802 - val_accuracy: 0.8949
Epoch 10/100
36/36 - 0s - loss: 0.2509 - accuracy: 0.9025 - val_loss: 0.2789 - val_accuracy: 0.8881
Epoch 11/

<tensorflow.python.keras.callbacks.History at 0x7fd5a9e93518>

### Testing

In [0]:
# Evaluate the trained model on the test split. The result is unpacked as the
# loss followed by the accuracy metric the model was compiled with.
test_loss, test_accuracy = model.evaluate(test_inputs,test_targets)



In [0]:
# test_accuracy is multiplied by 100 so the value matches the trailing '%'.
print("Testing loss: {0: .2f}, Testing Accuracy: {1: .2f}%" .format(test_loss, test_accuracy*100))

Tesing loss:  0.25, Testing Accuracy:  90.85%


## Conclusion

We obtained around 90% accuracy on the test set when predicting customer purchases.