# Customer Analytics

We have a dataset of audiobook purchase information from various customers. Our goal is to identify if a customer is likely to purchase an audiobook again.

In [21]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

raw_csv_data = np.loadtxt('Audiobooks_data.csv',delimiter=',')
unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

## Balancing the Dataset
We have too many customers who did not convert after purchasing an audiobook, we need to balance the dataset

In [22]:
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = [] 

for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

## Standardize the inputs

In [23]:
scaler_deep_learning = StandardScaler()
scaled_inputs = scaler_deep_learning.fit_transform(unscaled_inputs_equal_priors)

## Shuffle the data

In [24]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

## Splitting the fata into train, test and validation

In [25]:
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8*samples_count)
validations_samples_count = int(0.1*samples_count)
test_samples_count = samples_count-validations_samples_count-train_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

val_inputs = shuffled_inputs[train_samples_count:train_samples_count+validations_samples_count]
val_targets = shuffled_targets[train_samples_count:train_samples_count+validations_samples_count]


test_inputs = shuffled_inputs[train_samples_count+validations_samples_count:]
test_targets = shuffled_targets[train_samples_count+validations_samples_count:]

print(np.sum(train_targets), train_samples_count, np.sum(train_targets)/train_samples_count)
print(np.sum(val_targets), validations_samples_count, np.sum(val_targets)/validations_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets)/test_samples_count)

1795.0 3579 0.5015367421067337
219.0 447 0.4899328859060403
223.0 448 0.49776785714285715


## Save the three datasets in *.npz

In [26]:
np.savez('Audiobooks_data_train',inputs=train_inputs,targets=train_targets)
np.savez('Audiobooks_data_validation',inputs=val_inputs,targets=val_targets)
np.savez('Audiobooks_data_test',inputs=test_inputs,targets=test_targets)

## Creating the Machine Learning model

In [27]:
import numpy as np
import tensorflow as tf

In [28]:
#load data from .npz files if necessary

input_size=10
output_size=2
hidden_layer_size=50

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), #verify sigmoid/tanh as well
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')    
])

model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

batch_size=100
max_epochs=100

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,
         train_targets,
         batch_size=batch_size,
         epochs=max_epochs,
         callbacks=[early_stopping],
         validation_data=(val_inputs,val_targets),
         verbose=2
         )

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 0s - loss: 0.6059 - acc: 0.6655 - val_loss: 0.5171 - val_acc: 0.7405
Epoch 2/100
3579/3579 - 0s - loss: 0.4657 - acc: 0.7544 - val_loss: 0.4503 - val_acc: 0.7651
Epoch 3/100
3579/3579 - 0s - loss: 0.4116 - acc: 0.7765 - val_loss: 0.4107 - val_acc: 0.7785
Epoch 4/100
3579/3579 - 0s - loss: 0.3842 - acc: 0.7960 - val_loss: 0.4024 - val_acc: 0.7830
Epoch 5/100
3579/3579 - 0s - loss: 0.3714 - acc: 0.7963 - val_loss: 0.3852 - val_acc: 0.7919
Epoch 6/100
3579/3579 - 0s - loss: 0.3607 - acc: 0.8055 - val_loss: 0.3857 - val_acc: 0.7830
Epoch 7/100
3579/3579 - 0s - loss: 0.3539 - acc: 0.8053 - val_loss: 0.3803 - val_acc: 0.7830
Epoch 8/100
3579/3579 - 0s - loss: 0.3478 - acc: 0.8094 - val_loss: 0.3628 - val_acc: 0.7942
Epoch 9/100
3579/3579 - 0s - loss: 0.3435 - acc: 0.8086 - val_loss: 0.3590 - val_acc: 0.7964
Epoch 10/100
3579/3579 - 0s - loss: 0.3401 - acc: 0.8139 - val_loss: 0.3743 - val_acc: 0.7987
Epoch 11/100
3579/3579

<tensorflow.python.keras.callbacks.History at 0x1a2dbecdd0>

In [29]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



## Probability of a customer to purchase an audiobook

In [30]:
model.predict(test_inputs).round(2)
#first column depicts probability of not purchasing, and 2nd column shows probability of purchasing

array([[0.99, 0.01],
       [0.05, 0.95],
       [0.58, 0.42],
       [0.24, 0.76],
       [0.99, 0.01],
       [0.44, 0.56],
       [0.67, 0.33],
       [0.35, 0.65],
       [0.31, 0.69],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.48, 0.52],
       [0.59, 0.41],
       [0.99, 0.01],
       [0.62, 0.38],
       [1.  , 0.  ],
       [0.39, 0.61],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.2 , 0.8 ],
       [0.35, 0.65],
       [0.01, 0.99],
       [0.62, 0.38],
       [0.62, 0.38],
       [0.41, 0.59],
       [0.52, 0.48],
       [0.32, 0.68],
       [0.  , 1.  ],
       [0.58, 0.42],
       [0.58, 0.42],
       [0.01, 0.99],
       [0.35, 0.65],
       [0.36, 0.64],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.38, 0.62],
       [0.57, 0.43],
       [0.33, 0.67],
       [0.7 , 0.3 ],
       [0.4 , 0.6 ],
       [0.62, 0.38],
       [0.97, 0.03],
       [0.64, 0.36],
       [0.29, 0.71],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.39, 0.61],
       [0.44,