# Customer Conversion Prediction

In [1]:
import pandas as pd 
import numpy as np 
import os
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv(os.path.join('data', 'audiobook_data.csv'))

In [3]:
df.head()

Unnamed: 0,id,book_length_(mins)_overal,book_length_(min)_avg,price_overall,price_avg,review,review_10_10,minutes_listened,completion,support_requests,last_visited_minus_purchase_date,target
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [4]:
print('duplicated values:', df.duplicated().sum())
print('null values:', df.isna().sum().sum())

duplicated values: 0
null values: 0


Dataset has class imbalance for the target value.

In [5]:
df.target.value_counts()

0    11847
1     2237
Name: target, dtype: int64

### Downsampling the 0 class

In [6]:
from sklearn.utils import resample

In [7]:
target_0_downsampled = resample(df[df.target == 0], 
                        n_samples=len(df[df.target == 1]),
                        replace=False,
                        random_state=42)
target_1 = df[df.target == 1]

In [8]:
df_balanced = pd.concat([target_0_downsampled, target_1]).sample(frac=1)

In [9]:
# dropping id since it's not helpful in modeling
df_balanced.target.value_counts()

1    2237
0    2237
Name: target, dtype: int64

In [10]:
df_balanced.drop(['id'], axis=1, inplace=True)


## Scaling the inputs

In [11]:
ss = StandardScaler()

X=df_balanced.drop('target', axis=1)
X = ss.fit_transform(X)
y = df_balanced.target.values

## Train/Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.05, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=42)

In [13]:
X_train.shape, X_test.shape, X_val.shape

((4037, 10), (224, 10), (213, 10))

In [14]:
import tensorflow as tf
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [15]:
output_size=2
batch_size = 100
max_epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=max_epochs,
          callbacks=[early_stopping],
          validation_data=(X_val, y_val),
          verbose=2
          );

Train on 4037 samples, validate on 213 samples
Epoch 1/100
4037/4037 - 0s - loss: 0.6150 - accuracy: 0.6611 - val_loss: 0.5030 - val_accuracy: 0.7371
Epoch 2/100
4037/4037 - 0s - loss: 0.4657 - accuracy: 0.7644 - val_loss: 0.4391 - val_accuracy: 0.7512
Epoch 3/100
4037/4037 - 0s - loss: 0.4102 - accuracy: 0.7785 - val_loss: 0.4087 - val_accuracy: 0.7793
Epoch 4/100
4037/4037 - 0s - loss: 0.3908 - accuracy: 0.7954 - val_loss: 0.4055 - val_accuracy: 0.8028
Epoch 5/100
4037/4037 - 0s - loss: 0.3793 - accuracy: 0.7989 - val_loss: 0.4040 - val_accuracy: 0.7981
Epoch 6/100
4037/4037 - 0s - loss: 0.3740 - accuracy: 0.8011 - val_loss: 0.4000 - val_accuracy: 0.7934
Epoch 7/100
4037/4037 - 0s - loss: 0.3689 - accuracy: 0.8006 - val_loss: 0.4097 - val_accuracy: 0.7793
Epoch 8/100
4037/4037 - 0s - loss: 0.3645 - accuracy: 0.8033 - val_loss: 0.3913 - val_accuracy: 0.7887
Epoch 9/100
4037/4037 - 0s - loss: 0.3635 - accuracy: 0.8048 - val_loss: 0.3935 - val_accuracy: 0.8028
Epoch 10/100
4037/4037 - 0

## Evaluation of the model on the test dataset

In [16]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Test Loss: %.2f' %test_loss)
print('Test Accuracy: %.2f' %(test_accuracy*100))

Test Loss: 0.35
Test Accuracy: 81.70


### Results Interpretation

Test Results show that near 8 out of every 10 clients are classified correctly in their conversion. 

## Obtain the probability for a customer to convert

In [17]:
np.round(model.predict(X_test),2)
#resulting data shows the probability of the clients not converting ad converting, so we can choose the second column

array([[0.44, 0.56],
       [0.99, 0.01],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.24, 0.76],
       [0.  , 1.  ],
       [0.38, 0.62],
       [0.38, 0.62],
       [0.24, 0.76],
       [0.7 , 0.3 ],
       [0.7 , 0.3 ],
       [0.16, 0.84],
       [0.6 , 0.4 ],
       [0.  , 1.  ],
       [0.12, 0.88],
       [1.  , 0.  ],
       [0.23, 0.77],
       [0.86, 0.14],
       [0.9 , 0.1 ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.79, 0.21],
       [0.  , 1.  ],
       [0.5 , 0.5 ],
       [0.7 , 0.3 ],
       [0.76, 0.24],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.29, 0.71],
       [0.14, 0.86],
       [0.04, 0.96],
       [0.  , 1.  ],
       [0.72, 0.28],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.59, 0.41],
       [1.  , 0.  ],
       [0.5 , 0.5 ],
       [0.7 , 0.3 ],
       [0.59, 0.41],
       [1.  , 0.  ],
       [0.17, 0.83],
       [0.01, 0.99],
       [0.59,

We would like to know the column of the class for which the observation yields the highest probability and not the probability of getting one. By utilizing arguments of the Maxima we can find the index where we find the highest argument.

In [18]:
model.predict_classes(X_test)

array([1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0], dtype=int64)

## Saving the model

In [19]:
try:
    os.mkdir('model')
except:
    pass
model.save(os.path.join('model', 'audiobooks_model.h5'))

In [20]:
## load model
# model = tf.keras.models.load_model(os.path.join('model', 'audiobooks_model.h5'))