In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
train_df = pd.read_csv("working_data/trial_promo_training_original.csv")
test_df = pd.read_csv("working_data/trial_promo_testing.csv")

In [3]:
features_to_scale = ["age", "income", "avbal", "avtrans"]
columns_to_drop_for_nn = ["sexNum", "mstatusNum", "occupationNum", "educationNum", "childrenNum"]
min_max_scaler = preprocessing.MinMaxScaler()
train_df[features_to_scale] = min_max_scaler.fit_transform(train_df[features_to_scale])

train_df.head()

Unnamed: 0,age,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.481241,0.120073,0.273538,0.324016,,0,1,6,0,2,...,0,1,0,0,0,0,0,1,0,0
1,0.035354,0.120073,0.273538,0.324016,,1,2,1,2,0,...,0,0,0,1,0,1,0,0,0,0
2,0.490188,0.026902,0.257021,0.1516,,0,1,0,2,0,...,0,0,0,1,0,1,0,0,0,0
3,0.111688,0.120073,0.273538,0.324016,A,1,1,1,2,0,...,0,0,0,1,0,1,0,0,0,0
4,0.135354,0.193942,0.260347,0.138515,,1,2,2,3,0,...,0,0,0,0,1,1,0,0,0,0


In [4]:
X_train = train_df.loc[:, train_df.columns != "decision"]
X_train = X_train.drop(columns_to_drop_for_nn, axis=1)
y_train = train_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)
X_train.head()

Unnamed: 0,age,income,avbal,avtrans,sex_F,sex_M,mstatus_divorced,mstatus_married,mstatus_single,mstatus_widowed,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.481241,0.120073,0.273538,0.324016,1,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0.035354,0.120073,0.273538,0.324016,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
2,0.490188,0.026902,0.257021,0.1516,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0.111688,0.120073,0.273538,0.324016,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
4,0.135354,0.193942,0.260347,0.138515,0,1,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0


In [5]:
y_train.head()

0    0
1    0
2    0
3    1
4    0
Name: decision, dtype: int64

In [6]:
test_df[features_to_scale] = min_max_scaler.transform(test_df[features_to_scale])
test_df.head()

X_test = test_df.loc[:, test_df.columns != "decision"]
X_test = X_test.drop(columns_to_drop_for_nn, axis=1)
y_test = test_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)
X_test.head()

Unnamed: 0,age,income,avbal,avtrans,sex_F,sex_M,mstatus_divorced,mstatus_married,mstatus_single,mstatus_widowed,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.704906,0.080506,0.063522,0.100964,1,0,0,1,0,0,...,1,0,0,1,0,1,0,0,0,0
1,0.139105,0.08853,0.232708,0.122881,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
2,0.514719,0.300505,0.33876,0.089771,1,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0.148052,0.346235,0.180242,0.390105,1,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
4,0.42583,0.5569,0.231039,0.484708,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


### Neural network

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Training accuracy

In [8]:
clf = MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 8, 4), random_state= 42)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
print(confusion_matrix(y_train, y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[593  25   3]
 [ 37  63   0]
 [ 10   0  19]]
0.9


### Testing accuracy

In [9]:
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2]))
print(accuracy_score(y_test, y_test_pred))

[[181  20   6]
 [ 24   9   0]
 [  8   0   2]]
0.768


In [10]:
from sklearn.ensemble import RandomForestClassifier
columns_for_rf = ["age", "income", "avbal", "avtrans", "sexNum", "mstatusNum", "occupationNum", "educationNum", "childrenNum"]
X_train = train_df[columns_for_rf]
y_train = train_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)

X_train.head()

Unnamed: 0,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum
0,0.481241,0.120073,0.273538,0.324016,0,1,6,0,2
1,0.035354,0.120073,0.273538,0.324016,1,2,1,2,0
2,0.490188,0.026902,0.257021,0.1516,0,1,0,2,0
3,0.111688,0.120073,0.273538,0.324016,1,1,1,2,0
4,0.135354,0.193942,0.260347,0.138515,1,2,2,3,0


In [11]:
clf = RandomForestClassifier(max_depth=2, random_state=42, n_estimators=1000)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
print(y_train_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [12]:
print(confusion_matrix(y_train, y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[621   0   0]
 [100   0   0]
 [ 29   0   0]]
0.828


In [13]:
X_test = test_df[columns_for_rf]
y_test = test_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)

y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))

[[207   0   0]
 [ 33   0   0]
 [ 10   0   0]]
0.828


### Build a neural network using Keras

In [15]:
import keras
from keras.models import Sequential
from keras.layers import Dense
# fix random seed for reproducibility
np.random.seed(7)

In [16]:
# create model
num_features = X_train.shape[1]
model = Sequential()
model.add(Dense(num_features + 4, input_dim=num_features, activation='relu'))
model.add(Dense(num_features//2, activation='relu'))
model.add(Dense(3, activation='sigmoid'))

In [17]:
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
y_train_one_hot = keras.utils.to_categorical(y_train, num_classes=3)

In [22]:
model.fit(X_train, y_train_one_hot, epochs=50, batch_size=10)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1dc98408160>

In [23]:
y_test_one_hot = keras.utils.to_categorical(y_test, num_classes=3)

# evaluate the model
scores = model.evaluate(X_test, y_test_one_hot)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


acc: 82.80%


In [31]:
keras_nn_predictions = model.predict(X_test)
len(keras_nn_predictions)
keras_nn_predictions = np.argmax(keras_nn_predictions, axis=1)
len(keras_nn_predictions)
keras_nn_predictions
print(confusion_matrix(y_test, keras_nn_predictions, labels=[0, 1, 2]))
print(accuracy_score(y_test, keras_nn_predictions))

[[207   0   0]
 [ 33   0   0]
 [ 10   0   0]]
