In [65]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [66]:
train_df = pd.read_csv("working_data/trial_promo_training_original.csv")
test_df = pd.read_csv("working_data/trial_promo_testing.csv")

In [67]:
features_to_scale = ["age", "income", "avbal", "avtrans"]
columns_to_drop_for_nn = ["sexNum", "mstatusNum", "occupationNum", "educationNum", "childrenNum"]
min_max_scaler = preprocessing.MinMaxScaler()
train_df[features_to_scale] = min_max_scaler.fit_transform(train_df[features_to_scale])

train_df.head()

Unnamed: 0,age,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.481241,0.120073,0.273538,0.324016,,0,1,6,0,2,...,0,1,0,0,0,0,0,1,0,0
1,0.035354,0.120073,0.273538,0.324016,,1,2,1,2,0,...,0,0,0,1,0,1,0,0,0,0
2,0.490188,0.026902,0.257021,0.1516,,0,1,0,2,0,...,0,0,0,1,0,1,0,0,0,0
3,0.111688,0.120073,0.273538,0.324016,A,1,1,1,2,0,...,0,0,0,1,0,1,0,0,0,0
4,0.135354,0.193942,0.260347,0.138515,,1,2,2,3,0,...,0,0,0,0,1,1,0,0,0,0


In [68]:
X_train = train_df.loc[:, train_df.columns != "decision"]
X_train = X_train.drop(columns_to_drop_for_nn, axis=1)
y_train = train_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)
X_train.head()

Unnamed: 0,age,income,avbal,avtrans,sex_F,sex_M,mstatus_divorced,mstatus_married,mstatus_single,mstatus_widowed,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.481241,0.120073,0.273538,0.324016,1,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0.035354,0.120073,0.273538,0.324016,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
2,0.490188,0.026902,0.257021,0.1516,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0.111688,0.120073,0.273538,0.324016,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
4,0.135354,0.193942,0.260347,0.138515,0,1,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0


In [69]:
y_train.head()

0    0
1    0
2    0
3    1
4    0
Name: decision, dtype: int64

In [70]:
test_df[features_to_scale] = min_max_scaler.transform(test_df[features_to_scale])
test_df.head()

X_test = test_df.loc[:, test_df.columns != "decision"]
X_test = X_test.drop(columns_to_drop_for_nn, axis=1)
y_test = test_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)
X_test.head()

Unnamed: 0,age,income,avbal,avtrans,sex_F,sex_M,mstatus_divorced,mstatus_married,mstatus_single,mstatus_widowed,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.704906,0.080506,0.063522,0.100964,1,0,0,1,0,0,...,1,0,0,1,0,1,0,0,0,0
1,0.139105,0.08853,0.232708,0.122881,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
2,0.514719,0.300505,0.33876,0.089771,1,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0.148052,0.346235,0.180242,0.390105,1,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
4,0.42583,0.5569,0.231039,0.484708,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


### Neural network

In [71]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Training accuracy

In [72]:
clf = MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 8, 4), random_state= 42)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
print(confusion_matrix(y_train, y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[593  25   3]
 [ 37  63   0]
 [ 10   0  19]]
0.9


### Testing accuracy

In [73]:
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2]))
print(accuracy_score(y_test, y_test_pred))

[[181  20   6]
 [ 24   9   0]
 [  8   0   2]]
0.768


In [74]:
from sklearn.ensemble import RandomForestClassifier
columns_for_rf = ["age", "income", "avbal", "avtrans", "sexNum", "mstatusNum", "occupationNum", "educationNum", "childrenNum"]
X_train = train_df[columns_for_rf]
y_train = train_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)

X_train.head()

Unnamed: 0,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum
0,0.481241,0.120073,0.273538,0.324016,0,1,6,0,2
1,0.035354,0.120073,0.273538,0.324016,1,2,1,2,0
2,0.490188,0.026902,0.257021,0.1516,0,1,0,2,0
3,0.111688,0.120073,0.273538,0.324016,1,1,1,2,0
4,0.135354,0.193942,0.260347,0.138515,1,2,2,3,0


In [78]:
clf = RandomForestClassifier(max_depth=2, random_state=42, n_estimators=1000)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
print(y_train_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [76]:
print(confusion_matrix(y_train, y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[621   0   0]
 [100   0   0]
 [ 29   0   0]]
0.828


In [77]:
X_test = test_df[columns_for_rf]
y_test = test_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)

y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))

[[207   0   0]
 [ 33   0   0]
 [ 10   0   0]]
0.828
