In [5]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [6]:
train_df = pd.read_csv("working_data/trial_promo_training.csv")
test_df = pd.read_csv("working_data/trial_promo_testing.csv")

In [7]:
train_df["age"] = preprocessing.scale(train_df["age"])
train_df["income"] = preprocessing.scale(train_df["income"])
train_df["avbal"] = preprocessing.scale(train_df["avbal"])
train_df["avtrans"] = preprocessing.scale(train_df["avtrans"])
train_df.head()

Unnamed: 0,age,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.694301,-0.713177,-0.068324,0.547857,,0,1,6,0,2,...,0,1,0,0,0,0,0,1,0,0
1,-1.466174,-0.713177,-0.068324,0.547857,,1,2,1,2,0,...,0,0,0,1,0,1,0,0,0,0
2,0.73765,-1.267439,-0.178102,-0.509414,,0,1,0,2,0,...,0,0,0,1,0,1,0,0,0,0
3,-1.096306,-0.713177,-0.068324,0.547857,A,1,1,1,2,0,...,0,0,0,1,0,1,0,0,0,0
4,-0.98164,-0.273744,-0.155999,-0.589653,,1,2,2,3,0,...,0,0,0,0,1,1,0,0,0,0


In [8]:
X_train = train_df.loc[:, train_df.columns != "decision"]
y_train = train_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)
X_train.head()

Unnamed: 0,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,sex_F,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.694301,-0.713177,-0.068324,0.547857,0,1,6,0,2,1,...,0,1,0,0,0,0,0,1,0,0
1,-1.466174,-0.713177,-0.068324,0.547857,1,2,1,2,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0.73765,-1.267439,-0.178102,-0.509414,0,1,0,2,0,1,...,0,0,0,1,0,1,0,0,0,0
3,-1.096306,-0.713177,-0.068324,0.547857,1,1,1,2,0,0,...,0,0,0,1,0,1,0,0,0,0
4,-0.98164,-0.273744,-0.155999,-0.589653,1,2,2,3,0,0,...,0,0,0,0,1,1,0,0,0,0


In [9]:
y_train.head()

0    0
1    0
2    0
3    1
4    0
Name: decision, dtype: int64

In [10]:
test_df["age"] = preprocessing.scale(test_df["age"])
test_df["income"] = preprocessing.scale(test_df["income"])
test_df["avbal"] = preprocessing.scale(test_df["avbal"])
test_df["avtrans"] = preprocessing.scale(test_df["avtrans"])
test_df.head()

X_test = test_df.loc[:, test_df.columns != "decision"]
y_test = test_df["decision"].replace("None", 0).replace("A", 1).replace("B", 2)
X_test.head()

Unnamed: 0,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,sex_F,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,1.694579,-1.055675,-1.348957,-0.814692,0,1,8,2,0,1,...,1,0,0,1,0,1,0,0,0,0
1,-1.051218,-1.009078,-0.330144,-0.691148,1,2,4,2,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0.77161,0.222033,0.308487,-0.877786,0,0,0,3,2,1,...,0,0,0,0,1,0,0,1,0,0
3,-1.007801,0.48762,-0.646084,0.815205,0,2,3,3,0,1,...,0,0,0,0,1,1,0,0,0,0
4,0.340238,1.71112,-0.340193,1.348488,0,0,6,1,0,1,...,0,0,1,0,0,1,0,0,0,0


### Neural network

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [9]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)


y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2]))
print(accuracy_score(y_test, y_test_pred))

[[108  69  30]
 [  0  33   0]
 [  3   0   7]]
0.592


In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=42, n_estimators=1000)
clf.fit(X_train, y_train)

y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2]))
print(accuracy_score(y_test, y_test_pred))

[[77 73 57]
 [ 0 33  0]
 [ 0  0 10]]
0.48
