In this notebook I will consider 5 classifiers to predict actions based on state information. Initially I will test their accuracy on the training data but later in the notebook I make my final evaluation on their performance on the unseen test data. This evaluation is completed in one cell for readability but could have also been done when training data was being evaluated for each classifier

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn import metrics
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Merge, Dropout, Flatten
from keras.optimizers import RMSprop, adam

from sklearn.externals import joblib


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def frange(start, stop, step):
     i = start
     while i < stop:
         yield i
         i += step

In [3]:
data_sampling_rate = 0.5

In [4]:
dataset = pd.read_csv('LunarLAnderStateVectors.csv')
dataset = dataset.sample(frac=data_sampling_rate) #take a sample from the dataset so everyhting runs smoothly
num_classes = 4
classes = {0: "None", 1:"Up", 2: "Left", 3:"Right"}
display(dataset.head())

Unnamed: 0,step,pos_x,pos_y,vel_x,vel_y,ship_lander_angle,ship_lander_angular_vel,leg_1_ground_contact,leg_2_ground_contact,action
86816,76,0.173908,0.416632,-0.172839,-0.435367,0.226505,-0.142227,0.0,0.0,2
140657,122,0.099441,0.216432,-0.351624,-0.242789,-0.0701,-0.150692,0.0,0.0,0
9676,438,-0.76285,0.090831,-0.445371,0.205027,-0.254618,-0.097402,0.0,0.0,2
182420,1312,0.231822,0.054275,0.028891,-0.006563,0.088737,0.062424,0.0,0.0,0
191860,128,0.034734,0.141237,-0.311238,-0.246594,-0.198355,-0.057676,0.0,0.0,2


In [5]:
X = dataset[dataset.columns[:-1]]
Y = np.array(dataset["action"])
del X["step"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

Fitting a decision tree classifier

In [6]:
# Set up the parameter grid to seaerch
param_grid ={'criterion': ['gini', "entropy"], \
             'max_depth': list(range(3, 20, 3)), \
             'min_samples_split': [5,10,20,50] }

# Perform the search
my_tuned_tree = GridSearchCV(tree.DecisionTreeClassifier(), \
                                param_grid, cv=5, verbose = 0, \
                            return_train_score=True)
my_tuned_tree.fit(X_train, Y_train)

# Print details
print("Best parameters set found on development set:")
display(my_tuned_tree.best_params_)
display(my_tuned_tree.best_score_)
display(my_tuned_tree.cv_results_)

#about 80% accuracy

Best parameters set found on development set:


{'criterion': 'gini', 'max_depth': 18, 'min_samples_split': 10}

0.84913834029624

{'mean_fit_time': array([0.14686346, 0.12785399, 0.13654196, 0.13573647, 0.2686584 ,
        0.23207748, 0.22453439, 0.24309957, 0.32598794, 0.31247342,
        0.53615999, 0.44737494, 0.53211999, 0.41102552, 0.38347745,
        0.39058101, 0.46206248, 0.47428596, 0.45007443, 0.455585  ,
        0.59203947, 0.57077396, 0.56840801, 0.52242196, 0.20179904,
        0.20936954, 0.19710648, 0.19878697, 0.35131991, 0.35056162,
        0.33425748, 0.3793025 , 1.21491146, 0.54101896, 0.54709947,
        0.57474744, 0.88090992, 0.58439338, 0.5509181 , 0.61987054,
        0.6657865 , 0.700773  , 0.66014206, 0.66323197, 1.17833507,
        0.76869702, 0.87402999, 0.71141303]),
 'mean_score_time': array([0.00636303, 0.00523901, 0.00546944, 0.006091  , 0.00575304,
        0.00617301, 0.00573206, 0.00675499, 0.00664055, 0.00842059,
        0.01395345, 0.00744748, 0.01387298, 0.00870705, 0.0088551 ,
        0.00840449, 0.01027596, 0.00979197, 0.00997996, 0.00882006,
        0.012622  , 0.0214864 , 0.

In [7]:
# Best decision tree parameters
decision_tree = tree.DecisionTreeClassifier(criterion="gini", max_depth= 18 , min_samples_split= 10)
decision_tree.fit(X_train, Y_train)
pred = decision_tree.predict(X_test)
metrics.accuracy_score(pred, Y_test)

0.8606746987951808

Fitting a random forest classifier 

In [8]:
# Set up the parameter grid to seaerch
param_grid = [
 {'n_estimators': list(range(200, 501, 100)), 'max_features': list(range(1, 7, 3)), 'min_samples_split': [5,10] }
]

# Perform the search
my_tuned_model = GridSearchCV(ensemble.RandomForestClassifier(), param_grid, cv=2)
my_tuned_model.fit(X_train, Y_train)

# Print details
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)

Best parameters set found on development set:
{'max_features': 4, 'n_estimators': 300, 'min_samples_split': 5}
0.8910344663881504


In [9]:
# Best random forest parameters
random_forest = ensemble.RandomForestClassifier(min_samples_split=5, max_features=4, n_estimators=300)
random_forest.fit(X_train, Y_train)
pred = random_forest.predict(X_test)
metrics.accuracy_score(pred, Y_test)

0.9045542168674698

Bagging classifier 

In [10]:
#bagging
# Set up the parameter grid to seaerch
param_grid = [
 {'n_estimators': list(range(5, 25, 1)), "max_samples":list(frange(0.5,1,0.1)), "max_features": list(frange(0.5,1,0.1))}
]

# Perform the search
my_tuned_model = GridSearchCV(ensemble.BaggingClassifier(), param_grid, cv=5)
my_tuned_model.fit(X_train, Y_train)

# Print details
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)

Best parameters set found on development set:
{'max_features': 0.8999999999999999, 'max_samples': 0.8999999999999999, 'n_estimators': 21}
0.893966008355488


In [11]:
# Best bagging parameters
bagging = ensemble.BaggingClassifier(n_estimators=21, max_features = 0.9, max_samples=0.9)
bagging.fit(X_train, Y_train)
pred = bagging.predict(X_test)
metrics.accuracy_score(pred, Y_test)

0.903421686746988

Boosting Classifier 

In [12]:
# Using decision tree classifier with boosting
boosting = ensemble.AdaBoostClassifier(base_estimator = tree.DecisionTreeClassifier(criterion="gini", max_depth= 9, min_samples_leaf = 50), \
                                       n_estimators=500)

boosting.fit(X_train, Y_train)
pred = boosting.predict(X_train)
metrics.accuracy_score(pred, Y_train)

0.940289118875807

Neural network classifier 

In [13]:
model_mlp = Sequential()
model_mlp.add(Dense(512, input_shape=(8,)))
model_mlp.add(Activation('relu'))
model_mlp.add(Dropout(0.2))
model_mlp.add(Dense(512))
model_mlp.add(Activation('relu'))
model_mlp.add(Dropout(0.2))
model_mlp.add(Dense(4))
model_mlp.add(Activation('softmax'))
model_mlp.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               4608      
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 2052      
__________

In [14]:
model_mlp.compile(loss='categorical_crossentropy', optimizer=adam(lr=1e-3), metrics=['accuracy'])


In [15]:
# Splitting training data into training and validation data for mlp
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=0, test_size = 0.3, train_size = 0.7)

In [16]:
# Convert the singl column label into a dummy coded label
y_train = to_categorical(np.asarray(y_train))
y_valid_wide = to_categorical(np.asarray(y_valid))
model_mlp.fit(np.asfarray(x_train), np.asfarray(y_train), \
          epochs=20, batch_size=32, verbose=1, \
          validation_data=(np.asfarray(x_valid), np.asfarray(y_valid_wide)))

Train on 58979 samples, validate on 25277 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x124e99a58>

In [17]:
# Make a set of predictions for the training data
y_pred = model_mlp.predict_classes(np.asfarray(X_train), batch_size=32)

# Print performance details
print(metrics.accuracy_score(Y_train, y_pred))

0.9577715533611849


Now to test which classifier performs best and retrain the chosen classifier on the full dataset

In [18]:
A = {"Decision Tree": decision_tree, "Random Forest": random_forest, "Bagging": bagging, "Boosting": boosting, "Neural Net": model_mlp}

In [19]:
# Evaluating the performance of all of the classifiers 
accuracy_table = pd.DataFrame(index = ["Accuracy"], columns=A.keys())
maximum = 0
for keys, items in A.items():
    if keys == "Neural Net":
        pred = items.predict_classes(np.asfarray(X_test))
        X = metrics.accuracy_score(pred, Y_test)
    else:
        pred = items.predict(X_test)
        X = metrics.accuracy_score(pred, Y_test)
    print("Confusion Matrix for", keys)
    print(metrics.confusion_matrix(Y_test, pred))
    print("Accuracy for", keys)
    print(X, "\n")
    if X>maximum:
        maximum = X
        classifier = items
        name = keys
    accuracy_table.loc["Accuracy",keys] = X

Confusion Matrix for Decision Tree
[[17050   168  2196   186]
 [  339   734   425     4]
 [ 1240   241 17234   205]
 [  360     0   418   700]]
Accuracy for Decision Tree
0.8606746987951808 

Confusion Matrix for Bagging
[[17909    48  1615    28]
 [  400   684   418     0]
 [  565    55 18260    40]
 [  399     0   440   639]]
Accuracy for Bagging
0.903421686746988 

Confusion Matrix for Boosting
[[17779    52  1690    79]
 [  333   850   319     0]
 [ 2343   103 16368   106]
 [  300     0   291   887]]
Accuracy for Boosting
0.8646746987951808 

Confusion Matrix for Neural Net
[[19238     3   297    62]
 [  412   826   264     0]
 [  519     2 18375    24]
 [  159     0   109  1210]]
Accuracy for Neural Net
0.9553975903614458 

Confusion Matrix for Random Forest
[[17849    21  1711    19]
 [  377   648   477     0]
 [  467    28 18407    18]
 [  379     0   464   635]]
Accuracy for Random Forest
0.9045542168674698 



In [20]:
accuracy_table

Unnamed: 0,Decision Tree,Bagging,Boosting,Neural Net,Random Forest
Accuracy,0.860675,0.903422,0.864675,0.955398,0.904554


In [21]:
# Saving the best model
print("Best classifier is:", name, "with accuracy of:", maximum)
if name == "Neural Net":
    filepath = "task1.mod"
    classifier.save(filepath)
else:
    joblib.dump(classifier, 'task1.pkl') 

Best classifier is: Neural Net with accuracy of: 0.9553975903614458
