 # Galaxy Type Classification with Random Forests Supervised Learning

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from support_functions import plot_confusion_matrix, generate_features_targets
from pprint import pprint


 ## Load Dataset to train the system


In [None]:
# Load the galaxy catalogue from the NumPy .npy file in the working directory.
data = np.load('galaxy_catalogue.npy')

### Print dataset

In [None]:
# Pretty-print the loaded catalogue for a quick visual inspection.
pprint (data)

 ## Create Train and Test Datasets


 ### Define features (X) and targets (y)
 


In [None]:
# Split the catalogue into model inputs (features) and labels (targets) using
# the project helper from support_functions.
# NOTE(review): presumably `features` has 13 columns per galaxy, matching the
# 13-value sample passed to rfc.predict further down -- confirm in the helper.
features, targets = generate_features_targets(data)

## Configure, train and test ML algorithm, Random Forests

### Create classifier RFC object
#### n_estimators -> the number of trees in the forest; max_depth -> the maximum depth of each tree in the forest

In [None]:
# Random forest made of 50 trees, each capped at a depth of 20.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
)

### Train the classifier (fit the estimator) using the training dataset
#### Use cross_val_predict to split the data into test and training sets using k-fold sets randomly (to avoid overfitting). In the basic approach, called k-fold cross-validation (cv=k), the training set is split into k smaller sets. The following procedure is followed for each of the k “folds”:

#### * A model is trained using k-1 of the folds as training data
#### * The resulting model is validated on the remaining part of the data 


In [None]:
# Out-of-fold predictions from 10-fold cross-validation: every sample is
# predicted by a model that did not see it during training.
predicted = cross_val_predict(estimator=rfc, X=features, y=targets, cv=10)
# Fit rfc itself on the complete dataset so it can classify new samples later.
rfc.fit(X=features, y=targets)

# Calculate the model score and Confusion Matrix 

In [None]:
# Fraction of galaxies whose cross-validated prediction matches the true label.
model_score = accuracy_score(targets, predicted)
# Call print as a function: the bare `print "..."` statement is a SyntaxError
# on Python 3, while this single-argument form prints the same text on both
# Python 2 and Python 3.
print("Our accuracy score: " + str(model_score))


In [None]:
# Calculate the model's confusion matrix using sklearn's confusion_matrix function.
# The unique labels are sorted because a plain set has no guaranteed iteration
# order; sorting keeps the matrix rows/columns in the same order on every run.
class_labels = sorted(set(targets))
print(class_labels)
# Passing `labels` explicitly ties row/column i of the matrix to class_labels[i].
model_cm = confusion_matrix(y_true=targets, y_pred=predicted, labels=class_labels)


In [None]:
# Plot the confusion matrix using the provided functions.
# A fresh figure is opened first so the helper draws onto its own canvas.
plt.figure()
# NOTE(review): normalize=False presumably shows raw counts rather than
# per-class proportions -- confirm in support_functions.plot_confusion_matrix.
plot_confusion_matrix(model_cm, classes=class_labels, normalize=False)
plt.show()


# Classify new unseen data

In [None]:
# Classify a single unseen galaxy. predict() takes a 2-D array-like with one
# row per sample, hence the nested list around the 13 feature values.
galaxy_type_prediction = rfc.predict([[
    1.86765, 0.68158, 0.4131, 0.3161, 0.5954284,
    2.261946, 2.349849, 2.380652, 2.359738, 2.395528,
    0.5981129, 0.4621705, 0.33337254,
]])
# Call print as a function: the bare `print "..."` statement is a SyntaxError
# on Python 3, while this single-argument form works on Python 2 and 3 alike.
print("Prediction for [ 1.86765 ,    0.68158   ,  0.4131  ...]: ==> " + str(galaxy_type_prediction))


# Tune RFC algorithm

### How sensitive is RFC classification accuracy to the choice of the 'n_estimators', 'max_depth' and 'cv' parameters?

In [None]:
# Imported for its side effect only: older matplotlib releases need this import
# to register the '3d' projection used by add_subplot below.
from mpl_toolkits.mplot3d import Axes3D

# Grid of hyper-parameters to sweep: 16 depths x 16 forest sizes x 6 fold counts.
md_range = range(1, 80, 5)
ne_range = range(1, 80, 5)
cv_range = range(3, 20, 3)

# One 3-D scatter plot per cv value: accuracy as a function of
# (max_depth, n_estimators).
for cv_value in cv_range:

    xs = []  # max_depth of each evaluated configuration
    ys = []  # n_estimators of each evaluated configuration
    zs = []  # cross-validated accuracy of each configuration

    for md in md_range:
        for ne in ne_range:
            clf = RandomForestClassifier(n_estimators=ne, max_depth=md)
            # Only the out-of-fold predictions are needed to score this
            # configuration, so clf is not additionally fitted on the full
            # dataset (that model was never used). Sweep-local names are used
            # so the `predicted` / `model_score` computed for `rfc` in the
            # earlier cells are not overwritten.
            cv_predicted = cross_val_predict(clf, features, targets, cv=cv_value)
            cv_score = accuracy_score(targets, cv_predicted)
            xs.append(md)
            ys.append(ne)
            zs.append(cv_score)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Colour encodes the same accuracy value as the z axis to ease reading.
    ax.scatter(xs, ys, zs, c=zs, cmap='Blues', s=100)
    ax.set_xlabel('max_depth')
    ax.set_ylabel('n_estimators')
    ax.set_zlabel('accuracy')
    ax.set_title('cv=%s' % cv_value, fontsize=16)

    plt.show()