# Extra Trees (Extremely Randomized Trees) Ensemble

- RF is built upon decision trees
- Decision tree node splitting is based on gini or entropy 
- Extra-Trees make use of random thresholds for each feature, unlike decision trees

In [4]:
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [5]:
xt_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
df = sns.load_dataset('titanic')
df.dropna(inplace=True)
X = df[['pclass', 'sex', 'age']].copy()
le = preprocessing.LabelEncoder()
X['sex'] = le.fit_transform(df['sex'])
y = df['survived'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [7]:
xt_clf.fit(X_train, y_train);

In [8]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

def printScore(clf, X_train, X_test, y_train, y_test, train=True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        res = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_train, res))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    else:
        res_test = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_test, res_test))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [9]:
printScore(xt_clf, X_train, X_test, y_train, y_test)
printScore(xt_clf, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.94

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.92      0.91        53
           1       0.96      0.95      0.95       110

    accuracy                           0.94       163
   macro avg       0.93      0.93      0.93       163
weighted avg       0.94      0.94      0.94       163
 

Confusion Matrix: 
 [[ 49   4]
 [  6 104]] 

ROC AUC: 0.9350

Test Results:

Accuracy: 0.84

Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.92      0.85      0.88        13

    accuracy                           0.84        19
   macro avg       0.82      0.84      0.82        19
weighted avg       0.85      0.84      0.85        19
 

Confusion Matrix: 
 [[ 5  1]
 [ 2 11]] 

ROC AUC: 0.8397

