In [19]:
import os
import numpy as np
from glob import glob
import dlib
from scipy.misc import *
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
%matplotlib inline

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

---

In [3]:
X_train = np.load('X_train.npy')
X_val = np.load('X_val.npy')
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')

print X_train.shape, y_train.shape
print X_val.shape, y_val.shape

(3424, 512) (3424,)
(1946, 512) (1946,)


In [42]:
predictor = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                                       n_jobs=16)).fit(X_train, y_train)

In [43]:
print np.mean(y_val == predictor.predict(X_val))

0.693730729702


In [16]:
predictor = OneVsRestClassifier(GradientBoostingClassifier(n_estimators=100)).fit(X_train, y_train)

In [17]:
print np.mean(y_val == predictor.predict(X_val))

0.683967112025


In [20]:
predictor = Pipeline([
  ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=1000, class_weight='balanced'))),
  ('classification', OneVsRestClassifier(GradientBoostingClassifier(n_estimators=100)))
])
predictor.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
  ... random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
          n_jobs=1))])

In [21]:
print np.mean(y_val == predictor.predict(X_val))

0.681397738952


In [34]:
predictor = Pipeline([
  ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=1000, class_weight='balanced'))),
  ('classification', OneVsRestClassifier(RandomForestClassifier(n_estimators=100)))
])
predictor.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
  ... oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          n_jobs=1))])

In [35]:
print np.mean(y_val == predictor.predict(X_val))

0.680369989723
