In [17]:
reset -fs

In [57]:
from sklearn.datasets import fetch_covtype # dataset
from sklearn.model_selection import train_test_split # split dataset into training/test sets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
import numpy as np

In [32]:
# download the dataset from:
# "http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
cover_type = fetch_covtype() 

In [33]:
cover_type.DESCR

'Forest covertype dataset.\n\nA classic dataset for classification benchmarks, featuring categorical and\nreal-valued features.\n\nThe dataset page is available from UCI Machine Learning Repository\n\n    http://archive.ics.uci.edu/ml/datasets/Covertype\n\nCourtesy of Jock A. Blackard and Colorado State University.\n'

In [34]:
cover_type.target

array([5, 5, 2, ..., 3, 3, 3], dtype=int32)

In [35]:
cover_type.target.shape

(581012,)

In [36]:
# from the Forest_Cover_Type.ipynb data exploration we discovered there are 7 distinct cover_types
# set these covertypes as our target, y 
y = cover_type.target

In [37]:
cover_type.data.shape

(581012, 54)

In [38]:
# Our data contains 54 features. Explored in depth within the Forest_Cover_Type.ipynb
# set this 581012 x 54 matrix as our feature matrix, X
X = cover_type.data

In [46]:
# since our dataset is rather large, and we will be doing cross validation on our training set,
# we set the train_size parameter to be 90% and set aside %10 to test on
# due to large class imbalances in our target matrix, we
# set the stratify parameter=y. this makes a split so that the proportion of classes in the 
# test and train sets will be similar
X_train, X_test, y_train, y_test=\
                            train_test_split(X, y, train_size=.90, random_state=42, stratify=y)

In [49]:
print(X_train.shape, X_test.shape)

(522910, 54) (58102, 54)


In [54]:
pipe_rfclf = Pipeline([('scl', StandardScaler()),
            ('clf', RandomForestClassifier(random_state=1))])

pipe_rfclf.fit(X_train, y_train)

Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False))])

In [55]:
pipe_rfclf.get_params().keys()

dict_keys(['steps', 'scl', 'clf', 'scl__copy', 'scl__with_mean', 'scl__with_std', 'clf__bootstrap', 'clf__class_weight', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])

In [58]:
features = X_train.shape[1]
param_grid = dict(clf__n_estimators=[50, 100, 250, 500],
                  clf__max_depth=np.arange(2,8),
                    clf__max_features=np.arange(1, features+1))

grid = GridSearchCV(estimator=pipe_rfclf,
                     param_grid=param_grid,
                     cv=5,
                     verbose=1)

grid = grid.fit(X_train, y_train)
               
print(grid.best_score_)
print(grid.best_params_)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


KeyboardInterrupt: 