In [1]:
reset -fs

In [2]:
from collections import Counter

import numpy as np
from sklearn.datasets import fetch_covtype # dataset
from sklearn.ensemble import RandomForestClassifier # baseline random forest
from sklearn.metrics import classification_report # per-class precision/recall/f1
from sklearn.model_selection import train_test_split # split dataset into training/test sets

In [3]:
# Load the Forest Covertype data through scikit-learn's fetch_covtype loader.
# Original source of the data:
# "http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
cover_type = fetch_covtype() 

In [4]:
# Show the description text that ships with the dataset object.
cover_type.DESCR

'Forest covertype dataset.\n\nA classic dataset for classification benchmarks, featuring categorical and\nreal-valued features.\n\nThe dataset page is available from UCI Machine Learning Repository\n\n    http://archive.ics.uci.edu/ml/datasets/Covertype\n\nCourtesy of Jock A. Blackard and Colorado State University.\n'

In [5]:
# Use the cover types as our target vector, y.
y = cover_type.target

# Derive the distinct labels (and their count) from the data instead of
# hard-coding "7". np.unique(...).tolist() yields plain Python ints, so the
# displayed set reads {1, 2, ...} regardless of how the installed NumPy
# version formats its scalar types.
distinct_cover_types = set(np.unique(y).tolist())

"{} distinct cover_types: {}".format(len(distinct_cover_types), distinct_cover_types)

'7 distinct cover_types: {1, 2, 3, 4, 5, 6, 7}'

In [6]:
# Feature matrix taken from the loaded dataset object.
X = cover_type.data

# Display both shapes to confirm X and y describe the same number of samples.
X.shape, y.shape

((581012, 54), (581012,))

### The Forest Covertype dataset contains 581012 samples with 54 features. It is explored in depth in:
https://github.com/adamszabunio/Forest_Cover_Type/tree/master/EDA/Further_EDA_and_Logistic_Regression.ipynb
### Since our dataset is rather large, and we will be doing cross-validation on our training set,
- We set the test_size parameter to 0.1, setting aside 10% of the data to test the trained model's accuracy.
- Due to large class imbalances in our target vector, we set the stratify parameter to y.
    - This splits the data so that the proportion of each class is similar in the train and test sets.

In [7]:
# Hold out 10% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,       # keep class proportions similar in both splits
    random_state=42,  # for reproducibility
)

In [8]:
# Confirm the sizes of the training and test splits.
print(X_train.shape, X_test.shape)

(522910, 54) (58102, 54)


### Fitting the training data with scikit-learn's RandomForestClassifier
### We choose this algorithm over others because of our feature space:
- 44 of our features are binary, 
    - whether or not the tree is in one of 4 wilderness areas
    - whether or not the tree is found in one of 40 soil types
- The remaining 10 features:
    - Elevation: Elevation in meters
    - Aspect: Aspect in degrees azimuth
    - Slope: Slope in degrees
    - Horizontal Distance To Hydrology: Horz Dist to nearest surface water features 
    - Vertical Distance To Hydrology: Vert Dist to nearest surface water features
    - Horizontal Distance To Roadways: Horz Dist to nearest roadway
    - Hillshade 9am (0 to 255 index): Hillshade index at 9am, summer solstice
    - Hillshade Noon (0 to 255 index): Hillshade index at noon, summer solstice
    - Hillshade 3pm (0 to 255 index): Hillshade index at 3pm, summer solstice
    - Horizontal Distance To Fire Points: Horz Dist to nearest wildfire ignition points
- In https://github.com/adamszabunio/Forest_Cover_Type/tree/master/EDA/Further_EDA_and_Logistic_Regression.ipynb, the goal was to correctly classify only one tree type (Krummholz)
    - From logistic regression it was found that nearly all of the features (unscaled) were statistically significant. 

In [9]:
# Baseline: scikit-learn's random forest fitted on the training split.
# n_estimators is pinned to 10 explicitly: the "only 10 trees" result and the
# 10-tree hand-coded forest it is compared against depend on that size, while
# scikit-learn's default has changed between releases (10 in older versions,
# 100 from 0.22 onward).
# NOTE(review): the forest itself is unseeded (random_state=None), so the score
# will vary slightly from run to run.
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
# Accuracy on the held-out test set: using only 10 trees, we are able to
# predict with a very high accuracy.
clf.score(X_test, y_test)

0.9376147420598494

In [11]:
# Predicted cover type for every test sample; reused by the classification report.
y_pred = clf.predict(X_test)
y_pred

array([2, 2, 1, ..., 2, 2, 7], dtype=int32)

In [12]:
# Per-class precision, recall and F1 on the held-out test set; every optional
# report setting is left at its default.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.93      0.94      0.94     63552
          2       0.94      0.95      0.95     84991
          3       0.91      0.95      0.93     10726
          4       0.90      0.81      0.85       824
          5       0.93      0.69      0.80      2848
          6       0.91      0.84      0.87      5210
          7       0.97      0.92      0.95      6153

avg / total       0.94      0.94      0.94    174304



### Hand-coded Random Forest Algorithm

In [13]:
# Execute random_forests.py so the names it defines become available in this notebook.
# NOTE(review): presumably this is where the RandomForest class used in the
# following cells comes from — confirm against the script.
%run random_forests.py

In [18]:
# Hand-coded forest with 10 trees, the same size as the scikit-learn baseline.
# NOTE(review): num_features=7 is roughly sqrt(54); presumably it is the number
# of features considered per split — confirm in random_forests.py.
rf = RandomForest(num_trees=10, num_features=7) # impurity_criterion='entropy' (default)
rf.fit(X_train,y_train)

In [19]:
# Score the hand-coded forest fitted in the previous cell on the same held-out
# test set used for the scikit-learn baseline.
rf.score(X_test,y_test)

0.52457774921975397

In [22]:
# Same forest size and feature-subset size as the earlier hand-coded run, but
# with the impurity criterion set to 'gini'.
rf = RandomForest(
    num_trees=10,
    num_features=7,
    impurity_criterion='gini',
)
rf.fit(X_train, y_train)

In [23]:
# Score the Gini-criterion hand-coded forest on the same held-out test set.
rf.score(X_test,y_test)

0.50214567651918485

___