In [1]:
import pandas as pd

features = pd.read_csv('temps_extended.txt')

In [2]:
features.head()

Unnamed: 0,year,month,day,weekday,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
0,2011,1,1,Sat,4.92,0.0,0,36,37,45.6,40,40
1,2011,1,2,Sun,5.37,0.0,0,37,40,45.7,39,50
2,2011,1,3,Mon,6.26,0.0,0,40,39,45.8,42,42
3,2011,1,4,Tues,5.59,0.0,0,39,42,45.9,38,59
4,2011,1,5,Wed,3.8,0.03,0,42,38,46.0,45,39


## Data Preparation

In [3]:
##OneHotEncoding
features = pd.get_dummies(features)

##Extract features and labels
labels = features['actual']
features = features.drop('actual', axis = 1)

##List of features for later use
feature_list = list(features.columns)

##Convert to numpy arrays
import numpy as np

features = np.array(features)
labels = np.array(labels)

##Training and Testing Sets
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, 
                                                                            test_size = 0.25, random_state = 42)

In [4]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1643, 17)
Training Labels Shape: (1643,)
Testing Features Shape: (548, 17)
Testing Labels Shape: (548,)


In [5]:

print('{:0.1f} years of data in the training set'.format(train_features.shape[0] / 365.))
print('{:0.1f} years of data in the test set'.format(test_features.shape[0] / 365.))

4.5 years of data in the training set
1.5 years of data in the test set


## Restrict to Most Important feature
These were the six features required to reach a total feature importance of 95% in the first improving random forest notebook. We will use only these features in order to speed up the model.

In [6]:
##Names of five importances accounting for 95% of total importance
important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend', 'year']

# Find the columns of the most important features
important_indices = [feature_list.index(feature) for feature in important_feature_names]

# Create training and testing sets with only the important features
important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]

# Sanity check on operations
print('Important train features shape:', important_train_features.shape)
print('Important test features shape:', important_test_features.shape)

Important train features shape: (1643, 6)
Important test features shape: (548, 6)


In [7]:
# Use only the most important features
train_features = important_train_features[:]
test_features = important_test_features[:]

# Update feature list for visualizations
feature_list = important_feature_names[:]

## Examine the Default Random Forest to Determine Parameters

We will use these parameters as a starting point. The sklearn random forest documentation to determine which features to change and the available options.

In [9]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)

from pprint import pprint

##Look at parameters used by our current forest
print('Parameters currently in used:\n')
pprint(rf.get_params())

Parameters currently in used:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


## Cross Validation

In [10]:
from sklearn.model_selection import RandomizedSearchCV

##Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200,
                                           stop = 2000,
                                           num = 10)]

##Number of features to consider at every split
max_features = ['auto','sqrt']

##Minimum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,100,num=10)]
max_depth.append(None)

##Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

##Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
##Method of selecting samples for training each tree
bootstrap = [True, False]

##Create random grid
random_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

In [11]:
##Use the random grid to search for best hyperparameters
##First create the base model to tune
rf = RandomForestRegressor()

##Random search of parameters, using 3 fold cross validation, 
##search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1)

##Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.3min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

In [12]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

## Evaluation function

In [15]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('**** Model Performance ****')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

### Evaluate Default Model

In [16]:
base_model = RandomForestRegressor(n_estimators = 1000, random_state = 42)
base_model.fit(train_features, train_labels)
evaluate(base_model, test_features, test_labels)

**** Model Performance ****
Average Error: 3.8212 degrees.
Accuracy = 93.56%.


## Evaluate the Best Random Search Model

In [17]:
best_random = rf_random.best_estimator_
evaluate(best_random, test_features, test_labels)

**** Model Performance ****
Average Error: 3.7356 degrees.
Accuracy = 93.70%.


## Grid Search
