Loading Data, the aim is to predict the maximum temperature tomorrow (in Seattle, WA) from past historical data. 



In [None]:
import pandas as pd

# Read in data as a dataframe
features = pd.read_csv('weather.csv')

Preparing Data

In [None]:
# One Hot Encoding
features = pd.get_dummies(features)

# taking features and labels, the actual column is taken as the target variable the actual column represents the temperature of each of the days 
labels = features['actual']
features = features.drop('actual', axis = 1)

Taking 6 features 

In [None]:
# Names of six features 
feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend', 'year']

# Update feature list for visualizations
feature_list = feature_names[:]

features = features[feature_names]
features.head(5)

Unnamed: 0,temp_1,average,ws_1,temp_2,friend,year
0,37,45.6,4.92,36,40,2011
1,40,45.7,5.37,37,50,2011
2,39,45.8,6.26,40,42,2011
3,42,45.9,5.59,39,59,2011
4,38,46.0,3.8,42,39,2011


In [None]:
#showing the respective labels
labels[1:6]

1    39
2    42
3    38
4    45
5    49
Name: actual, dtype: int64

Splitting the data

In [None]:
# Convert to numpy arrays
import numpy as np

features = np.array(features)
labels = np.array(labels)

# Training and Testing Sets
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, 
                                                                            test_size = 0.25, random_state = 42)

Determining the parameters with the default Random Forest. Check this link further for the randomforest documentation. [sklearn RandomForest documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)

from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


Using RandomizedSearchCV library to conduct Random Search with Cross Validation

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(train_features, train_labels);

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.6min finished


In [None]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 70,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 400}

In [None]:
rf_random.cv_results_

{'mean_fit_time': array([ 1.36873047,  6.45824512,  3.85716931, 11.65822633,  4.72075899,
         2.48739632,  3.52991271,  2.20145067,  8.24225203, 10.10809557,
         1.81894159,  2.95375927,  6.1238416 ,  5.0874126 ,  2.51561181,
        10.38118116,  4.03448963,  3.77692191, 10.81287718,  3.28676701,
         5.85014701,  5.85147325,  4.21050723,  9.609869  ,  5.15317488,
         1.71816794, 11.47398225,  3.19371565, 11.21151535,  3.94725839,
         3.6139369 ,  2.86166533,  2.93094873,  2.34638127,  5.39286526,
         2.50564098,  3.90278459,  4.91975458,  4.14413261,  3.30023352,
         0.90969181,  1.82748723,  2.43446875,  1.16780972,  1.34982983,
         3.7832768 , 10.75622233,  1.18667944,  3.65622711,  2.13410807,
        14.75531602, 11.9776276 ,  5.61034274,  5.77496791,  4.85282779,
         1.83181469,  8.11795362,  9.56673837,  8.58570798,  4.22279032,
         0.75601617,  1.18793011,  5.5937113 ,  7.22092827,  5.74116476,
        10.44620426, 10.1018761 , 

Evaluating the result

In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [None]:
#Evaluating the default model
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(train_features, train_labels)
base_accuracy = evaluate(base_model, test_features, test_labels)

Model Performance
Average Error: 3.9170 degrees.
Accuracy = 93.36%.


In [None]:
#Evaluating the best random model
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

Model Performance
Average Error: 3.7159 degrees.
Accuracy = 93.73%.


Using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a base model
rf = RandomForestRegressor(random_state = 42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)

In [None]:
# Fit the grid search to the data
grid_search.fit(train_features, train_labels);

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed: 10.3min finished


In [None]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_features, test_labels)

Model Performance
Average Error: 3.6565 degrees.
Accuracy = 93.83%.
