# Improved Random Forest Model

## Three Approaches to Making a Better ML Model

1. More high-quality data.
2. Hyperparameter tuning of the algorithm.
3. Trying a different algorithm.

## Starting from Previous Model

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Read the raw data into a pandas dataframe and one-hot encode it.
# Each stage gets its own name so the cell is idempotent and the
# intermediate frames stay available for inspection.
temps_raw = pd.read_csv('./temps.csv')
temps_encoded = pd.get_dummies(temps_raw)

# Labels are the values we want to predict
original_labels = np.array(temps_encoded['actual'])

# Remove the labels from the features
feature_frame = temps_encoded.drop('actual', axis=1)  # axis 1 refers to the columns

# Save feature names for later use; the order matches the columns of the
# array built below.
original_feature_list = list(feature_frame.columns)

# Convert to a numpy array with an explicit float dtype. Recent pandas
# versions emit boolean dummy columns, and a frame mixing bool and numeric
# columns would otherwise be converted to an object-dtype array.
# NOTE(review): assumes every column left after get_dummies is numeric or
# boolean -- confirm against the schema of temps.csv.
original_features = np.array(feature_frame, dtype=float)

# Split the data into training and testing sets (25% held out, fixed seed
# so the split is reproducible).
(
    original_train_features,
    original_test_features,
    original_train_labels,
    original_test_labels,
) = train_test_split(
    original_features,
    original_labels,
    test_size=0.25,
    random_state=42,
)

# Baseline: predict each test row's value with its 'average' feature column.
average_column_index = original_feature_list.index('average')
baseline_preds = original_test_features[:, average_column_index]

# Absolute error of the baseline per test row, then report its mean.
baseline_errors = np.abs(baseline_preds - original_test_labels)
print('Average baseline error: ', round(baseline_errors.mean(), 2), 'degrees.')

# Build a 1000-tree random forest with a fixed seed and train it on the
# training split (fit returns the fitted estimator itself).
rf = RandomForestRegressor(n_estimators=1000, random_state=42).fit(
    original_train_features,
    original_train_labels,
)

# Predict on the held-out test split.
predictions = rf.predict(original_test_features)

# Absolute error of each prediction against its true label.
errors = np.abs(predictions - original_test_labels)

# Report the mean absolute error (MAE).
print('Average model error:', round(errors.mean(), 2), 'degrees.')

# Comparing to baseline: percentage reduction in mean absolute error relative
# to the baseline's mean absolute error. The difference is kept signed
# (baseline minus model) so that a model performing worse than the baseline
# shows up as a negative improvement instead of being masked by abs().
improvement_baseline = 100 * (np.mean(baseline_errors) - np.mean(errors)) / np.mean(baseline_errors)
print('Improvement over baseline:', round(improvement_baseline, 2), '%.')

# Absolute percentage error of each prediction: the absolute error expressed
# as a percentage of the true label. `mape` holds the per-sample values; the
# mean (the actual MAPE) is taken on the next line.
mape = (errors / original_test_labels) * 100

# Accuracy is defined here as 100 minus the mean absolute percentage error.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Average baseline error:  5.06 degrees.
Average model error: 3.87 degrees.
Improvement over baseline: 23.45 %.
Accuracy: 93.93 %.
