# Improved Random Forest Model

## Three Approaches to Making a Better ML Model

1. Collecting more high-quality data.
2. Tuning the hyperparameters of the algorithm.
3. Trying a different algorithm.

## Starting from Previous Model

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Rebuilds the model from the previous notebook: load the original data, score a
# historical-average baseline, fit a random forest, and report its error metrics.

# Reading in data as pandas dataframe and one-hot encoding it with get_dummies
original_features = pd.read_csv('./temps.csv')
original_features = pd.get_dummies(original_features)

# Labels are the values we want to predict
original_labels = np.array(original_features['actual'])

# Removing the labels from the features so the model cannot see the target
original_features = original_features.drop(columns='actual')

# Saving feature names for later use (must happen before the numpy conversion,
# which discards the column names)
original_feature_list = list(original_features.columns)

# Converting to numpy array
original_features = np.array(original_features)

# Splitting the data into training (75%) and testing (25%) sets;
# random_state is fixed so the split is the same on every run
(original_train_features, original_test_features,
 original_train_labels, original_test_labels) = train_test_split(
    original_features, original_labels, test_size=0.25, random_state=42)

# The baseline predictions are the historical averages (the 'average' column)
baseline_preds = original_test_features[:, original_feature_list.index('average')]

# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - original_test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2), 'degrees.')

# 1000 trees; random_state is fixed so the fitted forest is reproducible
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Training the model on training data
rf.fit(original_train_features, original_train_labels)

# Using the forest's predict method on the test data
predictions = rf.predict(original_test_features)

# Calculating the absolute errors
errors = abs(predictions - original_test_labels)

# Printing out the mean absolute error (mae)
print('Average model error:', round(np.mean(errors), 2), 'degrees.')

# Comparing to baseline. The difference is deliberately signed (baseline minus
# model): a negative result means the model is worse than the baseline, which
# wrapping the difference in abs() would misreport as an improvement.
improvement_baseline = 100 * (np.mean(baseline_errors) - np.mean(errors)) / np.mean(baseline_errors)
print('Improvement over baseline:', round(improvement_baseline, 2), '%.')

# Per-sample absolute percentage error; its mean is the MAPE.
# NOTE(review): divides by the true labels, so this assumes no label is zero.
mape = 100 * (errors / original_test_labels)

# Calculate and display accuracy (100 minus the mean absolute percentage error)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Average baseline error:  5.06 degrees.
Average model error: 3.87 degrees.
Improvement over baseline: 23.45 %.
Accuracy: 93.93 %.


## Collecting More Data

In [3]:
# Load the extended temperature dataset and preview its first five rows
# (five is the default row count for head()).
features = pd.read_csv('./temps_extended.csv')
features.head()

Unnamed: 0,year,month,day,weekday,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
0,2011,1,1,Sat,4.92,0.0,0,36,37,45.6,40,40
1,2011,1,2,Sun,5.37,0.0,0,37,40,45.7,39,50
2,2011,1,3,Mon,6.26,0.0,0,40,39,45.8,42,42
3,2011,1,4,Tues,5.59,0.0,0,39,42,45.9,38,59
4,2011,1,5,Wed,3.8,0.03,0,42,38,46.0,45,39


In [4]:
"""
- Here format(*features.shape) is used to provide values for the placeholders in the string.
- features.shape returns a tuple containing two values (here days and variables).
- The * operator before features.shape unpacks the tuple into individual arguments. This means if features.shape is (30, 5), it will be unpacked to 30, 5.
- The values obtained from features.shape (e.g., 30 and 5) are inserted into the placeholders {} in the string.
- The first {} will be replaced by the first value (30), and the second {} will be replaced by the second value (5).
"""
# Report the dataset dimensions: the two shape values fill the two placeholders in order.
shape_summary_template = 'We have {} days of data with {} number of variables'
print(shape_summary_template.format(*features.shape))

We have 2191 days of data with 12 number of variables


## Numerical Inspection of Data

In [5]:
# Summary statistics of the numeric columns, rounded to two decimals for readability
features.describe().round(2)

Unnamed: 0,year,month,day,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
count,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0
mean,2013.5,6.52,15.71,7.37,0.12,0.01,61.17,61.18,60.29,61.18,60.31
std,1.71,3.45,8.8,3.15,0.25,0.15,13.09,13.08,10.73,13.08,15.87
min,2011.0,1.0,1.0,0.89,0.0,0.0,29.0,29.0,45.1,29.0,25.0
25%,2012.0,4.0,8.0,5.14,0.0,0.0,51.0,51.0,50.1,51.0,49.0
50%,2014.0,7.0,16.0,6.71,0.0,0.0,60.0,60.0,58.8,60.0,60.0
75%,2015.0,10.0,23.0,9.17,0.12,0.0,71.0,71.0,70.2,71.0,71.0
max,2017.0,12.0,31.0,21.25,2.2,3.0,96.0,96.0,77.4,96.0,97.0


## Inspection of Data by Visualisation