https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

In [1]:
# Pandas is used for data manipulation
import pandas as pd# Read in data and display first 5 rows
features = pd.read_csv('temps1.csv')
features.head(5)

Unnamed: 0,STATION,NAME,DATE,PRCP,TMAX,TMIN
0,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-01,0.2,-1.3,-4.1
1,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-02,0.7,3.1,-1.3
2,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-03,0.2,3.8,-0.2
3,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-04,2.3,5.5,1.3
4,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-05,0.0,2.4,-4.7


Data from:
https://www.ncdc.noaa.gov/cdo-web/

In [2]:
features.describe()

Unnamed: 0,PRCP,TMAX,TMIN
count,1461.0,1461.0,1461.0
mean,1.599179,16.834908,8.307255
std,3.916376,8.936208,6.304169
min,0.0,-2.7,-9.3
25%,0.0,9.5,3.5
50%,0.0,16.6,8.1
75%,1.3,24.2,13.3
max,59.7,40.2,23.9


In [3]:
features.dtypes

STATION     object
NAME        object
DATE        object
PRCP       float64
TMAX       float64
TMIN       float64
dtype: object

In [4]:
features['DATE'] = pd.to_datetime(features['DATE'])

In [5]:
features['day'] = features['DATE'].dt.day
features['month'] = features['DATE'].dt.month
features['year'] = features['DATE'].dt.year

In [6]:
import numpy as np
# make features cyclical
features['day_sin'] = np.sin(features['day']*(2.*np.pi/31))
features['day_cos'] = np.cos(features['day']*(2.*np.pi/31))
features['month_sin'] = np.sin(features['month']*(2.*np.pi/12))
features['month_cos'] = np.cos(features['month']*(2.*np.pi/12))
features

Unnamed: 0,STATION,NAME,DATE,PRCP,TMAX,TMIN,day,month,year,day_sin,day_cos,month_sin,month_cos
0,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-01,0.2,-1.3,-4.1,1,1,2017,2.012985e-01,0.979530,5.000000e-01,0.866025
1,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-02,0.7,3.1,-1.3,2,1,2017,3.943559e-01,0.918958,5.000000e-01,0.866025
2,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-03,0.2,3.8,-0.2,3,1,2017,5.712682e-01,0.820763,5.000000e-01,0.866025
3,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-04,2.3,5.5,1.3,4,1,2017,7.247928e-01,0.688967,5.000000e-01,0.866025
4,GME00122362,"FRANKFURT MAIN WESTEND, GM",2017-01-05,0.0,2.4,-4.7,5,1,2017,8.486443e-01,0.528964,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,GME00122362,"FRANKFURT MAIN WESTEND, GM",2020-12-27,6.1,5.3,0.4,27,12,2020,-7.247928e-01,0.688967,-2.449294e-16,1.000000
1457,GME00122362,"FRANKFURT MAIN WESTEND, GM",2020-12-28,4.5,6.2,2.7,28,12,2020,-5.712682e-01,0.820763,-2.449294e-16,1.000000
1458,GME00122362,"FRANKFURT MAIN WESTEND, GM",2020-12-29,0.5,5.3,1.9,29,12,2020,-3.943559e-01,0.918958,-2.449294e-16,1.000000
1459,GME00122362,"FRANKFURT MAIN WESTEND, GM",2020-12-30,0.0,5.5,2.2,30,12,2020,-2.012985e-01,0.979530,-2.449294e-16,1.000000


cyclical method from: http://blog.davidkaleko.com/feature-engineering-cyclical-features.html

In [7]:
labels = np.array(features['TMAX'])

In [8]:
features = features.drop(['STATION', 'NAME', 'TMAX', 'TMIN', 'DATE', 'day', 'month'], axis=1)
features

Unnamed: 0,PRCP,year,day_sin,day_cos,month_sin,month_cos
0,0.2,2017,2.012985e-01,0.979530,5.000000e-01,0.866025
1,0.7,2017,3.943559e-01,0.918958,5.000000e-01,0.866025
2,0.2,2017,5.712682e-01,0.820763,5.000000e-01,0.866025
3,2.3,2017,7.247928e-01,0.688967,5.000000e-01,0.866025
4,0.0,2017,8.486443e-01,0.528964,5.000000e-01,0.866025
...,...,...,...,...,...,...
1456,6.1,2020,-7.247928e-01,0.688967,-2.449294e-16,1.000000
1457,4.5,2020,-5.712682e-01,0.820763,-2.449294e-16,1.000000
1458,0.5,2020,-3.943559e-01,0.918958,-2.449294e-16,1.000000
1459,0.0,2020,-2.012985e-01,0.979530,-2.449294e-16,1.000000


In [9]:
feature_list = list(features.columns)

In [10]:
features = np.array(features)

In [11]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [12]:
# test the error on predicting with average
mean_error = round(np.mean(abs(test_labels - np.mean(test_labels))), 3)
print("The mean error if we just predicted the average temperature everyday would be:", mean_error, "°C")

The mean error if we just predicted the average temperature everyday would be: 7.696 °C


In [13]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_features, train_labels)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [14]:
predictions = rf.predict(test_features)
errors = abs(predictions - test_labels)
print('Mean Absolute Error:', round(np.mean(errors), 2), '°C.')

Mean Absolute Error: 2.7 °C.


In [15]:
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 69.55 %.


In [16]:
rf_small = RandomForestRegressor(n_estimators=1500, max_depth=5)
rf_small.fit(train_features, train_labels)

predictions_small = rf_small.predict(test_features)
errors_small = abs(predictions_small - test_labels)
print('Mean Absolute Error:', round(np.mean(errors_small), 2), '°C.')

Mean Absolute Error: 3.06 °C.


In [17]:
mape_small = 100 * (errors_small / test_labels)
# Calculate and display accuracy
accuracy_small = 100 - np.mean(mape_small)
print('Accuracy:', round(accuracy_small, 2), '%.')

Accuracy: 63.88 %.


In [18]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: month_cos            Importance: 0.53
Variable: month_sin            Importance: 0.24
Variable: day_sin              Importance: 0.07
Variable: year                 Importance: 0.06
Variable: PRCP                 Importance: 0.05
Variable: day_cos              Importance: 0.05
