Random Forest model tutorial from [this article](https://towardsdatascience.com/random-forest-in-python-24d0893d51c0).

# Module Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import RocCurveDisplay
from sklearn.tree import export_graphviz
import pydot
import matplotlib.pyplot as plt
%matplotlib inline

# Data Import

In [None]:
# Path to the input CSV. Left empty as a placeholder: set it before running
# the next cell, which passes it to pd.read_csv. The first column of the file
# is later used as the label and all remaining columns as features.
file = ''

Read in our sample data and verify shape.

In [None]:
# Load the CSV at `file` into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

In [None]:
# Report (rows, columns) of the loaded frame
print(f'The shape of our features is: {df.shape}')

Print descriptive statistics for each numeric column.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# By default describe() only covers numeric columns of a mixed-dtype frame;
# pass include='all' to cover the others as well.
df.describe()

# Features and Labels

In [None]:
# Labels: the first column of the frame is the regression target
labels = np.array(df.iloc[:, 0])

# Features: every remaining column.
# drop() must be used in its returning form: with inplace=True it returns
# None (so `.columns` below would raise AttributeError) and it would also
# strip the label column out of `df`, making this cell non-idempotent.
feature_frame = df.drop(columns=df.columns[0])
feature_list = list(feature_frame.columns)  # column names, reused for plots/export
features = np.array(feature_frame)

# Train-test split: 25% held out, fixed seed for a reproducible partition
train_x, test_x, train_y, test_y = train_test_split(features, labels,
                                   test_size = 0.25, random_state = 42)

print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

# Model Training

In [None]:
# Fit the baseline forest on the training split.
# criterion: 'squared_error' is the current name for mean squared error; the
# old alias 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2.
# random_state: fixed so that Restart & Run All reproduces the same forest
# (and therefore the same metrics, exported tree and importances).
rf = RandomForestRegressor(n_estimators = 100, criterion = 'squared_error', max_depth = None,
                           min_samples_split = 2, min_samples_leaf = 1, random_state = 42)
rf.fit(train_x, train_y);

# Results and Evaluation

In [None]:
# Predict on the held-out features
y_hat = rf.predict(test_x)

# Absolute error per test sample, then its mean
errors = np.abs(y_hat - test_y)
mean_abs_error = errors.mean()
print('Mean Absolute Error:', round(mean_abs_error, 2), 'degrees.')

# Per-sample absolute percentage error; "accuracy" is 100 minus its mean
mape = (errors / test_y) * 100
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Take the first fitted estimator out of the forest
tree = rf.estimators_[0]

# Write it out in Graphviz DOT format, labelling splits with the feature names
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Unpacking requires the DOT file to yield exactly one graph
[graph] = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png');

print('The depth of this tree is:', tree.tree_.max_depth)

In [None]:
# Importance score of every feature, in feature_list order
importances = list(rf.feature_importances_)

# Pair each feature name with its score rounded to two decimals,
# ordered from most to least important
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(feature_list, importances)),
    key=lambda name_score: name_score[1],
    reverse=True,
)

# One line per feature
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

## Evaluation with the Two Most Important Features
(may remove later)

In [None]:
# Refit a forest restricted to the two features the full model ranks highest,
# to see how much predictive quality survives with a much smaller input.
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Column positions of the two largest importances in the fitted `rf`.
# Derived from the model instead of hard-coded column names so the cell works
# for whatever CSV was loaded.
important_indices = np.argsort(rf.feature_importances_)[::-1][:2].tolist()
print('Selected features:', [feature_list[i] for i in important_indices])
train_important = train_x[:, important_indices]
test_important = test_x[:, important_indices]

# Train on the reduced feature set and predict on the held-out rows
rf_most_important.fit(train_important, train_y)
predictions = rf_most_important.predict(test_important)

errors = abs(predictions - test_y)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

# Mean absolute percentage error against the held-out labels (test_y);
# "accuracy" is reported as 100 minus that percentage.
mape = np.mean(100 * (errors / test_y))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

# Visualization

In [None]:
# Apply the plotting theme (affects every later figure as well)
plt.style.use('fivethirtyeight')

# One bar per feature, positioned 0..n-1 and labelled with the feature name
x_values = [position for position, _ in enumerate(importances)]
plt.bar(x_values, importances, orientation='vertical')
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title; trailing semicolon hides the returned Text repr
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');

In [None]:
# A ROC curve is only defined for (binary) classifiers, so calling
# RocCurveDisplay.from_estimator on the RandomForestRegressor raises.
# For a regressor, plot predicted against actual values instead: points on
# the dashed diagonal are perfect predictions.
test_predictions = rf.predict(test_x)

ax = plt.gca()
ax.scatter(test_y, test_predictions, alpha=0.8)

# Identity line spanning the combined range of actual and predicted values
lower = min(np.min(test_y), np.min(test_predictions))
upper = max(np.max(test_y), np.max(test_predictions))
ax.plot([lower, upper], [lower, upper], linestyle='--', color='gray')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Predicted vs. Actual (test set)')
plt.show()