# Random Forest Model for Occupancy Prediction in an Office Building

## Import Modules

In [None]:
## Data Analytics

import pandas as pd
import numpy as np
import datetime
import random


## Plots

# Import matplotlib and seaborn for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Set the style for plots
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set_theme(style='whitegrid', font='Arial', rc={'figure.figsize':(10,5),
            'font.size':14,
            'axes.titlesize':16,
            'axes.labelsize':15,
            'xtick.labelsize': 12,
            'ytick.labelsize': 12,
            'legend.fontsize': 13},color_codes=True)
# Pydot is used for visualization
import pydot


## Machine Learning

# Scikit-learn
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Cross Validation
from sklearn.model_selection import TimeSeriesSplit
# Import the model that is used - Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 
# Import tools needed for visualization
from sklearn.tree import export_graphviz
# Import function to calculate accuracy
from sklearn.metrics import accuracy_score
# Confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
# Precision and recall
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Import autocorrelation function
from statsmodels.graphics.tsaplots import plot_acf
# Import Matthews Correlation Coefficient
from sklearn.metrics import matthews_corrcoef



# CSV
import csv

## Functions

In [None]:
# Function 1
def predict_with_model(fitted_model, X_val, y_val, output=True):
    '''
    Predict with an already fitted model and score the prediction

    Inputs:
    fitted_model: model that was fitted on a training set; only its predict method is called
    X_val (np.array): feature vectors of the validation (or test) data
    y_val (np.array): true labels of the validation (or test) data
    output (bool): print the metrics if True (default)

    Outputs:
    y_pred: prediction of the model for X_val
    metrics_dict (dict): the five metrics below under the keys
                         'accuracy', 'precision', 'recall', 'f1' and 'mcc'
    accuracy (float): Accuracy of the prediction
    precision (float): Precision of the prediction
    recall (float): Recall of the prediction
    f1 (float): F1-Score of the prediction
    mcc (float): Matthews Correlation Coefficient of the prediction
    '''
    y_pred = fitted_model.predict(X_val)

    # Every scorer has the signature (true labels, predicted labels)
    scorers = (('accuracy', accuracy_score),
               ('precision', precision_score),
               ('recall', recall_score),
               ('f1', f1_score),
               ('mcc', matthews_corrcoef))
    metrics_dict = {key: scorer(y_val, y_pred) for key, scorer in scorers}

    if output:
        # The share of positive labels puts the accuracy into perspective
        print('Mean of true labels:', round(np.mean(y_val), 2))
        printed_labels = (('Accuracy:', 'accuracy'),
                          ('Precision:', 'precision'),
                          ('Recall:', 'recall'),
                          ('F1-Score:', 'f1'),
                          ('MCC:', 'mcc'))
        for printed_label, key in printed_labels:
            print(printed_label, metrics_dict[key])

    return (y_pred,
            metrics_dict,
            metrics_dict['accuracy'],
            metrics_dict['precision'],
            metrics_dict['recall'],
            metrics_dict['f1'],
            metrics_dict['mcc'])

In [None]:
# Function 6
def calculate_feature_importances(model, features, output=True):
    '''
    Function to calculate feature importances of a random forest model

    Inputs:
    model: fitted model that exposes a feature_importances_ attribute
    features (lst): list containing the names of the features, expected in the
                    same order as the values in model.feature_importances_
    output: print results True or False (True by default)

    Outputs:
    importances (lst): list with all (unrounded) importances, not sorted
                       (same order as features in input)
    feature_importances (lst): list of (feature, importance) tuples with the
                               importance rounded to 3 decimals, sorted from
                               highest to lowest importance
    '''
    # Get numerical feature importances
    importances = list(model.feature_importances_)

    # Pair every importance with the name passed in by the caller; relying on
    # a module-level name list instead would mislabel models that were fitted
    # on a different set of columns
    feature_importances = [(feature, round(importance, 3))
                           for feature, importance in zip(features, importances)]

    # Sort from highest to lowest importance (stable, ties keep input order)
    feature_importances.sort(key=lambda pair: pair[1], reverse=True)

    if output:
        # Print out the features and their importances
        for pair in feature_importances:
            print('Variable: {:20} Importance: {}'.format(*pair))

    return importances, feature_importances

In [None]:
# Function 7
def plot_roc(model, X_test, y_test):
    '''
    Function to plot the Receiver Operating Characteristic curve in desired layout

    Inputs:
    model: on a training set fitted model that provides predict_proba
    X_test (np.array): numpy array with feature vectors of test data
    y_test (np.array): numpy array with true labels of test data

    Outputs:
    plot of ROC on a new figure

    Uses the module-level variables model_name (legend) and e (title).
    '''
    # Score of the positive class; used for the curve AND the area so that the
    # number in the legend is the area under the plotted curve (scoring the
    # hard 0/1 predictions would describe a different, single-threshold curve)
    positive_scores = model.predict_proba(X_test)[:, 1]
    model_roc_auc = roc_auc_score(y_test, positive_scores)
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, label= 'Random Forest '+model_name+' (area = %0.2f)' % model_roc_auc, color='darkblue')
    # Diagonal = performance of a random classifier
    plt.plot([0,1], [0,1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic - '+e)
    plt.legend(loc="lower right")

In [None]:
# Function 8
def plot_prc(model, X_test, y_test, y_pred):
    '''
    Function to plot the Precision-Recall Curve in desired layout

    Inputs:
    model: on a training set fitted and on a test set tested model
    X_test (np.array): numpy array with feature vectors of test data
    y_test (np.array): numpy array with true labels of test data
    y_pred (np.array): numpy array with predictions of test data

    Outputs:
    printed average precision score and plot of the Precision-Recall Curve
    '''
    # NOTE(review): the score is computed from the values passed in y_pred; if
    # these are hard 0/1 predictions it can differ from the curve drawn from
    # the model itself - confirm this is intended
    average_precision = average_precision_score(y_test, y_pred)
    print('Average precision-recall score: {0:0.4f}'.format(average_precision))

    plot_title = '2-class Precision-Recall curve: AP={0:0.4f}'.format(average_precision)
    disp = plot_precision_recall_curve(model, X_test, y_test, color='darkblue')
    disp.ax_.set_title(plot_title)

In [None]:
# Function 8
def plot_actual_predicted_values(X_test_vis, y_test, y_pred):
    '''
    Function to plot actual and predicted values over time in desired layout

    Inputs:
    X_test_vis (np.array): numpy array with feature vectors of test data
                            including columns of month, day, year, hour, minute, second (not one-hot encoded);
                            the column positions are looked up in the module-level list data_list
    y_test (np.array): numpy array with true labels of test data
    y_pred (np.array): numpy array with predictions of test data

    Outputs:
    plot of actual and predicted values on a new figure
    (the title uses the module-level variable e)
    '''
    # Calendar columns of the test data, in the argument order of datetime.datetime
    date_columns = [X_test_vis[:, data_list.index(column)]
                    for column in ('Year', 'Month', 'Day', 'Hour', 'Minute', 'Second')]

    # Build the timestamps directly instead of formatting and re-parsing strings
    test_dates = [datetime.datetime(*(int(part) for part in parts))
                  for parts in zip(*date_columns)]

    # Dataframe with true values and dates
    true_data = pd.DataFrame(data = {'date': test_dates, 'actual': y_test})

    # Dataframe with predictions and dates
    predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': y_pred})

    plt.figure(figsize=(20,5))
    # Plot the actual values (German label; English: 'actual')
    plt.plot(true_data['date'], true_data['actual'],'darkblue', label = 'tatsächlicher Wert')
    # Plot the predicted values (German label; English: 'prediction')
    plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'vorhergesagter Wert')
    # Rotation must be a number; newer matplotlib versions reject the string '60'
    plt.xticks(rotation = 60)
    # NOTE(review): fixed window in matplotlib date units - presumably chosen
    # for one specific data set; confirm before reusing with other data
    plt.xlim(13330, 13515)
    plt.yticks([0,1])
    plt.legend()
    # Graph labels (English: 'Date', 'Occupancy', 'Actual and Predicted Values')
    plt.xlabel('Datum')
    plt.ylabel('Belegung')
    plt.title('Tatsächliche und vorhergesagte Werte - '+e)


## Variables

In [None]:
# a = room number
a = 'E07'
# b = number of lags used as input
b = '5'
# c = number of the last timestep to predict (only used with the for loop when multiple timesteps are calculated)
#c = 1
# t = timesteps for which separate plots are created
t = [6, 12, 18, 24, 30, 36]
# d = number of the actual timestep to predict
d = 1
# e = timestep as string: 't' for the current timestep (d == 0), otherwise 't+x'
# (the other cells special-case d == 0, so the label name has to follow suit)
e = 't' if d == 0 else 't+'+str(d)

## Loop

In [None]:
# for d in range(0,c+1):
    
#     if d==0:
#         # Timestep as string
#         e = 't'
#     else:
#         # Timestep as string
#         e = 't+'+str(d)

## Import Data

In [None]:
# Load the pre-processed data set
raw_data = pd.read_csv(r"\Pre-Processing\data_E07_input_5_output_144.txt", parse_dates=True)
# Build the working frame separately from raw_data: convert the timestamp
# column and use it as index
data = raw_data.assign(DateTime=pd.to_datetime(raw_data['DateTime'])).set_index('DateTime')

## Edit columns

In [None]:
# Remove the 'Second_0' column from the inputs
data = data.drop(columns='Second_0')

In [None]:
# Remove every future timestamp column 't+1' ... 't+144' except the target
# 't+d', so that no other future value is used as input for this model.
# For d == 0 the first range is empty and all 144 columns are removed.
other_timesteps = list(range(1, d)) + list(range(d + 1, 145))
data = data.drop(columns=['t+' + str(step) for step in other_timesteps])

## Data Preparation

In [None]:
# Append calendar columns derived from the DateTime index at the end of the
# data set; they are only needed later for visualization
timestamps = data.index
calendar_columns = [('Year', timestamps.year),
                    ('Month', timestamps.month),
                    ('Day', timestamps.day),
                    ('Weekday Name', timestamps.day_name()),
                    ('Hour', timestamps.hour),
                    ('Minute', timestamps.minute),
                    ('Second', timestamps.second)]
for column_name, column_values in calendar_columns:
    data[column_name] = column_values

In [None]:
# Labels are the values we want to predict: the column of the selected
# timestep e, converted to a numpy array
labels = np.array(data[e])

In [None]:
# Remove the label column from the data so it cannot be used as a feature
# (axis 1 refers to the columns)
data = data.drop(e, axis = 1)

In [None]:
# Save the column names in column order; needed to look up column positions
# once the data has been converted to a numpy array
data_list = list(data.columns)

In [None]:
# Keep a DataFrame copy before data is replaced by a numpy array
data_copy = data.copy()

In [None]:
# Convert to numpy array (from here on data has no column names; use data_list)
data = np.array(data)

In [None]:
# Model inputs are all columns except the calendar columns that were added
# for visualization
visualization_columns = ['Year', 'Month','Day','Hour','Minute','Second','Weekday Name']
feature_names = [column for column in data_list if column not in visualization_columns]
# Positions of the feature columns inside the full array
indices = [data_list.index(column) for column in feature_names]
data_to_use = data[:, indices]

## Split Dataset

In [None]:
# Split data into training and testing sets using scikit-learn
# 25 % so that about 6 months are covered in the test set
# shuffle = False because of time series data (the test set is the last part)
split_options = {'test_size': 0.25, 'shuffle': False}

# Feature columns only: used for fitting and evaluating the models
train_data, test_data, train_labels, test_labels = train_test_split(
    data_to_use, labels, **split_options)
# All columns including the calendar columns: used for the time plots
train_data_vis, test_data_vis, train_labels_vis, test_labels_vis = train_test_split(
    data, labels, **split_options)

In [None]:
# Print the shapes of the model split first, then of the visualization split
shape_report = [('Training Data Shape:', train_data),
                ('Training Labels Shape:', train_labels),
                ('Testing Data Shape:', test_data),
                ('Testing Labels Shape:', test_labels),
                ('Training Data Shape:', train_data_vis),
                ('Training Labels Shape:', train_labels_vis),
                ('Testing Data Shape:', test_data_vis),
                ('Testing Labels Shape:', test_labels_vis)]
for description, array in shape_report:
    print(description, array.shape)

## Random Forest Modeling

### Base Model

In [None]:
# Instantiate base model with 100 decision trees and a fixed random seed
rf_base = RandomForestClassifier(n_estimators = 100, random_state=42)
# Model name; used in plot legends and in the names of all saved files
model_name = 'base'

In [None]:
# Train the base model on the training data (feature columns only)
rf_base.fit(train_data, train_labels)

In [None]:
# Calculate accuracy on the training set; written to the results file as
# 'Accuracy - Training Set' next to the test accuracy
accuracy_train_rf_base = rf_base.score(train_data, train_labels)

In [None]:
# Make predictions on the test set and calculate the metrics
# (the metrics are also printed because output defaults to True)
predictions_rf_base, metrics_rf_base, accuracy_rf_base, precision_rf_base, recall_rf_base, f1_rf_base, mcc_rf_base = predict_with_model(rf_base, test_data, test_labels)

In [None]:
# Calculate feature importances without printing them:
# importances_rf_base is the unsorted list, feature_importances_rf_base the
# sorted list of (feature, importance) tuples
importances_rf_base, feature_importances_rf_base = calculate_feature_importances(rf_base, feature_names, output=False)

#### Visualizations

In [None]:
## Feature importances

# Select the model whose importances are plotted
importances = importances_rf_base
features = feature_names

# Bar positions and bar heights (importance in percent)
x_values = list(range(len(importances)))
importances_percent = [100 * share for share in importances]

# Bar chart with the feature names as tick labels
plt.figure(figsize=(17,5))
plt.bar(x_values, importances_percent, orientation = 'vertical', color='darkblue', align='center')
plt.xticks(x_values, features, rotation = 'vertical')
# The upper x limit is one smaller for d == 0 than for the other timesteps
plt.xlim(-1, 91 if d == 0 else 92)

# Axis labels and title (English: 'Importance in %', 'Feature', 'Feature Importances - ')
plt.ylabel('Einfluss in %')
plt.xlabel('Merkmal')
plt.title('Einfluss der Merkmale auf die Vorhersage - ' + e)

plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_feature_importances.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Additional feature importance plot, only for the timesteps listed in t
if d in t:
    # Bar positions and bar heights (importance in percent)
    x_values = list(range(len(importances)))
    importances_percent = [100 * share for share in importances]

    plt.figure(figsize=(17,5))
    plt.bar(x_values, importances_percent, orientation = 'vertical', color='darkblue', align='center')
    # Tick labels are the column indices here, not the feature names
    plt.xticks(x_values, indices, rotation = 'vertical')
    # Fixed axis limits for this plot
    plt.xlim(-1, 92)
    plt.ylim(0, 12)

    # Axis labels and title (English: 'Importance in %', 'Feature', 'Feature Importances - ')
    plt.ylabel('Einfluss in %')
    plt.xlabel('Merkmal')
    plt.title('Einfluss der Merkmale auf die Vorhersage - ' + e)

    plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_feature_importances_additional.pdf',bbox_inches='tight', dpi=100)
    plt.close()

In [None]:
# Actual and predicted values of the base model over time; the visualization
# split is passed because it still contains the calendar columns
plot_actual_predicted_values(test_data_vis, test_labels, predictions_rf_base)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_actual_and_predicted_values.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Autocorrelation plot of the base model's test set predictions, saved as PDF
plot_acf(predictions_rf_base)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_autocorrelation.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
## Confusion Matrix

# Raw counts on the console
print(confusion_matrix(test_labels, predictions_rf_base))
np.set_printoptions(precision=2)

# One plot with absolute counts and one normalized over the true labels
# (German titles: "Konfusionsmatrix ohne Normalisierung" / "Normalisierte Konfusionsmatrix")
titles_options = [("Confusion matrix without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(rf_base, test_data, test_labels, normalize=normalize, cmap=plt.cm.Blues)
    disp.ax_.set_title(title)
    # The title is part of the file name, so each variant gets its own PDF
    plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_'+title+'.pdf',bbox_inches='tight', dpi=100)
    plt.close()

In [None]:
# Precision Recall Curve of the base model on the test set, saved as PDF
# (plot_prc also prints the average precision score)

plot_prc(rf_base, test_data, test_labels, predictions_rf_base)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_precision_recall_curve.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# ROC curve of the base model on the test set, saved as PDF
plot_roc(rf_base, test_data, test_labels)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_ROC.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
## Cumulative Importances

# Split the ranked (feature, importance) tuples into two parallel lists,
# sorted from most to least important
sorted_importances = [importance for _, importance in feature_importances_rf_base]
sorted_features = [feature for feature, _ in feature_importances_rf_base]

# Running total of the importances, as fraction and in percent
cumulative_importances = np.cumsum(sorted_importances)
cumulative_importances_percent = [100 * share for share in cumulative_importances]

# Point plot; x_values is reused from the feature importance plot
f = plt.subplots(figsize=(17,5))
plt.plot(x_values, cumulative_importances_percent, 'darkblue', linestyle=' ', marker='.')
# Reference lines at 95 % and 90 % of importance retained
for threshold in (95, 90):
    plt.hlines(y=threshold, xmin=-1, xmax=len(sorted_importances), color='darkgrey', linestyles = '-')
# Feature names as tick labels
plt.xticks(x_values, sorted_features, rotation = 'vertical')
plt.xlim(-1, 91 if d == 0 else 92)

# Axis labels and title (English: 'Feature', 'Cumulative Importance in %', 'Cumulative Importances - ')
plt.xlabel('Merkmal')
plt.ylabel('Kumulierter Einfluss in %')
plt.title('Kumulierter Einfluss der Merkmale - '+e)

plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_cumulative_importances.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Number of features needed to exceed 95 % cumulative importance:
# position of the first value above the threshold, plus 1 because positions
# are zero-based
cumulative_importance_95 = np.flatnonzero(cumulative_importances > 0.95)[0] + 1
print('Number of features for 95% importance: ', cumulative_importance_95)

# Same for 90 % cumulative importance
cumulative_importance_90 = np.flatnonzero(cumulative_importances > 0.90)[0] + 1
print('Number of features for 90% importance: ', cumulative_importance_90)

#### Save results

In [None]:
# Header row of the importances file: 'Timestep' followed by one
# (Feature, Importance) column pair per rank
header = ['Timestep',
          '1st Feature', '1st Importance',
          '2nd Feature', '2nd Importance',
          '3rd Feature', '3rd Importance']
# Ranks 4 to 94 all get the suffix 'th'
for i in range(4, 95):
    k = str(i) + 'th'
    header.append(k + ' Feature')
    header.append(k + ' Importance')

# Data row: the timestep, then feature name and importance alternating,
# from most to least important feature
lst1 = [k for (k, v) in feature_importances_rf_base]
lst2 = [v for (k, v) in feature_importances_rf_base]

line = [e]
for feature, importance in zip(lst1, lst2):
    line.extend((feature, importance))

In [None]:
# The first timestep (d == 1) creates both csv files and writes their header
# rows; every other timestep appends one row to the existing files
first_timestep = d == 1
file_mode = 'w' if first_timestep else 'a'

# Metrics of this timestep
with open('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_results.csv', file_mode, newline='') as file:
    writer = csv.writer(file)
    if first_timestep:
        writer.writerow(['Timestep','Accuracy - Training Set', 'Accuracy - Test Set','Precision', 'Recall','F1-Score', 'Matthews Correlation Coefficient', '95%', '90%'])
    writer.writerow([e, accuracy_train_rf_base, accuracy_rf_base, precision_rf_base, recall_rf_base, f1_rf_base, mcc_rf_base, cumulative_importance_95, cumulative_importance_90])

# Ranked feature importances of this timestep
with open('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_feature_importances.csv', file_mode, newline='') as file:
    writer = csv.writer(file)
    if first_timestep:
        writer.writerow(header)
    writer.writerow(line)

### Model With Tuned Hyperparameter

In [None]:
# Instantiate model with tuned hyperparameters: in addition to the settings of
# the base model, max_depth, min_samples_split and min_samples_leaf are set
rf_hyp = RandomForestClassifier(n_estimators = 100, max_depth = 60, min_samples_split = 5, min_samples_leaf = 1, random_state = 42)
# Model name; used in plot legends and in the names of all saved files
model_name = 'hyp'

In [None]:
# Train the tuned model on the training data (feature columns only)
rf_hyp.fit(train_data, train_labels)

In [None]:
# Calculate accuracy on the training set; written to the results file as
# 'Accuracy - Training Set' next to the test accuracy
accuracy_train_rf_hyp = rf_hyp.score(train_data, train_labels)

In [None]:
# Make predictions on the test set and calculate the metrics
# (the metrics are also printed because output defaults to True)
predictions_rf_hyp, metrics_rf_hyp, accuracy_rf_hyp, precision_rf_hyp, recall_rf_hyp, f1_rf_hyp, mcc_rf_hyp = predict_with_model(rf_hyp, test_data, test_labels)

In [None]:
# Calculate feature importances without printing them:
# importances_rf_hyp is the unsorted list, feature_importances_rf_hyp the
# sorted list of (feature, importance) tuples
importances_rf_hyp, feature_importances_rf_hyp = calculate_feature_importances(rf_hyp, feature_names, output=False)

#### Visualizations

In [None]:
## Feature importances

# Select the model whose importances are plotted
importances = importances_rf_hyp
features = feature_names

# Bar positions and bar heights (importance in percent)
x_values = list(range(len(importances)))
importances_percent = [100 * share for share in importances]

# Bar chart with the feature names as tick labels
plt.figure(figsize=(17,5))
plt.bar(x_values, importances_percent, orientation = 'vertical', color='darkblue', align='center')
plt.xticks(x_values, features, rotation = 'vertical')
# The upper x limit is one smaller for d == 0 than for the other timesteps
plt.xlim(-1, 91 if d == 0 else 92)

# Axis labels and title (English: 'Importance in %', 'Feature', 'Feature Importances - ')
plt.ylabel('Einfluss in %')
plt.xlabel('Merkmal')
plt.title('Einfluss der Merkmale auf die Vorhersage - ' + e)

plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_feature_importances.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Additional feature importance plot, only for the timesteps listed in t
if d in t:
    # Bar positions and bar heights (importance in percent)
    x_values = list(range(len(importances)))
    importances_percent = [100 * share for share in importances]

    plt.figure(figsize=(17,5))
    plt.bar(x_values, importances_percent, orientation = 'vertical', color='darkblue', align='center')
    # Tick labels are the column indices here, not the feature names
    plt.xticks(x_values, indices, rotation = 'vertical')
    # Fixed axis limits for this plot
    plt.xlim(-1, 92)
    plt.ylim(0, 12)

    # Axis labels and title (English: 'Importance in %', 'Feature', 'Feature Importances - ')
    plt.ylabel('Einfluss in %')
    plt.xlabel('Merkmal')
    plt.title('Einfluss der Merkmale auf die Vorhersage - ' + e)

    plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_feature_importances_additional.pdf',bbox_inches='tight', dpi=100)
    plt.close()

In [None]:
# Actual and predicted values of the tuned model over time; the visualization
# split is passed because it still contains the calendar columns
plot_actual_predicted_values(test_data_vis, test_labels, predictions_rf_hyp)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_actual_and_predicted_values.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Autocorrelation plot of the tuned model's test set predictions, saved as PDF
plot_acf(predictions_rf_hyp)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_autocorrelation.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
## Confusion Matrix

# Raw counts on the console
print(confusion_matrix(test_labels, predictions_rf_hyp))
np.set_printoptions(precision=2)

# One plot with absolute counts and one normalized over the true labels
# (German titles: "Konfusionsmatrix ohne Normalisierung" / "Normalisierte Konfusionsmatrix")
titles_options = [("Confusion matrix without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(rf_hyp, test_data, test_labels, normalize=normalize, cmap=plt.cm.Blues)
    disp.ax_.set_title(title)
    # The title is part of the file name, so each variant gets its own PDF
    plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_'+title+'.pdf',bbox_inches='tight', dpi=100)
    plt.close()

In [None]:
# Precision Recall Curve of the tuned model on the test set, saved as PDF
# (plot_prc also prints the average precision score)

plot_prc(rf_hyp, test_data, test_labels, predictions_rf_hyp)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_precision_recall_curve.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# ROC curve of the tuned model on the test set, saved as PDF
plot_roc(rf_hyp, test_data, test_labels)
plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_ROC.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
## Cumulative Importances

# Split the ranked (feature, importance) tuples into two parallel lists,
# sorted from most to least important
sorted_importances = [importance for _, importance in feature_importances_rf_hyp]
sorted_features = [feature for feature, _ in feature_importances_rf_hyp]

# Running total of the importances, as fraction and in percent
cumulative_importances = np.cumsum(sorted_importances)
cumulative_importances_percent = [100 * share for share in cumulative_importances]

# Point plot; x_values is reused from the feature importance plot
f = plt.subplots(figsize=(17,5))
plt.plot(x_values, cumulative_importances_percent, 'darkblue', linestyle=' ', marker='.')
# Reference lines at 95 % and 90 % of importance retained
for threshold in (95, 90):
    plt.hlines(y=threshold, xmin=-1, xmax=len(sorted_importances), color='darkgrey', linestyles = '-')
# Feature names as tick labels
plt.xticks(x_values, sorted_features, rotation = 'vertical')
plt.xlim(-1, 91 if d == 0 else 92)

# Axis labels and title (English: 'Feature', 'Cumulative Importance in %', 'Cumulative Importances - ')
plt.xlabel('Merkmal')
plt.ylabel('Kumulierter Einfluss in %')
plt.title('Kumulierter Einfluss der Merkmale - '+e)

plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_cumulative_importances.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Number of features needed to exceed 95 % cumulative importance:
# position of the first value above the threshold, plus 1 because positions
# are zero-based
cumulative_importance_95 = np.flatnonzero(cumulative_importances > 0.95)[0] + 1
print('Number of features for 95% importance: ', cumulative_importance_95)

# Same for 90 % cumulative importance
cumulative_importance_90 = np.flatnonzero(cumulative_importances > 0.90)[0] + 1
print('Number of features for 90% importance: ', cumulative_importance_90)

#### Save results

In [None]:
# Header row of the importances file: 'Timestep' followed by one
# (Feature, Importance) column pair per rank
header = ['Timestep',
          '1st Feature', '1st Importance',
          '2nd Feature', '2nd Importance',
          '3rd Feature', '3rd Importance']
# Ranks 4 to 94 all get the suffix 'th'
for i in range(4, 95):
    k = str(i) + 'th'
    header.append(k + ' Feature')
    header.append(k + ' Importance')

# Data row: the timestep, then feature name and importance alternating,
# from most to least important feature
lst1 = [k for (k, v) in feature_importances_rf_hyp]
lst2 = [v for (k, v) in feature_importances_rf_hyp]

line = [e]
for feature, importance in zip(lst1, lst2):
    line.extend((feature, importance))

In [None]:
# The first timestep (d == 1) creates both csv files and writes their header
# rows; every other timestep appends one row to the existing files
first_timestep = d == 1
file_mode = 'w' if first_timestep else 'a'

# Metrics of this timestep
with open('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_results.csv', file_mode, newline='') as file:
    writer = csv.writer(file)
    if first_timestep:
        writer.writerow(['Timestep','Accuracy - Training Set', 'Accuracy - Test Set','Precision', 'Recall','F1-Score', 'Matthews Correlation Coefficient', '95%', '90%'])
    writer.writerow([e, accuracy_train_rf_hyp, accuracy_rf_hyp, precision_rf_hyp, recall_rf_hyp, f1_rf_hyp, mcc_rf_hyp, cumulative_importance_95, cumulative_importance_90])

# Ranked feature importances of this timestep
with open('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_feature_importances.csv', file_mode, newline='') as file:
    writer = csv.writer(file)
    if first_timestep:
        writer.writerow(header)
    writer.writerow(line)

### Model with three most important features

In [None]:
# Instantiate the model for the three most important features; it uses the
# same settings as the base model (100 trees, random_state=42) - no tuned
# hyperparameters are passed here
rf_three = RandomForestClassifier(n_estimators = 100, random_state=42)
# Model name; used in plot legends and in the names of all saved files
model_name = 'three_most_important_features'

In [None]:
# Extract the three most important features, in order of importance, from the
# random forest with tuned hyperparameters
sorted_features = [importance[0] for importance in feature_importances_rf_hyp]
three_feature_names = sorted_features[:3]

# train_data and test_data only contain the columns listed in feature_names,
# so the column positions have to be looked up there; positions in data_list
# (all columns) only match as long as no other column precedes a feature
three_indices = [feature_names.index(name) for name in three_feature_names]
train_three = train_data[:, three_indices]
test_three = test_data[:, three_indices]

In [None]:
# Train the model on the three selected columns of the training data
rf_three.fit(train_three, train_labels)

In [None]:
# Calculate accuracy on the training set; written to the results file as
# 'Accuracy - Training Set' next to the test accuracy
accuracy_train_rf_three = rf_three.score(train_three, train_labels)

In [None]:
# Make predictions on the test set (three selected columns) and calculate the
# metrics (the metrics are also printed because output defaults to True)
predictions_rf_three, metrics_rf_three, accuracy_rf_three, precision_rf_three, recall_rf_three, f1_rf_three, mcc_rf_three = predict_with_model(rf_three, test_three, test_labels)

In [None]:
# Calculate feature importances of the three-feature model without printing
# them; the names of the three selected features are passed as feature list
importances_rf_three, feature_importances_rf_three = calculate_feature_importances(rf_three, three_feature_names, output=False)

#### Visualization

In [None]:
## Feature importances

# Select the model whose importances are plotted
importances = importances_rf_three
features = three_feature_names

# Bar positions and bar heights (importance in percent)
x_values = list(range(len(importances)))
importances_percent = [100 * share for share in importances]

# Bar chart with the feature names as tick labels
plt.bar(x_values, importances_percent, orientation = 'vertical', color='darkblue')
plt.xticks(x_values, features, rotation = 'vertical')

# Axis labels and title (English: 'Importance in %', 'Feature', 'Feature Importances - ')
plt.ylabel('Einfluss in %')
plt.xlabel('Merkmal')
plt.title('Einfluss der Merkmale auf die Vorhersage - '+e)

plt.savefig('Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e+'_feature_importances.pdf',bbox_inches='tight', dpi=100)
plt.close()

In [None]:
# Plot the test labels against the predictions of the three-feature model
# (plot_actual_predicted_values is defined earlier in the notebook), then
# save the figure as PDF and close it
plot_actual_predicted_values(test_data_vis, test_labels, predictions_rf_three)
file_stem = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e
plt.savefig(file_stem+'_actual_and_predicted_values.pdf', dpi=100, bbox_inches='tight')
plt.close()

In [None]:
## Confusion Matrix - printed as text and saved as two PDF plots

# Raw confusion matrix of the three-feature model on the test set
print(confusion_matrix(test_labels, predictions_rf_three))

# Limit numpy's printed precision to two decimals (global setting)
np.set_printoptions(precision=2)

# (plot title, normalize argument) for each variant; the title is also part of the file name
# German titles: "Konfusionsmatrix ohne Normalisierung" / "Normalisierte Konfusionsmatrix"
matrix_variants = [
    ("Confusion matrix without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
file_stem = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e
for heading, norm_mode in matrix_variants:
    matrix_plot = plot_confusion_matrix(
        rf_three, test_three, test_labels,
        normalize=norm_mode,
        cmap=plt.cm.Blues,
    )
    matrix_plot.ax_.set_title(heading)
    plt.savefig(file_stem+'_'+heading+'.pdf', dpi=100, bbox_inches='tight')
    plt.close()

#### Save results

In [None]:
# Store the metrics of this run in the results CSV of the three-feature model.
# When d equals 1 the file is (re)created and gets a header row first;
# otherwise the metrics row is appended to the existing file.
results_path = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_results.csv'
starts_new_file = d==1
metrics_row = [e, accuracy_train_rf_three, accuracy_rf_three, precision_rf_three, recall_rf_three, f1_rf_three, mcc_rf_three]

with open(results_path, 'w' if starts_new_file else 'a', newline='') as results_file:
    results_writer = csv.writer(results_file)
    if starts_new_file:
        results_writer.writerow(['Timestep','Accuracy - Training Set', 'Accuracy - Test Set','Precision', 'Recall','F1-Score', 'Matthews Correlation Coefficient'])
    results_writer.writerow(metrics_row)

### Model with five most important features

In [None]:
# Label used in the file names of every plot and result file of this variant
model_name = 'five_most_important_features'
# Random forest for the reduced five-feature input: 100 trees, fixed seed
# so that repeated runs build the same forest
rf_five = RandomForestClassifier(random_state=42, n_estimators=100)

In [None]:
# Feature names taken from the first entry of each element of
# feature_importances_rf_hyp.
# NOTE(review): assumes that list is already ordered by descending importance - confirm
sorted_features = [entry[0] for entry in feature_importances_rf_hyp]

# Names of the first five ranked features (raises IndexError if fewer exist)
five_feature_names = [sorted_features[rank] for rank in range(5)]

# Column positions of those features inside the feature matrices
five_indices = [data_list.index(feature_name) for feature_name in five_feature_names]

# Keep only the selected columns of the training and test matrices
train_five = train_data[:, five_indices]
test_five = test_data[:, five_indices]

In [None]:
# Fit the five-feature forest on the reduced training matrix
rf_five.fit(X=train_five, y=train_labels)

In [None]:
# Mean accuracy of the five-feature model on the data it was trained on
accuracy_train_rf_five = rf_five.score(X=train_five, y=train_labels)

In [None]:
# Evaluate the five-feature model on the test set; predict_with_model
# (defined earlier in the notebook) returns the predictions followed by the metrics
(
    predictions_rf_five,
    metrics_rf_five,
    accuracy_rf_five,
    precision_rf_five,
    recall_rf_five,
    f1_rf_five,
    mcc_rf_five,
) = predict_with_model(rf_five, test_five, test_labels)

In [None]:
# Feature importances of the five-feature model
# NOTE(review): output=False presumably suppresses printing inside the helper - confirm
(
    importances_rf_five,
    feature_importances_rf_five,
) = calculate_feature_importances(rf_five, five_feature_names, output=False)

#### Visualization

In [None]:
## Feature importances - bar chart saved as PDF

# Inputs of the chart; these two lines are the only ones that differ per model
importances = importances_rf_five
features = five_feature_names

# One bar per feature, heights converted from fractions to percent
x_values = list(range(len(importances)))
importances_percent = [share*100 for share in importances]
plt.bar(x_values, importances_percent, color='darkblue', orientation = 'vertical')

# Feature names as vertical tick labels on the x axis
plt.xticks(x_values, features, rotation = 'vertical')

# German axis labels and title
# (English equivalents: 'Importance in %', 'Feature', 'Feature Importances - '+e)
plt.ylabel('Einfluss in %')
plt.xlabel('Merkmal')
plt.title('Einfluss der Merkmale auf die Vorhersage - '+e)

# Write the figure to disk and close it so that it is not shown inline
file_stem = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e
plt.savefig(file_stem+'_feature_importances.pdf', dpi=100, bbox_inches='tight')
plt.close()

In [None]:
# Plot the test labels against the predictions of the five-feature model
# (plot_actual_predicted_values is defined earlier in the notebook), then
# save the figure as PDF and close it
plot_actual_predicted_values(test_data_vis, test_labels, predictions_rf_five)
file_stem = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e
plt.savefig(file_stem+'_actual_and_predicted_values.pdf', dpi=100, bbox_inches='tight')
plt.close()

In [None]:
## Confusion Matrix - printed as text and saved as two PDF plots

# Raw confusion matrix of the five-feature model on the test set
print(confusion_matrix(test_labels, predictions_rf_five))

# Limit numpy's printed precision to two decimals (global setting)
np.set_printoptions(precision=2)

# (plot title, normalize argument) for each variant; the title is also part of the file name
# German titles: "Konfusionsmatrix ohne Normalisierung" / "Normalisierte Konfusionsmatrix"
matrix_variants = [
    ("Confusion matrix without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
file_stem = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_forecast_'+e
for heading, norm_mode in matrix_variants:
    matrix_plot = plot_confusion_matrix(
        rf_five, test_five, test_labels,
        normalize=norm_mode,
        cmap=plt.cm.Blues,
    )
    matrix_plot.ax_.set_title(heading)
    plt.savefig(file_stem+'_'+heading+'.pdf', dpi=100, bbox_inches='tight')
    plt.close()

#### Save results

In [None]:
# Store the metrics of this run in the results CSV of the five-feature model.
# When d equals 1 the file is (re)created and gets a header row first;
# otherwise the metrics row is appended to the existing file.
results_path = 'Random_Forest_'+a+'_'+model_name+'_input_'+b+'_results.csv'
starts_new_file = d==1
metrics_row = [e, accuracy_train_rf_five, accuracy_rf_five, precision_rf_five, recall_rf_five, f1_rf_five, mcc_rf_five]

with open(results_path, 'w' if starts_new_file else 'a', newline='') as results_file:
    results_writer = csv.writer(results_file)
    if starts_new_file:
        results_writer.writerow(['Timestep','Accuracy - Training Set', 'Accuracy - Test Set','Precision', 'Recall','F1-Score', 'Matthews Correlation Coefficient'])
    results_writer.writerow(metrics_row)