In [1]:
import os

#import libraries for data wrangling
import pandas as pd
import numpy as np

#import libraries for plotting data
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#import random forest library


In [2]:
train_df, validation_df, test_df = pd.read_csv('data/train_fil_3.csv',index_col=0),pd.read_csv('data/validation_fil_3.csv',index_col=0),pd.read_csv('data/test_fil_3.csv',index_col=0)
for df in [train_df,validation_df,test_df]:
    df.columns = [int(col) for col in df.columns]
    df.index = pd.to_datetime(df.index)

In [3]:
    
#function to calculate MAPE for all observations where y_true is not 0
def mape(y_true, y_predict):
    '''Returns mean percentage error for all predictions where y_true is not 0. Where y_true is 0, the percentage error is 0 as well '''
    return np.mean([np.absolute(y_true[idx] - y_predict[idx])/y_true[idx] * 100 if y_true[idx] != 0 else 0 for idx,_ in enumerate(y_true) ])

def median_pe(y_true, y_predict):
    '''Returns mean percentage error for all predictions where y_true is not 0. Where y_true is 0, the percentage error is 0 as well '''
    return np.median([np.absolute(y_true[idx] - y_predict[idx])/y_true[idx] * 100 if y_true[idx] != 0 else 0 for idx,_ in enumerate(y_true) ])

def residuals(y_true,y_predict):
    '''Returns list with residuals for all observations where y_true not 0. Where y_true is 0, the residuals are 0 as well '''
    return [y_true[idx] - y_predict[idx] if y_true[idx] != 0 else 0 for idx,_ in enumerate(y_true) ]

def pct_residuals(y_true,y_predict):
    '''Returns list with percentage errors for all observations where y_true not 0. Where y_true is 0, the percentage error is 0 as well'''
    return [(y_true[idx] - y_predict[idx])/y_true[idx] * 100 if y_true[idx] != 0 else 0 for idx,_ in enumerate(y_true) ]

# Plot cumulative density function of residuals
def residual_cdf(data):
    '''Plots cdf of residual input data'''
    # sort the data:
    data_sorted = np.sort(data)

    # calculate the proportional values of samples
    p = 1. * np.arange(len(data)) / (len(data) - 1)

    # plot the sorted data:
    fig = plt.figure(figsize=(20,15))


    ax1 = fig.add_subplot(311)
    ax1.plot(data_sorted, p)
    ax1.set_title('Residuals Cumulative Distribution Function')
    ax1.set_xlabel('Residuals');
    ax1.set_ylabel('Cumulative Distribution');
    ax1.axvline(x=np.percentile(data,5),color='r') 
    ax1.axvline(x=np.percentile(data,95),color='r')

    ax2 = fig.add_subplot(312)
    ax2.plot([idx for idx,_ in enumerate(data)],data,'bo');
    ax2.plot([idx for idx,_ in enumerate(data)],np.zeros(len(data)),'r-');
    ax2.set_title('Residuals over time')
    ax2.set_xlabel('Time in days');
    ax2.set_ylabel('Residual');  
    
    #Here, we could also add Q-Q plot and auto correlation plot for the residual
    
def plot_prediction(y_true,y_predict):
    '''Plots true and predicted values on same y-axis'''
    fig = plt.figure(figsize=(20,15))
    ax1 = fig.add_subplot(311)
    ax1.plot(range(len(y_true)), y_true,'bo')
    ax1.plot(range(len(y_predict)),y_predict,'r-')
    ax1.set_title('Complete prediction')
    
    ax2 = fig.add_subplot(312)
    ax2.plot(range(len(y_true[:60])), y_true[:60],'bo')
    ax2.plot(range(len(y_predict[:60])),y_predict[:60],'r-o')
    ax2.set_title('Prediction first 60 days')
    ax2.set_ylim(0,max(y_true))
    ax3 = fig.add_subplot(313)
    ax3.plot(range(len(y_true[-60:])), y_true[-60:],'bo')
    ax3.plot(range(len(y_predict[-60:])),y_predict[-60:],'ro-')
    ax3.set_title('Prediction last 60 days')
    
def management_summary(y_true,y_predict):
    data = pd.DataFrame.from_dict({'y_true':y_true, 'y_predict':y_predict})
    
    #only regard data where y_true is not 0
    ex_0 = data[data['y_true'] != 0]
    
    #calculate how ofter we under- and over-estimate the revenue
    pct_lower = round(sum(ex_0.y_predict - ex_0.y_true < 0)/len(ex_0.y_true) * 100,1)
    pct_higher = round(100 - pct_lower,1)
    
    #calculate cumulative sums of under- and over estimation
    cumsum_lower = np.cumsum([np.abs(ex_0.y_predict[idx] - ex_0.y_true[idx]) if ex_0.y_predict[idx] < ex_0.y_true[idx] else 0 for idx,y in enumerate(ex_0.y_true) ])
    cumsum_higher = np.cumsum([np.abs(ex_0.y_predict[idx] - ex_0.y_true[idx]) if ex_0.y_predict[idx] > ex_0.y_true[idx] else 0 for idx,y in enumerate(ex_0.y_true)])

    
    fig = plt.figure(figsize=(20,15))
    ax1 = fig.add_subplot(211)
    ax1.plot(range(len(ex_0.y_true)), cumsum_lower,'b-o')
    ax1.plot(range(len(ex_0.y_true)),cumsum_higher,'r-o')
    
    ax1.set_title('Cumulative Sums of Under- and Over-Estimation')
    ax1.set_xlabel('Time')
    ax1.set_ylabel('Cumulated sum of errors')
    ax1.legend(['Under Estimation', 'Over Estimation', 'True Values'])
    
    ax2 = ax1.twinx()
    color = 'black'
    ax2.set_ylabel('Measured Values', color = color)
    ax2.plot(range(len(ex_0.y_true)),ex_0.y_true,'--', color=color, marker=10)
    
    return f'The model underestimates {pct_lower}% of the time'

# Add Features

In [4]:
#add features related to time
def df_add_timefeatures(dataframe):
    '''Adds features related to time that are generated from datetime index of dataframe'''
    dataframe['day_of_week'] = [x.weekday() +1 for x in dataframe.index]
    dataframe['day_of_month'] = [x.date().day for x in dataframe.index]
    dataframe['day_of_year'] = [x.dayofyear for x in dataframe.index]
    dataframe['weekofyear'] = [x.weekofyear for x in dataframe.index]
    dataframe['month'] = [x.month for x in dataframe.index]
    return dataframe


In [28]:
#add lag values to dataframe
def df_add_lagvalues(dataframe,y_column,lagvalues):
    for lag in lagvalues:
        #shift values 
        dataframe['lag_'+str(lag)] = dataframe[y_column].shift(periods=lag)
        #exchange the first NAN values for true y values
        #dataframe['lag_'+str(lag)][:lag] = dataframe[y_column][:lag]
    return dataframe

In [37]:
#get one value for lag of size n for step-forward prediction
def get_lagvalue(history,lagvalue):
    if len(history) < lagvalue:
        return np.nan
    else:
        return history[-(lagvalue):-(lagvalue-1)][0]
    

In [73]:
#add mean value for weekday in past n weeks to dataframe
def df_add_weekdaymean(dataframe,column,weeks):
    
    dataframe['weekdaymean_'+str(weeks)+'_weeks'] = [np.mean([get_lagvalue(dataframe[column][:idx],7*n) for n in range(1,weeks+1)]) for idx in range(len(dataframe))]
    return dataframe

In [75]:
#get mean value for weekday in past n weeks for one step in step-forward prediction
def get_weekdaymean(history,weeks):
    return np.mean([get_lagvalue(history,7*n) for n in range(1,weeks+1)])

In [169]:
def apply_featurefun(df,column,history_df=[]):
    if len(history_df) >0:
        dataframe = pd.concat((history_df[[column]],df[[column]]))
    else:
        dataframe = df[[column]]
    
    dataframe.columns = ['y']
    
    for i in [7,14]:
        dataframe['lag_'+str(i)] = [get_lagvalue(dataframe['y'][:idx],i) for idx in range(len(dataframe))]
    for i in [2,4]:
        dataframe['weekdaymean_'+str(i)+'_weeks'] = [get_weekdaymean(dataframe['y'][:idx],i) for idx in range(len(dataframe))]
            
    dataframe = df_add_timefeatures(dataframe)
    
    return dataframe[-len(df):]

In [174]:
train, validation, test = apply_featurefun(train_df,6), apply_featurefun(validation_df,6,train_df), apply_featurefun(test_df,6,validation_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

# Grid Search Random Forest Parameters for Best Results