In [1]:
# Start from a clean workspace: remove every object in the current environment.
rm(list = ls())
# Set the working directory to its current value, i.e. leave it unchanged.
# All relative paths used below are resolved against this directory.
setwd(getwd())

# Load libraries
library(foreign)
library(lubridate)
library(caret)
library(Metrics)
library(ggplot2)
library(vip)
library(earth)


Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union


Loading required package: lattice

Loading required package: ggplot2


Attaching package: 'Metrics'


The following objects are masked from 'package:caret':

    precision, recall



Attaching package: 'vip'


The following object is masked from 'package:utils':

    vi


Loading required package: Formula

Loading required package: plotmo

Loading required package: plotrix

Loading required package: TeachingDemos



In [2]:
# Execute Methods.R from the working directory. The model wrappers called in
# the training loop (mars, lin_reg, ran_forest, kNN, SVR) are presumably
# defined there -- TODO confirm.
source("Methods.R")

In [3]:
# Path of the CSV configuration sheet that is read in the next step
# (relative to the working directory).
input_file <- "Input1.csv"

# Preprocessing

In [4]:
# Load input file
# "UTF-8-BOM" makes the reader drop a leading byte-order mark if one is
# present, so it does not end up in the first column name.
input <- read.csv(input_file, fileEncoding="UTF-8-BOM")

In [5]:
# Extract the user's selections from the input sheet.
# Every option list is stored as a pair of columns: the option names and a
# flag column with the suffix ".1". An option counts as selected when its flag
# is greater than 0 (rows whose flag is NA are skipped by which()).

# 1. Methods to run
methods <- input[which(input[, 'Methods.1'] > 0), 'Methods']

# 2. Independent variables (predictor columns)
ind_var <- input[which(input[, 'Independent.variables.1'] > 0), 'Independent.variables']

# 3. Lines to test
t_lines <- input[which(input[, 'Tested.lines.1'] > 0), 'Tested.lines']

# 4. Size setting, taken from the first row of column Data.1. It is used in
#    the training loop as the upper limit on the number of sampled training rows.
size <- input[1, 'Data.1']


In [6]:
# Load predictor data (all but lineflows)
predictors <-read.csv("Data/Dataset - Lines and predictors.csv", fileEncoding="UTF-8-BOM")
# Parse the Time strings as day/month/year hour:minute. Strings that do not
# match the format become NA. No tz is passed, so the session time zone applies.
predictors$Time <- as.POSIXct(predictors$Time, format = "%d/%m/%Y %H:%M")

In [7]:
# Load unprocessed line data
# The Time column is still text at this point; it is converted to POSIXct in
# the data-cleaning step that follows.
lines_raw <- read.csv("Data/Lineload_rawdata.csv", fileEncoding="UTF-8-BOM")

In [8]:
# Data cleaning

# Parse the timestamp strings (day/month/year hour:minute) into POSIXct
lines_raw$Time <- as.POSIXct(lines_raw$Time, format = "%d/%m/%Y %H:%M")

# Count the NAs in every column and flag the columns with more than 300 of them
mv <- colSums(is.na(lines_raw))
lines_with_missing_values <- names(mv)[mv > 300]
print('The following lines will be excluded because they have more than 300 missing values')
lines_with_missing_values

# Keep the columns that were not flagged, then drop every row that still
# contains a missing value
lines <- na.omit(lines_raw[, is.na(match(names(lines_raw), lines_with_missing_values))])

[1] "The following lines will be excluded because they have more than 300 missing values"


Introducing lagged values (default is 24h time lag)

In [9]:
lagged <- function(df, lag=24){
    # Build the lagged counterpart of a time-indexed data frame.
    #
    # df  : data frame with a `Time` column that supports adding seconds
    #       (e.g. POSIXct) plus any number of value columns
    # lag : time lag in hours (default 24)
    #
    # Returns a copy of `df` in which every column except `Time` carries the
    # suffix ".lag" and `Time` is moved forward by `lag` hours, so that a join
    # on `Time` pairs each timestamp with the values observed `lag` hours
    # earlier.
    value_cols <- names(df) != "Time"
    names(df)[value_cols] <- paste0(names(df)[value_cols], ".lag")
    df$Time <- df$Time + 60*60 * lag
    df
}

In [10]:
# Build the covariate table: the selected predictor columns joined (inner join
# on Time) with the cleaned line data shifted by the default 24 h lag.
covariates <- merge(predictors[, c('Time', ind_var)], lagged(lines), by = "Time")

# Coerce the predictor columns to numeric BEFORE dropping incomplete rows, so
# that entries which cannot be parsed (turned into NA with a warning) are
# removed as well instead of staying in the table.
# lapply() always yields one list element per column; sapply() may simplify to
# a matrix or a plain vector depending on the number of rows.
covariates[ind_var] <- lapply(covariates[ind_var], as.numeric)
covariates <- na.omit(covariates)

"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"
"NAs introduced by coercion"


# Performing training and estimation

In [None]:
# Train and evaluate every selected method on every tested line.
# The seed is fixed so the train/test split is reproducible.
set.seed(123)
# One row per tested line; a column per method is added as results arrive.
results <- data.frame(row.names = t_lines)

for (line in t_lines) {

    # Attach the target line (un-lagged, taken from the raw data) to the covariates.
    covariates_l <- merge(covariates, lines_raw[, c("Time", line)], by = "Time")
    # Lines that were excluded during cleaning have no lagged column in
    # `covariates`; add it from the raw data in that case.
    if (!(paste0(line, ".lag") %in% names(covariates_l))) {
        covariates_l <- merge(covariates_l, lagged(lines_raw[, c("Time", line)]), by = "Time")
    }
    covariates_l <- na.omit(covariates_l)

    # Select training and test data: 70 % of the rows are drawn for training,
    # the remaining rows form the test set.
    trainingRowIndex <- sample(seq_len(nrow(covariates_l)), 0.7 * nrow(covariates_l))
    # Cap the training set at `size` rows. The index is a plain vector, so its
    # extent is length(), not nrow() (nrow() returns NULL here, which made the
    # cap ineffective and let sample() fail when `size` exceeded the number of
    # available training rows).
    n_train <- min(size, length(trainingRowIndex))
    # Subsample by position so that a length-one index vector is not
    # reinterpreted by sample() as the range 1:x.
    trainingData <- covariates_l[trainingRowIndex[sample.int(length(trainingRowIndex), n_train)], ]
    testData <- covariates_l[-trainingRowIndex, ]

    # Perform training and estimation; each wrapper receives the name of the
    # target line plus the training and test data, and its return value is
    # stored in the cell results[line, method]. Unknown method names are skipped.
    for (method in methods) {
        if (method == "MARS") {
            results[line, method] <- mars(line, trainingData, testData)

        } else if (method == "Linear regression") {
            results[line, method] <- lin_reg(line, trainingData, testData)

        } else if (method == "Random forest") {
            results[line, method] <- ran_forest(line, trainingData, testData)

        } else if (method == "Multiple linear regression") {
            print('Function is not implemented yet')

        } else if (method == "kNN") {
            results[line, method] <- kNN(line, trainingData, testData)

        } else if (method == "Support vector regression") {
            results[line, method] <- SVR(line, trainingData, testData)
        }
    }
}

"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficient fit may be misleading"
"prediction from a rank-deficie

In [None]:
# Display the table filled by the training loop
# (rows: tested lines, columns: methods).
results