#Profiling Pecan Consumers

This notebook profiles the consumers in the Pecan St dataset who have HVAC using a thermal regimes model using whole-home consumption and temperature

##Initializations

In [1]:
rm(list = ls())
options(error = recover)
library('segmented')
library('lubridate')

In [2]:
setwd("~/EnergyAnalytics/batch/pecan/")
source('define_categories_pecan.r')

setwd("~/EnergyAnalytics/utils/")
source('select_data.r')

setwd('~/EnergyAnalytics/thermal_profiles/profiler/')
source('stateProcessorWrapper.r')
source('stateVisualizerWrapper.r')

setwd("~/EnergyAnalytics/batch/pecan/")


Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: R.oo
Loading required package: R.methodsS3
R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.

Attaching package: ‘R.oo’

The following objects are masked from ‘package:methods’:

    getClasses, getMethods

The following objects are masked from ‘package:base’:

    attach, detach, gc, load, save

R.utils v2.0.0 (2015-02-28) successfully loaded. See ?R.utils for help.

Attaching package: ‘R.utils’

The following object is masked from ‘package:utils’:

    timestamp

The following objects are masked from ‘package:base’:


In removeClass("DataFormatter"): class definition for “DataFormatter” not found (no action taken)Loading required package: lmtest

Attaching package: ‘lmtest’

The following object is masked from ‘package:R.utils’:

    reset

Loading requ

In [3]:
DATA_PATH = '~/S3L_server/energy-data/pecan_street/usage-select/'
DUMP_PATH = '~/S3L_server/energy-data/pecan_street/models_new/'
PLOT_PATH = '~/S3L_server/plots/pecan-street-new/'

In [4]:
# load user names
user_names = read.csv('~/S3L_server/energy-data/pecan_street/metadata/user_names_ids.csv')
user_names$X = NULL

# list already processed files
files.input = list.files(path=DUMP_PATH, pattern = '*_decoded*', full.names = T, recursive = T)
already_done  = lapply(files.input, function(x) {
    tmp = tail(strsplit(x, '/')[[1]],n=2)
    res = tmp[1]
    nfo = strsplit(tmp[2], "_")[[1]]
    uid = nfo[1]; nfo = nfo[2]; 
    return(c(uid,res))
})
already_done = data.frame(do.call('rbind', already_done))
if (length(already_done)>0) names(already_done) <- c("ID", "grain")

# list all data files
files    = list.files(path=DATA_PATH, full.names = T, recursive = T)
files_01 = files[grep('01min',files)]
files_15 = files[grep('15min',files)]
files_60 = files[grep('60min',files)]

# extract ID
users_df = data.frame(UID = as.character(sapply(files_60, function(s) strsplit(tail(strsplit(s, '/')[[1]], 1), '\\.')[[1]][1])))
rownames(users_df) = NULL
users_df['file_01min'] = files_01
users_df['file_15min'] = files_15
users_df['file_60min'] = files_60
    
# build data sources dataframe
users_df = merge(user_names, users_df, by.x="ID", by.y="UID")
    
# filter out those IDS already done
users_df = users_df[!(users_df$ID %in% already_done$ID),]

In [5]:
dim(users_df)

In [6]:
head(users_df)

Unnamed: 0,ID,name,file_01min,file_15min,file_60min
108,3044,Lucius,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//01min/3044.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//15min/3044.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//60min/3044.csv
116,3368,Forrest,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//01min/3368.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//15min/3368.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//60min/3368.csv
136,3778,Alexander,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//01min/3778.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//15min/3778.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//60min/3778.csv
138,3829,Kenneth,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//01min/3829.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//15min/3829.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//60min/3829.csv
161,4383,Ward,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//01min/4383.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//15min/4383.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//60min/4383.csv
165,4505,Milton,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//01min/4505.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//15min/4505.csv,/Users/adrianalbert/S3L_server/energy-data/pecan_street/usage-select//60min/4505.csv


##Learn thermal regimes models

In [7]:
# function to format data to work with the R thermal profiler code
format_data = function(homeData) {
    fillInTheBlanks <- function(S) {
        L <- !is.na(S)
        c(S[L][1], S[L])[cumsum(L)+1]
    }
    # some response observations are NA, replace them with last ok observation
    homeData$use = fillInTheBlanks(homeData$use)

    # remove observations with NAs in response
    idx.na = is.na(homeData$use)
    if (sum(idx.na)>0) homeData = homeData[-idx.na,]

    # temperature above reference
    homeData$TemperatureD = homeData$TemperatureF - 65

    # format data as expected by the HMM package
    cur_data = subset(homeData, select = c('date', 'use'))
    names(cur_data)[2] = 'obs'
    cur_data$date = as.character(cur_data$date)
    cur_covar = subset(homeData, select = c('date', 'TemperatureF', 'TemperatureD'))
    cur_covar$date = as.character(cur_covar$date)
    cur_month     = month(cur_data$date)
    cur_covar$TemperatureDWinter = cur_covar$TemperatureD * (cur_month %in% c(0,1,2,3,10,11,12))

    return(list(cur_data, cur_covar))
}


In [8]:
homeData15 = read.csv(users_df[1,"file_15min"])     
homeData60 = read.csv(users_df[1,"file_60min"])  
res = format_data(homeData15)

In [9]:
apply_thermal_model = function(cur_data, cur_covar, userName, 
                               dump_path = NULL, 
                               plot_path = NULL) {
  
    # define model learning controls
    controls = list(
        Kmin = 4, Kmax = 6, 
        maxit = 50, 
        nRestarts = 5, 
        tol = 1e-4,
        thresh.R2 = 0.75, 
        thresh.MAPE = 0.20,
        test.periods = 12,
        vis.interval = 3 * 24
    )

    # generate visualization interval; make sure there's data in there
    # TODO: there was an error generated here (indices for subsetting were messed up)
    ok = FALSE
    no.secs    = controls$vis.interval * 3600
    while (!ok) {
        idx_start  = 1
        idx_end    = max(nrow(cur_data)-controls$vis.interval-1, 1)
        start_date = sample(cur_data$date[idx_start:idx_end], 1)
        stop_date  = as.character(as.POSIXct(start_date) + no.secs)
        dat        = subset(cur_data, date >= start_date & date < stop_date)
        if (nrow(na.omit(dat)) > 0) 
          ok = TRUE
    }        

    # learn model
    res = try(stateProcessorWrapper(cur_data, 
                                    cur_covar, 
                                    userName, 
                                    controls = controls,
                                    train.frac = 1.0, 
                                    verbose = F, 
                                    resp.vars = c('(Intercept)', 
                                                  'TemperatureD'), 
                                                  #'TemperatureDWinter'),
                                    dump_path = dump_path))
    if (class(res) == 'try-error') {
        cat('Error in learning model for current user!\n')
        return(NULL)
    }

    # produce visualizations
    if (!is.null(plot_path)) {
        resv = try(stateVisualizerWrapper(res$decoder, 
                                   res$interpreter, 
                                   plots_path = plot_path, 
                                   interval = c(start_date, stop_date)))
        if (class(resv) == 'try-error') {
            cat('Error in visualizing current user!\n')
        }
    }
#     res_list = list(response=res$decoder@HMM$response, transition=res$decoder@HMM$transition)
#     ts_df = cbind(fit$decoder@data.train, fit$decoder@HMM$states, 
#                   fit=fit$decoder@HMM$fit, fit.avg=fit$decoder@HMM$fit.avg, residual=fit$decoder@HMM$residual)

#     return(list(name=fit$decoder@UID, 
#                info=res_list,
#                ts_data=ts_df))

    return(NULL)
}


In [10]:
f_wrapper <- function(i) {
    # load data   
    user_id = paste(users_df[i,"ID" ], users_df[i,"name"], sep="_")
    cat(paste('Processing user', user_id, ':', i, '/', nrow(users_df), '\n'))  
    homeData15 = read.csv(users_df[i,"file_15min"])     
    homeData60 = read.csv(users_df[i,"file_60min"])  

    # only process those users that have AC
    # if (!('AC' %in% names(homeData60)) return(NULL)

    # is there enough data?
    if (is.null(homeData15) || is.null(homeData60))  {
        cat('Too little data!\n')
        return(NULL)
    }  
    if (nrow(homeData15) < 30*96 || nrow(homeData60) < 30*24) {
        cat('Too little data!\n')
        return(NULL)
    }

    # create directory to store models
    dump_path_15 = file.path(DUMP_PATH, paste(user_id, '15min/', sep='/')); 
    dir.create(dump_path_15, recursive = T)
    dump_path_60 = file.path(DUMP_PATH, paste(user_id, '60min/', sep='/')); 
    dir.create(dump_path_60, recursive = T)

    # create directory to store plots
    plot_path_15 = file.path(PLOT_PATH, paste(user_id, '15min/', sep='/')); 
    dir.create(plot_path_15, recursive = T)
    plot_path_60 = file.path(PLOT_PATH, paste(user_id, '60min/', sep='/')); 
    dir.create(plot_path_60, recursive = T)

    # format datasets
    res = format_data(homeData15); cur_data15 = res[[1]]; cur_covar15 = res[[2]];
    res = format_data(homeData60); cur_data60 = res[[1]]; cur_covar60 = res[[2]];

    # apply model to data
    res = apply_thermal_model(cur_data15, cur_covar15, user_id,
                            dump_path = dump_path_15, 
                            plot_path = plot_path_15)
    res = apply_thermal_model(cur_data60, cur_covar60, user_id, 
                            dump_path = dump_path_60, 
                            plot_path = plot_path_60)  
    return(NULL)
}


In [11]:
# library('parallel')
# res = mclapply(1:nrow(users_df), mc.cores = 3, f_wrapper)


In [None]:
for (i in 1:nrow(users_df)) {
    cat(paste("*****", i, ":", users_df[i,c("name", "ID")], "*****"))
    f_wrapper(i)
}

***** 1 : 348 ***** ***** 1 : 3044 *****Processing user 3044_Lucius : 1 / 182 


In dir.create(plot_path_60, recursive = T): '/Users/adrianalbert/S3L_server/plots/pecan-street-new//3044_Lucius/60min' already exists