-
Notifications
You must be signed in to change notification settings - Fork 28
/
prepare_data.R
69 lines (54 loc) · 2.39 KB
/
prepare_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Burak Himmetoglu (burakhmmtgl@gmail.com)
# 03-05-2017
# Human Activity Recognition dataset from UCI
## Prepare data for analysis ##
# Make sure to download the data from
# https://archive.ics.uci.edu/ml/machine-learning-databases/00240/
# Function for calculating multi-class logloss.
mlogloss <- function(preds, actual){
  # Multi-class logarithmic loss, averaged over the rows of preds.
  #
  # preds is a matrix of N x n_class (predicted class probabilities)
  # actual is a matrix of N x n_class (one-hot encoded)
  # Returns a single number: -sum(actual * log(preds)) / N
  #
  # Coerce once so that a data frame of probabilities is accepted too.
  preds <- as.matrix(preds)
  # Clip probabilities into [eps, 1 - eps] so that log() never yields -Inf.
  # pmin/pmax are vectorized and keep the matrix dimensions, so no
  # per-element apply() is needed.
  eps <- 10^(-15)
  preds <- pmax(pmin(preds, 1 - eps), eps)
  score <- -sum(actual * log(preds)) / nrow(preds)
  # Return
  score
}
# Libraries
library(dplyr)
library(readr)
library(caret)
# Feature names: features.txt is read as two space-delimited character columns
feat_names <- read_delim("./data/HARdata/features.txt", delim = " ",
                         col_names = c("code", "feature"), col_types = "cc")
# Prefix the code with "f" so it matches the column names given to X below
feat_names <- mutate(feat_names, code = paste0("f", code))
# Number of features
n_feats <- nrow(feat_names)
# Column names f1, f2, ..., shared by the train and test feature tables.
# seq_len() yields an empty vector (not c(1, 0)) if no features were read.
x_col_names <- paste0("f", seq_len(n_feats))
# Load training data
X_train <- read_table("./data/HARdata/train/X_train.txt", col_names = x_col_names)
y_train <- read_table("./data/HARdata/train/y_train.txt", col_names = c("label"))
# Load testing data
X_test <- read_table("./data/HARdata/test/X_test.txt", col_names = x_col_names)
y_test <- read_table("./data/HARdata/test/y_test.txt", col_names = c("label"))
# Function for rescaling and centering features
standardize <- function(x){
  # Center x to mean 0 and rescale it to unit standard deviation.
  #
  # x is a numeric vector; the result has the same length.
  # NA values are not removed, so any NA in x makes the whole result NA.
  centered <- x - mean(x)
  spread <- sd(x)
  # A constant vector has sd == 0; dividing would turn every entry into NaN.
  # Return the centered values (all zeros) instead. isTRUE() keeps the
  # NA case (NA input, or length-1 input) on the ordinary path below.
  if (isTRUE(spread == 0)) {
    return(centered)
  }
  centered / spread
}
# Combine train and test (flag by is.train to split back later)
X_train <- mutate(X_train, is.train = 1)
X_test <- mutate(X_test, is.train = 0)
X_full <- rbind(X_train, X_test)
# Standardize every column except `is.train`, which is just a flag.
# The flag is excluded by name rather than by column position, and the
# function is passed to mutate_all() directly because funs() is deprecated.
# NOTE(review): mean and sd are computed over train and test rows together --
# confirm that sharing test-set statistics is intended.
std_cols <- setdiff(colnames(X_full), "is.train")
X_full[, std_cols] <- X_full[, std_cols] %>% mutate_all(standardize)
# From the exploratory analysis, we know that there are duplicate columns. Remove them below
# (unique() with MARGIN = 2 drops columns that repeat an earlier column)
X_full <- as.data.frame(unique(as.matrix(X_full), MARGIN = 2))
n_feats <- ncol(X_full) # New number of features +1 (from is.train)
# New feature names: drop the codes of the columns removed as duplicates
rm.cols <- setdiff(colnames(X_train), colnames(X_full))
feat_names <- feat_names %>% filter(!(code %in% rm.cols))
# In certain cases, we will also need the principal components.
# Keep principal components that explain 99% of the variance
pca_cols <- setdiff(colnames(X_full), "is.train")
obj.pca <- preProcess(X_full[, pca_cols, drop = FALSE], method = "pca", thresh = 0.99)
X_pca <- predict(obj.pca, X_full[, pca_cols, drop = FALSE]) # 185 columns left
# Re-attach the flag so the components can be split into train and test later
X_pca <- mutate(X_pca, is.train = X_full$is.train)
# Remove unnecessary variables
rm(X_train, X_test); gc()