Commit

optimized new weighted knn model
bdewilde committed Oct 11, 2012
1 parent cddd3e2 commit af7bde6
Showing 5 changed files with 103 additions and 14 deletions.
knn.R: 7 changes (4 additions & 3 deletions)
@@ -2,13 +2,14 @@
# https://www.kaggle.com/c/digit-recognizer/data
# produce submission file with optimal knn model

# Fast Nearest Neighbor Search Algorithms and Applications
library(FNN)

# load training and test datasets
train <- read.csv("train.csv", header=TRUE)
test <- read.csv("test.csv", header=TRUE)

##########################################################
# Fast Nearest Neighbor Search Algorithms and Applications
library(FNN)

# drop label columns for use in KNN
trainCl <- train[, 1]
train <- train[, -1]
Binary file added knnPerformance_vs_kAndKernel_10kSet.pdf
Binary file not shown.
Binary file added numK_10kSet_noZeroVarCols.pdf
Binary file not shown.
optimizeKKNN.R: 78 changes (78 additions & 0 deletions)
@@ -0,0 +1,78 @@
# Kaggle: Digit Recognizer
# https://www.kaggle.com/c/digit-recognizer/data
# optimize a knn model using the kknn implementation

# Weighted k-Nearest Neighbors
library(kknn)

# load the training data
rawTrainData <- read.csv("train.csv", header=TRUE)

# randomly sample rows of training data
train <- rawTrainData[sample(nrow(rawTrainData)), ]
train <- train[1:10000,]

# optimize knn for k and kernel
# using leave-one-out cross-validation
kMax <- 15
kernels <- c("triangular","rectangular","gaussian")
model_1 <- train.kknn(as.factor(label) ~ ., train, kmax=kMax, kernel=kernels)
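# (a minimal illustrative sketch, not part of the original script: once
#  train.kknn has chosen k and kernel by leave-one-out CV, the same settings
#  could be re-applied to a held-back slice of the sampled rows with kknn();
#  `holdout` and `fitTmp` are hypothetical names)
#holdout <- train[9001:10000, ]
#fitTmp <- kknn(as.factor(label) ~ ., train[1:9000, ], holdout,
#               k=model_1$best.parameters$k, kernel=model_1$best.parameters$kernel)
#mean(as.character(fitted(fitTmp)) != as.character(holdout$label))  # hold-out error rate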

# what about removing pixels with near zero variance? not good predictors...
library(caret)
badCols <- nearZeroVar(train[,-1])
print(paste("Fraction of nearZeroVar columns:", round(length(badCols)/length(train),4)))
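# nearZeroVar() was run on train[,-1], so shift its indices by one to skip
# the label column when dropping those pixels from train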
train <- train[, -(badCols+1)]
model_2 <- train.kknn(as.factor(label) ~ ., train, kmax=kMax, kernel=kernels)

# what about centering and scaling (standardizing)?
# UPDATE: This gives identical results to model_2
#train <- rawTrainData[sample(nrow(rawTrainData)), ]
#train <- train[1:1000,]
#label <- train[, 1]
#centeredPixels <- apply(train[, -1], 2, scale, center=TRUE, scale=TRUE)
#train <- data.frame(label, centeredPixels)
#model_3 <- train.kknn(as.factor(label) ~ ., train, kmax=kMax, kernel=kernels)

#######################################################################
# plot mis-classification rate per k, per kernel, per data modification
plot(1:nrow(model_1$MISCLASS), model_1$MISCLASS[,1], type='n', col='blue', ylim=c(0.0,0.105),
     xlab="Number of Nearest Neighbors", ylab="Fractional Error Rate", main="kNN performance by k and kernel")
# model_1
for(kern in kernels) {
    color <- rainbow(length(kernels))[match(kern, kernels)]
    points(1:nrow(model_1$MISCLASS), model_1$MISCLASS[,kern], type='p', pch=16, col=color)
    lines(predict(loess(model_1$MISCLASS[,kern] ~ c(1:kMax))), col=color, lwd=2, lty="solid")
}
# model_2
for(kern in kernels) {
    color <- rainbow(length(kernels))[match(kern, kernels)]
    points(1:nrow(model_2$MISCLASS), model_2$MISCLASS[,kern], type='p', pch=17, col=color)
    lines(predict(loess(model_2$MISCLASS[,kern] ~ c(1:kMax))), col=color, lwd=2, lty="dotted")
}
# model_3
#for(kern in kernels) {
# color=rainbow(length(kernels))[match(kern, kernels)]
# points(1:nrow(model_3$MISCLASS), model_3$MISCLASS[,kern], type='p', pch=15, col=color)
# lines(predict(loess(model_3$MISCLASS[,kern] ~ c(1:kMax))), col=color, lwd=2, lty="dotted")
#}

# mark the best values of each set of models
model_1_best <- model_1$MISCLASS[model_1$best.parameters$k, model_1$best.parameters$kernel]
model_2_best <- model_2$MISCLASS[model_2$best.parameters$k, model_2$best.parameters$kernel]
points(model_1$best.parameters$k, model_1_best, pch=16, col="black")
points(model_2$best.parameters$k, model_2_best, pch=17, col="black")

legend("bottomright", ncol=2, legend=c(kernels, paste(kernels,"(red. data)")),
col=rep(rainbow(length(kernels)),2), pch=c(rep(16,3), rep(17,3)),
lwd=2, lty=c(rep("solid",3), rep("dotted",3)),
bty="n", y.intersp=1.5, inset=0.01, cex=0.8)


# print out the best parameters
print(paste("Best model_1 parameters:", "kernel =", model_1$best.parameters$kernel, ", k =", model_1$best.parameters$k))
print(model_1$MISCLASS)
print(paste("Best model_2 parameters:", "kernel =", model_2$best.parameters$kernel, ", k =", model_2$best.parameters$k))
print(model_2$MISCLASS)
#print(paste("Best model_3 parameters:", "kernel =", model_3$best.parameters$kernel, ", k =", model_3$best.parameters$k))
#print(model_3$MISCLASS)
optimizeKNN.R: 32 changes (21 additions & 11 deletions)
@@ -1,18 +1,18 @@
# Kaggle: Digit Recognizer
# https://www.kaggle.com/c/digit-recognizer/data
# optimize a knn model using the FNN implementation

#library(class) # "recommended" k-Nearest Neighbour Classification: TOO SLOW
library(FNN) # Fast Nearest Neighbor Search Algorithms and Applications
# Fast Nearest Neighbor Search Algorithms and Applications
library(FNN)

# load the data
rawTrainData <- read.csv("train.csv", header=TRUE)#[1:25000,]
#test <- read.csv("test.csv", header=TRUE)[1:1000,]
rawTrainData <- read.csv("train.csv", header=TRUE)[1:1000,]

# randomly permute row order of training dataset
train <- rawTrainData[sample(nrow(rawTrainData)), ]
# then split up into training and cross-validation sets
cv <- train[(1+(0.6*nrow(train))):nrow(train), ] # 2 needed instead of 1 to avoid overlap row?
train <- train[1:(0.6*nrow(train)), ]
# randomly sample rows of training data
indices <- sample(1:nrow(rawTrainData), trunc(0.6*nrow(rawTrainData)))
# assign training and cross-validation sets to orthogonal sub-sets
train <- rawTrainData[indices, ]
cv <- rawTrainData[-indices, ]
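# sample() draws without replacement, so `indices` and `-indices` select
# disjoint row sets: roughly a 60/40 train/cross-validation split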

# drop label columns for use in KNN
trainCl <- train[, 1]
@@ -26,12 +26,22 @@ badCols <- nearZeroVar(train)
print(paste("Fraction of nearZeroVar columns:", length(badCols)/length(train)))
train <- train[, -badCols]
cv <- cv[, -badCols]


# what about centering and scaling? i.e. standardizing
trainColMeans <- apply(train, 2, mean)
trainColSD <- apply(train, 2, sd)
train <- apply(train, 2, scale, center=TRUE, scale=TRUE)
#cv <- apply(cv, 2, scale, center=trainColMeans, scale=trainColSD)
#train <- apply(train, 2, scale, center=TRUE, scale=TRUE)
#cv <- apply(cv, 2, scale, center=TRUE, scale=TRUE)
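# standardize the CV set with the *training* column means and SDs, so it gets
# exactly the same transformation that was applied to the training set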
cv <- sweep(cv, 2, trainColMeans, FUN="-")
cv <- sweep(cv, 2, trainColSD, FUN="/")

# fit knn model to training set
# get knn predictions for training set
# compute fractional training error
# for numK values of k
numK <- 6
numK <- 10
trainErrs <- numeric(numK)
for(i in 1:numK) {
    print(paste("Train:", i))
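# (the rest of this loop is collapsed in the diff above; a minimal sketch of a
#  per-k train/CV error computation with FNN::knn, assuming a cvCl label vector
#  and a cvErrs vector analogous to trainCl/trainErrs, might look like this)
#cvErrs <- numeric(numK)
#for(i in 1:numK) {
#    trainPreds <- knn(train, train, cl=trainCl, k=i)
#    trainErrs[i] <- mean(as.character(trainPreds) != trainCl)
#    cvPreds <- knn(train, cv, cl=trainCl, k=i)
#    cvErrs[i] <- mean(as.character(cvPreds) != cvCl)
#}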
