From af7bde6a8d7ab6007c098f6e7f8ac8159bd20426 Mon Sep 17 00:00:00 2001
From: Burton DeWilde
Date: Thu, 11 Oct 2012 16:23:07 -0400
Subject: [PATCH] optimized new weighted knn model

---
 knn.R                                   |  7 ++-
 knnPerformance_vs_kAndKernel_10kSet.pdf | Bin 0 -> 6700 bytes
 numK_10kSet_noZeroVarCols.pdf           | Bin 0 -> 5162 bytes
 optimizeKKNN.R                          | 78 ++++++++++++++++++++++++
 optimizeKNN.R                           | 32 ++++++----
 5 files changed, 103 insertions(+), 14 deletions(-)
 create mode 100644 knnPerformance_vs_kAndKernel_10kSet.pdf
 create mode 100644 numK_10kSet_noZeroVarCols.pdf
 create mode 100644 optimizeKKNN.R

diff --git a/knn.R b/knn.R
index c068170..4838b28 100644
--- a/knn.R
+++ b/knn.R
@@ -2,13 +2,14 @@
 # https://www.kaggle.com/c/digit-recognizer/data
 # produce submission file with optimal knn model
 
-# Fast Nearest Neighbor Search Algorithms and Applications
-library(FNN)
-
 # load training and test datasets
 train <- read.csv("train.csv", header=TRUE)
 test <- read.csv("test.csv", header=TRUE)
 
+##########################################################
+# Fast Nearest Neighbor Search Algorithms and Applications
+library(FNN)
+
 # drop label columns for use in KNN
 trainCl <- train[, 1]
 train <- train[, -1]
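For context on the knn.R change above: the file sets up train, test, and trainCl for FNN, and its header says it produces a submission file. A minimal sketch of how that step typically looks, assuming the objects defined in knn.R; the choice of k=5 and the output file name are illustrative assumptions, not part of this patch:

# sketch only (not part of the patch): classify the test set with FNN::knn
# and write a Kaggle-style submission; k=5 and the file name are assumptions
library(FNN)
pred <- knn(train, test, cl=trainCl, k=5)
submission <- data.frame(ImageId=1:nrow(test), Label=pred)
write.csv(submission, "knn_submission.csv", row.names=FALSE)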
diff --git a/knnPerformance_vs_kAndKernel_10kSet.pdf b/knnPerformance_vs_kAndKernel_10kSet.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f614dc0327caa512a1578bd81a1a356901de97f9
GIT binary patch
[binary PDF data omitted: plot of kNN error rate vs. k and kernel, 10k-row sample]

diff --git a/numK_10kSet_noZeroVarCols.pdf b/numK_10kSet_noZeroVarCols.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..23e7cc8a73b21d901946a2447f1771638ba2d619
GIT binary patch
[binary PDF data omitted: plot of error rate vs. number of neighbors, 10k-row sample with zero-variance columns removed]
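The new optimizeKKNN.R file below is built around kknn::train.kknn, which runs leave-one-out cross-validation over k = 1..kmax and over each requested kernel, and stores the resulting error matrix and best parameters on the fitted object. A minimal self-contained sketch of that interface on toy data (iris stands in for the digit data here):

# sketch only: the train.kknn leave-one-out CV interface, on toy data
library(kknn)
fit <- train.kknn(Species ~ ., iris, kmax=15,
                  kernel=c("triangular", "rectangular", "gaussian"))
print(fit$best.parameters)  # list: $kernel and $k with the lowest LOO error
print(fit$MISCLASS)         # kmax-by-kernel matrix of misclassification rates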
zqyS&uw-V<|6yb20y8rS9yawL@2F}H{MNmPfpT0I$-_d0m{%^PQ9&TJn!aC~MZ zdwaO9TL{al?(NL)*nqiowKN7&*Hx8F2rAHf($YRm{jzm5J8oHW(>JEXPJuP)x_I#1 zF}>CT6Tl>++^Iz-!OOfRfR0w7?qWpZ z2GMGp6eC;Re|f`crL;XOLT(oJ{K$@&Q!P70tl}7eZi{&7Df<+$_uM?X0v6)uHp}Sh z7;=dmnY?CoL;P0kL=IuxX&k}WUCpsgx8jZk(;O_u=BCLPHKQOpMccu6v2Eh z;*r-W68Vx<35KbKEl$F`$BrQ_OwMh2CMj}iI5#85Hb*7zmgOdk%o~TgPjn-Gg)In} z>igujy=WijV90C0GZPi{?XF$v*fU->E-q=HX z1#e2i7TmSJqtPoqnXs*qR^mZI-iHP62o?5eX_<@5)X7*H5~Nk+BBiB#rKV&v_rBd*3#32Q_vEK(IhefDA@9$Y z8jyWxdF_DxfnN}_!nOyy5sU-QmH`Fvc4(V<8|OP0i|um)^!yAz-4t|vYIwTniKXLp zn>>cihuF@0s1tqq>Ne`Sfn~JB&pf{y2+KFd594RrgC>^cUO1CP5+w%T}Ox)i7`I!fBpDE?e-_e zcHiVy@#(8+>TzRldSftc=zVBA@98e_j;p<$y`Q_Db>8aK$Xl?v^4nBS=B--Y)tJTn z?I)T|4X%~xj^(%GVqp#GsqHsB=aB^r4D%~frh1_|x>`A?DabPDRM1q=(mdaa{8#C( zdsb>!ta&wfrFeOGi+C%xeTtQD5N>dZ{Tf?*Ch3evlFBY7TpMMpyj`hXm5izL(NXbJ zbs%)Q)7pe_&rqhyZ+(OMwyKn=8Y6Eb#ZYl=d!F-**0jYbbnGVv|&46JVL&>+xv+(u?dFrC1m>R zXa=emozxhe@9nF1_!#o)i>PRU@nz%0*1W)`x#hXtV`r)+s%XxWF4xIhEA%RA$yWX@ z{>j17tK9sd{85*iWqG;wy6!FBUi^~ewP>^`M|slEs(BLJ5jQOKyY&=( zz>cNbSJC7DjP;IkYCtahUU2;8a>t8@5!LQbqcV7k z_tx%h-8-XqN$+@WP@YJhQ0|pHx8{K6bIpdhA`iPJ*MzQkp(o5{+(_QYmEl6s75O(+ zRIU0$Ich^Idq<81CMwGJ+Fz|(UKVVWxhi8I_~JD3w8H5b^_;7`?em=x*UOCyD@oIe z(?VIBD4Qs0Gta;=bLp!6=Vi5xPMW^U(z-O9WtIBb=Fv`@Em?&tg}vA7%(j^JXD`ZW z@2}P7zL0v+&&E)ofNvF5t~`f|D|=p+P?q0%^}>|1iVRE;gV*g#EUjro_p!Pop6Hdj zqArhQSf|hJSJhZD%%Wu}QQWKgKAX16eh!^`=o;x*@AAUwnp4SrcgFRxXDcT|SUby##1i9DV@J4ip9|iiynG^*X|!N?#?Cajh%i;~ z`ejXEP4UN1uexjtPuMvwk&E4D+JYXgwDac0CWKxLRbQ1*(N|pzZ(FIZI-Ym)-iL;; zp|B5;@!X+p$!%>4mlg0u)Z6*Y&+d=wTL%KdyWTdDZc_>?E=X|2NykZqUs;}7XgC@d ze6}R{UZ~fyi|?lLynLYv=XdHo=R0Mlk|!k&?GCtC(R{sdB)D~ENwQ1ZmhSv}Kp)@Z z)44*~W~13x51nUDeE9Ib3s-9#G;{T|HoXXZK$#_V^v#Whurfa^eLNiaZei{2mUhu( zS5KFZ)$ge{>5sy$jci(#)ZpF2s~IOAR}E^qOvAHOLxvMp&v)6?7H8G$q)v6VtfdZ3 zTE3*bGpv-VeOmKn+;*jCtb$Hvy>Xz6hE}}K>a0yzy}TqAhW>ijWdhJXG@-3A>s}wN ziF);p*1GmKaBU+v(<1lJ1>EW_!83X{I;H5dJbl&fa$&Kx{^ zF>L(HuK7>Xg-PbwVcbhGbE`j1;OaldP}rY)fn#JygA>WnVt_ri{&lh=l8=(<#P0wM z)FToIcxV~GR{Kr>8Uy+g$xJ$+t^)coDGWRbPhh~1Xf)_affg8aIuQW9XxNj`jEluF z*`qFwNW(FG2_$>~05XUqPYCkG;%F2y0D92yY(vNsi^DFmkf>x$27!~ zPU0Cv92N$C?=`^wN`GI@YzUP9BLO5N*dOzcqJJb%(4mk#|3{Vp8lC`C0+7lur5_id zg25LqWQUeF5Yj7*YjNVgJOC%F56N=wBH6XI~@+11Y0_*7T*2K`gbXcZJB+5Q<1t%`z#;CGBc!xBk&8nn;@Kx<+U9_kAK9w!qh ikSyI0j#d;31NfmF=?pB5vC%=a8VXts${aJ-hy4dV8oosU literal 0 HcmV?d00001 diff --git a/optimizeKKNN.R b/optimizeKKNN.R new file mode 100644 index 0000000..c83e6d8 --- /dev/null +++ b/optimizeKKNN.R @@ -0,0 +1,78 @@ +# Kaggle: Digit Recognizer +# https://www.kaggle.com/c/digit-recognizer/data +# optimize a knn model using the kknn implementation + +# Weighted k-Nearest Neighbors +library(kknn) + +# load the training data +rawTrainData <- read.csv("train.csv", header=TRUE) + +# randomly sample rows of training data +train <- rawTrainData[sample(nrow(rawTrainData)), ] +train <- train[1:10000,] + +# optimize knn for k and kernel +# using leave-one-out cross-validation +kMax <- 15 +kernels <- c("triangular","rectangular","gaussian") +model_1 <- train.kknn(as.factor(label) ~ ., train, kmax=kMax, kernel=kernels) + +# what about removing pixels with near zero variance? not good predictors... +library(caret) +badCols <- nearZeroVar(train[,-1]) +print(paste("Fraction of nearZeroVar columns:", round(length(badCols)/length(train),4))) +train <- train[, -(badCols+1)] +model_2 <- train.kknn(as.factor(label) ~ ., train, kmax=kMax, kernel=kernels) + +# what about centering and scaling (standardizing)? 
+# UPDATE: This gives identical results to model_2
+#train <- rawTrainData[sample(nrow(rawTrainData)), ]
+#train <- train[1:1000,]
+#label <- train[, 1]
+#centeredPixels <- apply(train[, -1], 2, scale, center=TRUE, scale=TRUE)
+#train <- data.frame(label, centeredPixels)
+#model_3 <- train.kknn(as.factor(label) ~ ., train, kmax=kMax, kernel=kernels)
+
+#######################################################################
+# plot mis-classification rate per k, per kernel, per data modification
+plot(1:nrow(model_1$MISCLASS), model_1$MISCLASS[, 1], type='n', ylim=c(0.0, 0.105),
+     xlab="Number of Nearest Neighbors", ylab="Fractional Error Rate",
+     main="kNN performance by k and kernel")
+# model_1: all pixel columns, solid lines
+for(kern in kernels) {
+  color <- rainbow(length(kernels))[match(kern, kernels)]
+  points(1:nrow(model_1$MISCLASS), model_1$MISCLASS[, kern], type='p', pch=16, col=color)
+  lines(predict(loess(model_1$MISCLASS[, kern] ~ c(1:kMax))), col=color, lwd=2, lty="solid")
+}
+# model_2: near-zero-variance columns removed, dotted lines
+for(kern in kernels) {
+  color <- rainbow(length(kernels))[match(kern, kernels)]
+  points(1:nrow(model_2$MISCLASS), model_2$MISCLASS[, kern], type='p', pch=17, col=color)
+  lines(predict(loess(model_2$MISCLASS[, kern] ~ c(1:kMax))), col=color, lwd=2, lty="dotted")
+}
+# model_3
+#for(kern in kernels) {
+#  color <- rainbow(length(kernels))[match(kern, kernels)]
+#  points(1:nrow(model_3$MISCLASS), model_3$MISCLASS[, kern], type='p', pch=15, col=color)
+#  lines(predict(loess(model_3$MISCLASS[, kern] ~ c(1:kMax))), col=color, lwd=2, lty="dotted")
+#}
+
+# mark the best (k, kernel) of each model in black
+model_1_best <- model_1$MISCLASS[model_1$best.parameters$k, model_1$best.parameters$kernel]
+model_2_best <- model_2$MISCLASS[model_2$best.parameters$k, model_2$best.parameters$kernel]
+points(model_1$best.parameters$k, model_1_best, pch=16, col="black")
+points(model_2$best.parameters$k, model_2_best, pch=17, col="black")
+
+legend("bottomright", ncol=2, legend=c(kernels, paste(kernels, "(reduced data)")),
+       col=rep(rainbow(length(kernels)), 2), pch=c(rep(16,3), rep(17,3)),
+       lwd=2, lty=c(rep("solid",3), rep("dotted",3)),
+       bty="n", y.intersp=1.5, inset=0.01, cex=0.8)
+
+# print out the best parameters and full error matrices
+print(paste("Best model_1 parameters:", "kernel =", model_1$best.parameters$kernel, ", k =", model_1$best.parameters$k))
+print(model_1$MISCLASS)
+print(paste("Best model_2 parameters:", "kernel =", model_2$best.parameters$kernel, ", k =", model_2$best.parameters$k))
+print(model_2$MISCLASS)
+#print(paste("Best model_3 parameters:", "kernel =", model_3$best.parameters$kernel, ", k =", model_3$best.parameters$k))
+#print(model_3$MISCLASS)
\ No newline at end of file
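The optimizeKNN.R changes below standardize the cross-validation set with the training set's column means and standard deviations via sweep(), rather than with the cv set's own statistics, so no information leaks from the held-out rows. A self-contained sketch of that pattern on toy matrices (all names here are illustrative):

# sketch only: standardize held-out data using training-set statistics
set.seed(1)
trainX <- matrix(rnorm(40), nrow=10)  # 10 observations, 4 features
cvX    <- matrix(rnorm(12), nrow=3)   # held-out observations, same features
mu   <- apply(trainX, 2, mean)
sdev <- apply(trainX, 2, sd)
trainX <- sweep(sweep(trainX, 2, mu, FUN="-"), 2, sdev, FUN="/")
cvX    <- sweep(sweep(cvX, 2, mu, FUN="-"), 2, sdev, FUN="/")  # same mu/sdev: no leakage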
diff --git a/optimizeKNN.R b/optimizeKNN.R
index 645ebaf..a847863 100644
--- a/optimizeKNN.R
+++ b/optimizeKNN.R
@@ -1,18 +1,18 @@
 # Kaggle: Digit Recognizer
 # https://www.kaggle.com/c/digit-recognizer/data
+# optimize a knn model using the FNN implementation
 
-#library(class) # "recommended" k-Nearest Neighbour Classification: TOO SLOW
-library(FNN) # Fast Nearest Neighbor Search Algorithms and Applications
+# Fast Nearest Neighbor Search Algorithms and Applications
+library(FNN)
 
 # load the data
-rawTrainData <- read.csv("train.csv", header=TRUE)#[1:25000,]
-#test <- read.csv("test.csv", header=TRUE)[1:1000,]
+rawTrainData <- read.csv("train.csv", header=TRUE)[1:1000,]
 
-# randomly permute row order of training dataset
-train <- rawTrainData[sample(nrow(rawTrainData)), ]
-# then split up into training and cross-validation sets
-cv <- train[(1+(0.6*nrow(train))):nrow(train), ] # 2 needed instead of 1 to avoid overlap row?
-train <- train[1:(0.6*nrow(train)), ]
+# randomly sample 60% of the rows for training
+indices <- sample(1:nrow(rawTrainData), trunc(0.6*nrow(rawTrainData)))
+# assign training and cross-validation sets to disjoint subsets
+train <- rawTrainData[indices, ]
+cv <- rawTrainData[-indices, ]
 
 # drop label columns for use in KNN
 trainCl <- train[, 1]
@@ -26,12 +26,22 @@ badCols <- nearZeroVar(train)
 print(paste("Fraction of nearZeroVar columns:", length(badCols)/length(train)))
 train <- train[, -badCols]
 cv <- cv[, -badCols]
-
+
+# what about centering and scaling? i.e. standardizing
+# NB: cv must be standardized with the *training* set's statistics
+trainColMeans <- apply(train, 2, mean)
+trainColSD <- apply(train, 2, sd)
+train <- apply(train, 2, scale, center=TRUE, scale=TRUE)
+#cv <- apply(cv, 2, scale, center=trainColMeans, scale=trainColSD)  # errors: scale() gets the full vectors for each single column
+#cv <- apply(cv, 2, scale, center=TRUE, scale=TRUE)  # wrong: would use cv's own statistics
+cv <- sweep(cv, 2, trainColMeans, FUN="-")
+cv <- sweep(cv, 2, trainColSD, FUN="/")
+
 # fit knn model to training set
 # get knn predictions for training set
 # compute fractional training error
 # for numK values of k
-numK <- 6
+numK <- 10
 trainErrs <- numeric(numK)
 for(i in 1:numK) {
   print(paste("Train:", i))