In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#4 - Portland houses
df_full = pd.read_table("portland_houses.txt", sep = ",", names = ["Area","Bedrooms","Price"])
df = df_full[["Area","Price"]]


In [3]:
#4a - normal distribution model + sampling

areas = df['Area']

B_0 = 90000
B_1 = 140
sigma = 65000

def draw_Y(xin):
    eps = np.random.normal(loc=0.0,scale=sigma)
    return B_0 + B_1*xin + eps

Num_samples = 1000
all_samples = []
for i in range(0,Num_samples):
    sample_ys = []
    for area in areas:
        sample_ys.append(draw_Y(area))
    all_samples.append(sample_ys)
    

In [4]:
#4b S_xx

Sxx = 0.
area_mean = np.mean(areas)
for area in areas:
    Sxx += (area - area_mean)**2
Sxx

29051384.212765954

In [5]:
#4c - estimate parameters for each sample

sample_B0s = []
sample_B1s = []

to_fit = np.transpose(np.asarray([np.asarray(areas), np.ones(len(areas))]))

for i in range(0,Num_samples):
    outs = np.linalg.lstsq(to_fit,np.asarray(all_samples[i]).reshape(-1,1))[0]
    sample_B0s.append(outs[1][0])
    sample_B1s.append(outs[0][0])
    

In [6]:
#4e - get sum of square of residues for each sample

SSRs = []

for i in range(0, Num_samples):
    currB0 = sample_B0s[i]
    currB1 = sample_B1s[i]
    currSSR = 0.
    for j in range(0,len(areas)):
        currSSR += (all_samples[i][j] - (currB0 + currB1*areas[j]))**2
    SSRs.append(currSSR)

    
    
SSR_scaled = []
for SSR in SSRs:
    SSR_scaled.append(SSR*1./sigma**2)

np.histogram(SSRs)    

(array([  4,  22, 134, 245, 282, 184,  87,  31,   8,   3], dtype=int64),
 array([  6.42082954e+10,   9.28812093e+10,   1.21554123e+11,
          1.50227037e+11,   1.78899951e+11,   2.07572865e+11,
          2.36245779e+11,   2.64918693e+11,   2.93591607e+11,
          3.22264521e+11,   3.50937434e+11]))

In [7]:
#4f - quantile-quantile plot
from scipy.stats import probplot, chi2

B0_theo_mean = B_0
B0_theo_scale = ((np.dot(areas,areas)*sigma**2)*1./(Sxx*len(areas)))**.5
B0arr = np.asarray(sample_B0s)

print("B_0: Predicted, Experimental (N = " + str(Num_samples) + ")")
print("Mean: " + str(B0_theo_mean) + ", " + str(np.mean(B0arr)))
print("Stdev: " + str(B0_theo_scale) + ", " + str(np.var(B0arr)**.5))



B1_theo_mean = B_1
B1_theo_scale = sigma/((Sxx)**.5)
B1arr = np.asarray(sample_B1s)

print("\nB_1: Predicted, Experimental (N = " + str(Num_samples) + ")")
print("Mean: " + str(B1_theo_mean) + ", " + str(np.mean(B1arr)))
print("Stdev: " + str(B1_theo_scale) + ", " + str(np.var(B1arr)**.5))


SSRarr = np.asarray(SSR_scaled)

print("\nSSR: Predicted, Experimental (N = " + str(Num_samples) + ")")
print("Mean: " + str(len(areas)-2) + ", " + str(np.mean(SSRarr)))
print("Variance: " + str((len(areas)-2)*2) + ", " + str(np.var(SSRarr)))


## non working code to make plots fancier
# plt.title("Scaled B_0 normal probability plot - N = " + str(Num_samples))
# plt.text("Test", 1,1)
plt.clf()
probplot((B0arr-B0_theo_mean)/B0_theo_scale, dist = 'norm', plot = plt, fit = False)
# plt.savefig("B0-probplot-N" + str(Num_samples))
plt.show()

plt.clf()
probplot((B1arr-B1_theo_mean)/B1_theo_scale, plot = plt, fit = False)
# plt.savefig("B1-probplot-N" + str(Num_samples))
plt.show()

plt.clf()
probplot(SSRarr, dist = chi2, sparams = (len(areas)-2.), plot = plt)
# plt.savefig("SSR-QQplot-N" + str(Num_samples))
plt.show()




B_0: Predicted, Experimental (N = 1000)
Mean: 90000, 90263.1512008
Stdev: 25923.3028248, 27150.281981

B_1: Predicted, Experimental (N = 1000)
Mean: 140, 139.84693586
Stdev: 12.0595177647, 12.6396017075

SSR: Predicted, Experimental (N = 1000)
Mean: 45, 45.0829917381
Variance: 90, 92.3086183572


In [8]:
# B_0: Predicted, Experimental (N = 200)
# Mean: 90000, 91214.275027
# Stdev: 25923.3028248, 26216.1848178

# B_1: Predicted, Experimental (N = 200)
# Mean: 140, 139.390008812
# Stdev: 12.0595177647, 12.3368098799

# SSR: Predicted, Experimental (N = 200)
# Mean: 45, 45.0307825641
# Variance: 90, 93.4425402582


# B_0: Predicted, Experimental (N = 1000)
# Mean: 90000, 88526.303228
# Stdev: 25923.3028248, 25365.1801909

# B_1: Predicted, Experimental (N = 1000)
# Mean: 140, 140.577950748
# Stdev: 12.0595177647, 11.7795330282

# SSR: Predicted, Experimental (N = 1000)
# Mean: 45, 44.7957608198
# Variance: 90, 87.3393505881

In [9]:
#5 - Arrhythmia data

#5a - load and clean dataset

arr_names = []
for i in range(0,279):
    arr_names.append("Input_" + str(i))
    
arr_names.append("Output")

dfa = pd.read_table("arrhythmia.data", sep = ",", header = None, names=arr_names, na_values = ["?"])

dfa[:3]

Unnamed: 0,Input_0,Input_1,Input_2,Input_3,Input_4,Input_5,Input_6,Input_7,Input_8,Input_9,...,Input_270,Input_271,Input_272,Input_273,Input_274,Input_275,Input_276,Input_277,Input_278,Output
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,8
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,6
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,10


In [10]:
# for col in df_arr.columns:
#     print(df_arr[col].dtype)

print(dfa['Input_13'][3])
outcount = 0
for i in range(0,len(dfa['Output'])):
    currn = dfa['Output'][i]
    outcount += 1
    if(np.isnan(currn)):
        print(i)
print(outcount)

nan
452


In [11]:
from sklearn.preprocessing import Imputer
#impute mean for integer, float; impute mode otherwise


def categorical_to_int(cat_col):
    int_col = []
    catmap = {}
    currind = 0
    for val in cat_col:
        if val not in catmap.keys():
            catmap[val] = currind
            int_col.append(currind)
            currind += 1
        else:
            int_col.append(catmap[val])
    return int_col

def imputed_version(dfin, excl_cols = []):
    df = dfin.copy()
    
    copyimputed = False
    imp_mean = Imputer(missing_values='NaN', strategy='mean', axis=0,copy=copyimputed)
    imp_mode = Imputer(missing_values='NaN', strategy='most_frequent', axis=0, copy=copyimputed )

    categorical_vars = []
    #will use this list to make dummies for all categorical variables

    for col in df.columns:
        if(col in excl_cols):
            continue
        elif(df[col].dtype == np.float64 or  df[col].dtype == np.int64):
            df[col] = imp_mean.fit_transform(df[col].reshape(-1,1))    
        else:
            categorical_vars.append(col)
            #need to convert to integer column for Imputer to work
            df[col] = categorical_to_int(df[col])
            df[col] = imp_mode.fit_transform(df[col].reshape(-1,1))  
    return df


In [12]:
dfa = imputed_version(dfa, ['Output'])
# print(dfa['Input_13'][3])
# print(np.mean(dfa['Input_13'][:]))

X = np.asarray(dfa[arr_names[:279]])
y = np.asarray(dfa['Output'])

In [13]:
X.shape

(452, 279)

In [14]:
#5b - grid search on knn classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

knc = KNeighborsClassifier(n_jobs = -1)




In [15]:
# n_neighbors = [3,10,30,100]
#best pair was 3, distance
n_neighbors = [3,4,5,7]
#best pair was still 3, distance, with a score of 0.494461222136

weights = ['uniform','distance']




search_grid = {'n_neighbors':n_neighbors, 'weights': weights}


gscv = GridSearchCV(knc,search_grid, cv=2, scoring='f1_weighted')
gscv.fit(X,y)
print(gscv.best_estimator_)
print(gscv.best_score_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='distance')
0.494461222136


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
#5b continued - learning curve for optimal predictor
from sklearn.learning_curve import learning_curve


knc_o = KNeighborsClassifier(n_neighbors = 3, weights = 'distance')
lc_sizes, train_scores, test_scores = learning_curve(knc_o, X, y, cv = 2, scoring = 'f1_weighted')




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [17]:
train_avg = np.average(train_scores,axis=1)
test_avg = np.average(test_scores,axis=1)



plt.plot(lc_sizes, train_avg, 'go')
plt.plot(lc_sizes, test_avg, 'ro')
plt.title("Arrhythmia - kNN Learning Curve - f1_weighted scoring - all variables")
plt.savefig("knn_learningcurve_allvars")

#we can see that the training error is essentially zero, while test error is quite large
#thus, most error comes from overfitting to the training set, or "variance"

In [18]:
#5c - grid search for logistic regression

from sklearn.linear_model import LogisticRegression
lrg = LogisticRegression()

# C = [.1,1.,3.,10.,30.,100.]
#best was .1, l1

# C = [.001,.01,.1,.2,.3]
#best was .001, l2, with f1 weighted score of .65388

# C = [.0003,.001,.003]
#best was still .001, l2

C = [.0005,.0008,.0012,.0015]
#best was .0005, l2 with score of .65693
# know best local C for l2 is between .0003, .001, and marginal value of tweaking C in this range does not seem high 
penalty = ['l1','l2']



# #check - compare C over broad range to be sure previous inferences were correct
# C = [.0001,.0005,1.,10.,30.]
# penalty = ['l2']





search_gridL = {'C':C, 'penalty': penalty}


gscvL = GridSearchCV(lrg,search_gridL, cv=2, scoring='f1_weighted')
gscvL.fit(X,y)
print(gscvL.best_estimator_)
print(gscvL.best_score_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


LogisticRegression(C=0.0005, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.656931022422


In [19]:
lrg_o = LogisticRegression(C = .0005, penalty = 'l2')

lc_sizes, train_scores, test_scores = learning_curve(lrg_o, X, y, cv = 2, scoring = 'f1_weighted')

train_avg = np.average(train_scores,axis=1)
test_avg = np.average(test_scores,axis=1)

plt.clf()
plt.plot(lc_sizes, train_avg, 'go')
plt.plot(lc_sizes, test_avg, 'ro')
plt.title("Arrhythmia - log reg Learning Curve - f1_weighted scoring")
# plt.show()
plt.savefig("logreg_learningcurve")
#here we can see that error is due to a mix of bias and variance, though variance still dominates. 
#The score of this estimator is significantly better than that of the best kNN classifier
#So I would recommend logistic regression over a k-nearest-neighbors approach which uses all variables

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [20]:
#5d - finding best beatures and running knn on them
#find best features with forward search
from sklearn.cross_validation import cross_val_score



In [21]:
#forward search method
#note: using f1_weighted gave errors here, so I used regular scoring instead
dfa_cols_left = dfa.columns.tolist()
dfa_cols_left.remove('Output')

knc_foropt = KNeighborsClassifier(n_neighbors = 3, weights = 'distance', n_jobs = -1)

# F = []
# best_F = []
# best_fscore = 0.

F = ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19', 'Input_111', 'Input_75', 'Input_109', 'Input_21', 'Input_102', 'Input_150', 'Input_192', 'Input_170', 'Input_214']
best_F = ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19', 'Input_111', 'Input_75', 'Input_109', 'Input_21', 'Input_102', 'Input_150', 'Input_192', 'Input_170', 'Input_214']
best_fscore = 0.716803760282

for val in best_F:
    dfa_cols_left.remove(val)

#threshold for value of adding a variable
#set fairly high, since k nearest neighbors (especially with distance weighting) get costlier with increasing dimension
# marg_value_thresh = 0.0001
# marg_value_thresh = .00001
#find that optimal set of variables = 17 inputs, which capture essentially all of the value extractable by a 3NN model
#(the current setup of the below loop verifies this, printing out the value of adding a new input to the optimum I found beforehand)
#(each input either does not increase the score, or decreases it!)
marg_value_thresh = 0.
variable_value_thresh = marg_value_thresh
while(len(dfa_cols_left) > 0):
    print("Just included " + str(len(best_F)) + " variables: best score so far is " + str(best_fscore) + ", with " + str(len(best_F)) + " variables used")
    Ftest = F
    best_score = best_fscore
    best_var = ""
    not_valuable = []
    for col in dfa_cols_left:
        Ftest.append(col)
        currX = np.asarray(dfa[Ftest])
        scores = cross_val_score(knc_foropt, currX, y, cv=2)
        curr_score = np.mean(scores)
        print(col + ": changes score by " + str(curr_score - best_fscore))
        if(curr_score > best_score):
            best_score = curr_score
            best_var = col
        if(curr_score < best_fscore + variable_value_thresh ):
            not_valuable.append(col)
        Ftest.remove(col)
    #remove all columns from consideration which could not improve on the score by more than the variable threshold
    #(apart from the best column, if it is one of these)
    for column in not_valuable:
        if(not(column == best_var)):
            dfa_cols_left.remove(column)
    
    F.append(best_var)
    dfa_cols_left.remove(best_var)
    
    if(best_score > best_fscore+marg_value_thresh):
        best_fscore = best_score
        best_F = F
    else:
        break
        
print(best_F)
print(best_fscore)

# ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19']
# 0.639306698002

# ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19', 'Input_111', 'Input_75', 'Input_109', 'Input_21']
# 0.685663924794

# ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19', 'Input_111', 'Input_75', 'Input_109', 'Input_21', 'Input_102', 'Input_150', 'Input_192', 'Input_170']
# 0.714629847239

#marginal increase after this was relatively small; adding Input_214 added ~.0022 to score, adding further variables apparently did not increase score
#(since variable-additions were not accepted even with threshold = 0.)


# ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19', 'Input_111', 'Input_75', 'Input_109', 'Input_21', 'Input_102', 'Input_150', 'Input_192', 'Input_170', 'Input_214']
# 0.716803760282

Just included 17 variables: best score so far is 0.716803760282, with 17 variables used
Input_0: changes score by -0.119506462985
Input_1: changes score by -0.0397571484528
Input_2: changes score by -0.0862710536623
Input_3: changes score by -0.112749706228
Input_4: changes score by -0.042009400705
Input_5: changes score by -0.131844888367
Input_6: changes score by -0.0862710536623
Input_7: changes score by -0.0993145319232
Input_8: changes score by -0.0772620446533
Input_9: changes score by -0.147610654132
Input_10: changes score by -0.0865844104974
Input_11: changes score by -0.101723462593
Input_12: changes score by -0.141402271837
Input_13: changes score by -0.0503916960438
Input_15: changes score by -0.0482961222091
Input_16: changes score by -0.0972972972973
Input_17: changes score by -0.0971406188797
Input_18: changes score by -0.00225225225223
Input_20: changes score by -0.108401880141
Input_22: changes score by 2.12052597703e-14
Input_23: changes score by 2.12052597703e-14
Inp

ValueError: list.remove(x): x not in list

In [22]:
F_touse = ['Input_90', 'Input_14', 'Input_121', 'Input_98', 'Input_114', 'Input_46', 'Input_223', 'Input_19', 'Input_111', 'Input_75', 'Input_109', 'Input_21', 'Input_102', 'Input_150', 'Input_192', 'Input_170', 'Input_214']
X_best = np.asarray(dfa[F_touse])

# n_neighbors = [3,10,30,100]
# best was 3,distance, with a score of .6629
#(note: when only uniform was tested, n=3 was still best, but score was .5855)
#(similarly, when standard scoring rather than f1_weighted was used, 3,distance was still optimal)

# n_neighbors = [1,2,3,5,8]
#best was still 3, distance

n_neighbors = [3,4,6,7]
#again, 3,distance best
weights = ['uniform','distance']


search_grid = {'n_neighbors':n_neighbors, 'weights': weights}

gscv = GridSearchCV(knc,search_grid, cv=2, scoring = 'f1_weighted')
gscv.fit(X_best,y)
print(gscv.best_estimator_)
print(gscv.best_score_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='distance')
0.66291524946


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [23]:
lc_sizes, train_scores, test_scores = learning_curve(knc_o, X_best, y, cv = 2, scoring = 'f1_weighted')
train_avg = np.average(train_scores,axis=1)
test_avg = np.average(test_scores,axis=1)

nvarsused = str(len(best_F))

plt.clf()
plt.plot(lc_sizes, train_avg, 'go')
plt.plot(lc_sizes, test_avg, 'ro')
plt.title("Arrhythmia - kNN Learning Curve - f1_weighted scoring - " + nvarsused + " variables")
plt.savefig("knn_learningcurve_" + nvarsused + "vars")



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [24]:
#we can see that the reduced-variable kNN model has some bias (score on training set is non-negligibly smaller than 1)
#however, variance (difference between training and test scores) still predominates
#final training + test scores look very similar to those of logistic regression, though test score is slightly better

In [25]:
#Summary of results (average score for 2-fold cross-validation)
#initial kNN (k=3): .4945
#logistic regression: .6569
#reduced-variable kNN (k=3): .6629
#since the reduced-variable kNN scores slightly better, I would recommend it as a classifier.
#However, in general, the logistic regression seems somewhat safer for low-effort analyses
#since the kNN model requires some costly optimization over variables (and k) to beat it, and takes more time to predict as well

In [26]:
#5e
#variance of score as k increases
#stratified k-fold throws errors if more than 2-fold cross-validation is used, since there are few instances of some output classes
#as a result, I will use unstratified 5-fold cross-validations and take the variance of the results
from sklearn.cross_validation import KFold, StratifiedKFold

In [30]:
variances = []
nrows_arr = len(y)

n_fold = 3
n_fold_strat = 2
# ks = [1,2,3,4,5,8,10,15,20,30,100,200]
ks = np.asarray(range(1,30)).tolist()
ks += [40,45,50,55,60,65,70,75,80,90,100,200]

loops_per = 10
for k in ks:
    knc_k = KNeighborsClassifier(n_neighbors = k, weights = 'distance', n_jobs = -1)
    k_scores = []
    subvars = []
    for loop in range(0,loops_per):
#         cv_curr = KFold(nrows_arr, n_folds = n_fold, shuffle = True)
        cv_curr = KFold(nrows_arr, n_folds = n_fold_strat, shuffle = True)
        curr_scores = cross_val_score(knc_k, X_best, y, cv=cv_curr)
        subvars.append(np.var(curr_scores))
        k_scores += curr_scores.tolist()
    k_var = np.var(k_scores)
    variances.append(k_var)
    print(str(k) + ": " + str(k_var) + "; std of subvariances is " + str(np.var(subvars)**.5))
plt.clf()
plt.plot(ks,variances,'b-')
plt.title("Variance of " + str(3) + "-fold cross-validation of reduced-input kNN model vs k")
plt.xlabel("k")
plt.ylabel("Cross-validation Variance")
plt.show()

1: 0.00101241287493; std of subvariances is 0.000689929282184
2: 0.000618490093194; std of subvariances is 0.000415941858886
3: 0.00151029837889; std of subvariances is 0.000702121138823
4: 0.000560879865299; std of subvariances is 0.000181050409115
5: 0.000651969613909; std of subvariances is 0.000420888977505
6: 0.000622405826611; std of subvariances is 0.000638813834531
7: 0.000481635210275; std of subvariances is 0.000408596509453
8: 0.000495340277234; std of subvariances is 0.000340640675837
9: 0.000477474743519; std of subvariances is 0.000334291407312
10: 0.000697930534889; std of subvariances is 0.00073543585784
11: 0.000952453206986; std of subvariances is 0.00139435517577
12: 0.000656815334012; std of subvariances is 0.000485831368566
13: 0.000970074007362; std of subvariances is 0.00064151157889
14: 0.000782118803352; std of subvariances is 0.000722421145296
15: 0.000358828020988; std of subvariances is 0.00041215332212
16: 0.000807571070562; std of subvariances is 0.0009800

In [94]:
plt.clf()
plt.plot(ks,variances,'b-')
plt.title("Variance of " + str(3) + "-fold cross-validation of reduced-input kNN model vs k")
plt.xlabel("k")
plt.ylabel("Cross-validation Variance")
plt.show()

In [31]:
X_best.shape

(452, 17)