In [1]:
import numpy as np

import mltools as ml
import sys
sys.path.append('code')

import matplotlib.pyplot as plt 
from sklearn import preprocessing
from tqdm import tqdm

In [2]:
# Seed NumPy's global RNG before anything else; presumably this is what makes
# the shuffle below repeatable (assumes ml.shuffleData draws from np.random).
np.random.seed(0)

# Load the training features and rescale every column (axis=0) by its max norm.
X = preprocessing.normalize(
    np.genfromtxt('data/X_train.txt', delimiter=','),
    norm="max",
    axis=0,
)
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')
X, Y = ml.shuffleData(X, Y)

# 75% training and 25% testing; `data` keeps the whole split result as well.
data = ml.splitData(X, Y, 0.75)
Xtr, Xte, Ytr, Yte = data

In [3]:
# Ordered list of the feature indices chosen so far (best first).
best_feature_pool = list()

# Two append-as-you-go logs, both truncated on open:
#   out_file   - one selected feature index per line
#   score_file - the selected feature list so far and its score, one line per round
out_file = open("feature_select/selected_feature_normalized.txt", mode='w')
score_file = open("feature_select/feature_score_normalized.txt", mode='w')

In [4]:
# Round 1 of the greedy forward selection: score every feature on its own
# with a 1-nearest-neighbour classifier and keep the one with the best AUC
# on the held-out split.
unselected_features = list(range(Xtr.shape[1]))
feature_pool = []  # [feature index, held-out AUC] pairs, one per candidate
for feature in tqdm(unselected_features):
    knn = ml.knn.knnClassify()
    # Slice with a list so the single column stays 2-D, matching the way the
    # later rounds index several columns at once.
    knn.train(Xtr[:, [feature]], Ytr)
    # The neighbour count is spelled `K` (upper case) where this notebook
    # sweeps it later on; the previous lower-case `k` set a different attribute.
    knn.K = 1
    feature_pool.append([feature, knn.auc(Xte[:, [feature]], Yte)])

# Pick the winner once and reuse it for both log files.
best_feature, best_auc = max(feature_pool, key=lambda entry: entry[1])
best_feature_pool.append(best_feature)
out_file.write(str(best_feature) + "\n")
out_file.flush()
score_file.write(str(best_feature_pool) + " , " + str(best_auc) + "\n")
score_file.flush()
unselected_features.remove(best_feature)

100%|██████████| 107/107 [00:59<00:00,  1.79it/s]


In [None]:
# Remaining rounds of the greedy forward selection. Each round tries every
# still-unselected feature together with the ones already chosen, keeps the
# candidate with the best held-out AUC, and logs the result immediately so an
# interrupted run still leaves usable files behind.
# Looping until the candidate list is empty (instead of a fixed 106 rounds)
# keeps this correct for any number of features.
while unselected_features:
    print("now selecting #{} best features".format(len(best_feature_pool) + 1))
    feature_pool = []  # [feature index, held-out AUC] pairs for this round
    for feature in tqdm(unselected_features):
        candidate_columns = best_feature_pool + [feature]
        knn = ml.knn.knnClassify()
        knn.train(Xtr[:, candidate_columns], Ytr)
        # The neighbour count is spelled `K` (upper case) where this notebook
        # sweeps it later on; the previous lower-case `k` set a different attribute.
        knn.K = 1
        feature_pool.append([feature, knn.auc(Xte[:, candidate_columns], Yte)])

    # Find the round's winner once and reuse it everywhere below.
    best_feature, best_auc = max(feature_pool, key=lambda entry: entry[1])
    best_feature_pool.append(best_feature)
    print("current best score: ", best_auc)
    out_file.write(str(best_feature) + "\n")
    out_file.flush()
    score_file.write(str(best_feature_pool) + " , " + str(best_auc) + "\n")
    score_file.flush()
    unselected_features.remove(best_feature)

  0%|          | 0/106 [00:00<?, ?it/s]

now selecting #2 best features


100%|██████████| 106/106 [01:35<00:00,  1.11it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

current best score:  0.6276256277760037
now selecting #3 best features


  9%|▊         | 9/105 [00:10<01:50,  1.15s/it]

In [None]:
# Close both selection logs so their contents are fully written before they are read back.
for log_handle in (out_file, score_file):
    log_handle.close()

In [None]:
# Rebuild the selection history from the score log.
# Every log line has the form "[f1, f2, ..., fn] , <auc>". Each parsed row is
# the flat list [f1, f2, ..., fn, auc], so row[:-1] holds the feature indices
# and row[-1] the score.
# Splitting on every comma and eval()-ing the pieces fails as soon as a line
# lists more than one feature (a fragment such as "[5" is not valid Python),
# so the brackets are stripped first and the tokens are converted explicitly.
sorted_features = []
with open('feature_select/feature_score_normalized.txt') as score_log:
    for line in score_log:
        if not line.strip():
            continue  # tolerate blank lines
        tokens = line.replace("[", "").replace("]", "").split(",")
        sorted_features.append(
            [int(token) for token in tokens[:-1]] + [float(tokens[-1])]
        )

In [None]:
# The last entry of each parsed log row is that round's score.
score = [log_row[-1] for log_row in sorted_features]

In [None]:
# Plot the logged score against the number of selected features.
# The x axis is derived from the number of logged rounds, so the plot also
# works when the log holds fewer (or more) than 107 rows.
plt.plot(range(1, len(score) + 1), score, label="testing auc")
plt.title("Number of features used vs auc score with normalize")
plt.legend()
plt.xlabel("number of features used")
plt.ylabel("auc score")
plt.savefig('knn_output/feature_selection_normalized.png')
plt.show()

In [None]:
# Pair every score with its zero-based position in the log.
score_i = list(enumerate(score))

In [None]:
# score_i holds (zero-based row index, score) pairs, so the number of features
# used in the best round is the index plus one. Report the count, the score
# and the row index explicitly instead of printing the raw tuple.
best_index, best_score = max(score_i, key=lambda pair: pair[1])
print("the optimal number of features would be {} (auc {}, row index {})".format(
    best_index + 1, best_score, best_index))

In [None]:
# Hard-coded choice of log row 29 (zero-based); everything except the row's
# last entry is kept as the selected feature set.
chosen_row = sorted_features[29]
best_features = chosen_row[:-1]

In [None]:
# Train a single kNN model on the selected columns, then sweep K over the
# powers of two 1, 2, 4, ..., 512 and record the AUC on both splits.
naive_knn = ml.knn.knnClassify()
naive_knn.train(Xtr[:, best_features], Ytr)

k_values = [2 ** exponent for exponent in range(10)]
knn_tr_auc, knn_te_auc = [], []
for k in tqdm(k_values):
    naive_knn.K = k
    knn_tr_auc += [naive_knn.auc(Xtr[:, best_features], Ytr)]
    knn_te_auc += [naive_knn.auc(Xte[:, best_features], Yte)]

In [None]:
# Plot training and testing AUC against the exponent x of K = 2**x.
# The x ranges follow the number of recorded scores instead of a fixed 10.
plt.plot(range(len(knn_tr_auc)), knn_tr_auc, label="training auc")
plt.plot(range(len(knn_te_auc)), knn_te_auc, label="testing auc")
# The curves are AUC values, so the title says "auc" rather than "error rate".
plt.title("k value vs auc on selected features with normalization")
plt.xlabel("K value = 2**x")
plt.ylabel("auc")
# The legend has to be drawn before saving, otherwise the PNG lacks it.
plt.legend()
plt.savefig('selected_feature_error_normalize.png')
plt.show()

In [None]:
# Show the (position in the K sweep, testing AUC) pair with the highest testing AUC.
max(enumerate(knn_te_auc), key=lambda position_and_auc: position_and_auc[1])

In [None]:
def generating_output(learner, filename="output", feature_space=None, normalize=True):
    """Write soft predictions for 'data/X_test.txt' to '<filename>.txt'.

    Parameters
    ----------
    learner : object
        Trained model exposing ``predictSoft(X)``; column 1 of its result is
        written as the prediction (presumably the positive-class score --
        TODO confirm against the learner).
    filename : str
        Output path without the ``.txt`` extension.
    feature_space : sequence of int or None
        Column indices of the test file to keep. ``None`` (the default) keeps
        every column. The default used to be a list built from the global
        ``X`` when the function was defined, which made the definition depend
        on notebook state and shared one mutable list between calls.
    normalize : bool
        When true, rescale every kept column by its max norm before predicting.

    The output file starts with the header ``Id,Predicted`` followed by one
    ``<row index>, <prediction with two decimals>`` line per test row.
    """
    test_features = np.genfromtxt('data/X_test.txt', delimiter=',')
    if feature_space is not None:
        test_features = test_features[:, feature_space]
    if normalize:
        print("normalizing data")
        # NOTE(review): this scales by the test file's own column maxima. If
        # the learner was trained on data scaled by the training set's maxima,
        # the two scalings can differ -- confirm this is intended.
        test_features = preprocessing.normalize(test_features, norm="max", axis=0)
    # Two columns: running row id and the prediction taken from column 1.
    submission = np.vstack((
        np.arange(test_features.shape[0]),
        learner.predictSoft(test_features)[:, 1],
    )).T
    np.savetxt("{}.txt".format(filename), submission, '%d, %.2f',
               header='Id,Predicted', comments='', delimiter=',')

In [None]:
# naive_knn.K=16

In [None]:
# naive_knn.auc(Xte[:,best_features],Yte)

In [None]:
# Display the training feature matrix X (max-normalised and shuffled in the data-loading cell).
X