# How to load the data quickly

In [None]:
import numpy as np
import pandas as pd
import csv

def get_instances_from_csv(datapath, train_or_eval, shuffle=True):
	"""
	Read one CSV file and split it into features and class labels.

	The file is read without header handling and its first line is then
	discarded. Columns 0-5 are used as features, column 6 as class label.

	Input:
		datapath: path of the CSV file to read
		train_or_eval: "train" -> the first 500000 rows,
			"eval" -> every row from row 500000 on,
			anything else -> all rows
		shuffle: when True (default) the rows are randomly permuted before
			the split is taken.
			NOTE(review): callers in this file pass a third positional
			argument (False); it is interpreted here as this flag -
			confirm that this is the intended meaning.
	Output: list with first element np array with features (float),
		second element np array with respective class labels (int)
	"""
	# Row index at which the "train" part ends and the "eval" part begins.
	split_row = 500000

	# header=None keeps the first line as a data row; [1:] removes it again.
	df = pd.read_csv(datapath, sep=',', header=None)[1:]
	if shuffle:
		df = df.sample(frac=1)
	values = df.reset_index(drop=True).values

	# The first line is already gone, so slices start at row 0; starting at
	# row 1 would silently throw away one data row.
	if train_or_eval == "train":
		rows = values[:split_row]
	elif train_or_eval == "eval":
		rows = values[split_row:]
	else:
		rows = values

	# Builtin float/int replace the np.float/np.int aliases, which no longer
	# exist in current NumPy releases.
	return [rows[:, 0:6].astype(float), rows[:, 6].astype(int)]

# Getting the Training/Evaluation Sets for Cross-Validation

In [None]:
def get_cross_validation_file_indices(cross_validation_runs, num_files=10):
	"""
	Build the file-index split for each cross-validation run.

	Run i holds out file index i for evaluation and trains on every other
	file index in range(num_files).

	Input:
		cross_validation_runs: number of runs to generate; an IndexError is
			raised if it exceeds num_files
		num_files: total number of file indices available (default 10)
	Output: list with one [training_indices, eval_index] entry per run,
		where training_indices is a list of ints and eval_index an int
	"""
	file_indices = list(range(num_files))
	training_eval = []
	for run in range(cross_validation_runs):
		eval_index = file_indices[run]
		# Everything except the held-out index, in ascending order.
		training = file_indices[:run] + file_indices[run + 1:]
		training_eval.append([training, eval_index])
	return training_eval
	
def get_training_eval_set(training_eval_index):
	"""
	Load the training and evaluation instances for one cross-validation run.

	Input: training_eval_index: pair [training_file_indices, eval_file_index];
		each index i refers to the file "../sub_datasets/subset_<i>.csv"
	Output: [instances_train, instances_eval], each of the form
		[features, class_labels]; instances_train is the row-wise
		concatenation of all training files in the given order
	"""
	train_features = []
	train_labels = []
	for i in training_eval_index[0]:
		DATAPATH = "../sub_datasets/subset_"+str(i)+".csv"
		file_instances = get_instances_from_csv(DATAPATH, "all")
		train_features.append(file_instances[0])
		train_labels.append(file_instances[1])

	if train_features:
		# Stack once after the loop so that every file contributes and the
		# arrays are copied a single time.
		instances_train = [np.vstack(train_features), np.concatenate(train_labels)]
	else:
		# No training files requested: return an empty pair instead of
		# failing on an unbound variable.
		instances_train = [np.array([], dtype=np.float32), np.array([], dtype=np.float32)]

	DATAPATH = "../sub_datasets/subset_"+str(training_eval_index[1])+".csv"
	instances_eval = get_instances_from_csv(DATAPATH, "all")

	return [instances_train, instances_eval]

# KNN

In [6]:
import warnings
# ConvergenceWarning lives in sklearn.exceptions; the sklearn.utils location
# is kept only as a fallback for old scikit-learn installations.
try:
	from sklearn.exceptions import ConvergenceWarning
except ImportError:
	from sklearn.utils import ConvergenceWarning
# Silence convergence warnings for the whole cell.
warnings.simplefilter('ignore', ConvergenceWarning)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from startEvaluation import *
from getData import *

# Cross-validated evaluation of a k-nearest-neighbours classifier.
# Number of cross-validation runs (one model is trained and scored per run).
cross_validation_runs = 10
# Value passed as n_neighbors to KNeighborsClassifier.
xNN = 3

# One entry per run; each entry is handed to get_training_eval_set below.
training_eval_index = get_cross_validation_file_indices(cross_validation_runs)

# Accumulators over all runs. confusion_matrix starts as the scalar 0 and has
# each run's get_conf_matrix() result added to it.
mean_accuracy = 0
confusion_matrix = 0
for i in range(cross_validation_runs):
	# Element 0 is the training part, element 1 the evaluation part.
	training_eval_instances = get_training_eval_set(training_eval_index[i])
	training_instances = training_eval_instances[0]
	x = training_instances[0]
	y = training_instances[1]
	
	# A fresh model is fitted for every run.
	model = KNeighborsClassifier(n_neighbors=xNN)
	model.fit(x, y)

	eval_instances = training_eval_instances[1]
	eval_x = eval_instances[0]
	eval_y = eval_instances[1]
	predicted = model.predict(eval_x)

	# NOTE(review): `evaluation` is not defined in this view - presumably it
	# comes from the `startEvaluation` star import; confirm.
	test_eval = evaluation(eval_y.ravel(), predicted)
	mean_accuracy += test_eval.get_accuracy()
	confusion_matrix += test_eval.get_conf_matrix()
# Turn the summed accuracies into their mean; the confusion matrix stays a sum.
mean_accuracy /= cross_validation_runs
print("mean_accuracy: ", mean_accuracy)
print("confusion_matrix: ", confusion_matrix)

mean_accuracy:  0.690443354922
confusion_matrix:  [[ 443651    6098   21499  102999   37575   30536  642358]
 [   1378 1040850     792    1547     619     253 1045439]
 [   9045    1053 1007032    6758    2692    2112 1028692]
 [ 156727    6524   27178  754470  139315  124798 1209012]
 [ 104552    3895   19061  268989  298120   99577  794194]
 [  99558    2606   16898  271332  119209  214536  724139]
 [ 814911 1061026 1092460 1406095  597530  471812 5443834]]


  exec(code_obj, self.user_global_ns, self.user_ns)


# Naive Bayes

In [4]:
import warnings
# ConvergenceWarning lives in sklearn.exceptions; the sklearn.utils location
# is kept only as a fallback for old scikit-learn installations.
try:
	from sklearn.exceptions import ConvergenceWarning
except ImportError:
	from sklearn.utils import ConvergenceWarning
# Silence convergence warnings for the whole cell.
warnings.simplefilter('ignore', ConvergenceWarning)
from sklearn.naive_bayes import GaussianNB
from startEvaluation import *
from getData import *

def onlineNaiveBayes(batch_size):
	"""
	Train the global `model` incrementally on the global `x` / `y`.

	The data is fed to model.partial_fit in consecutive slices of
	batch_size rows; the last slice may be shorter. If `x` is empty no
	call is made.

	Input: batch_size: number of rows per partial_fit call (positive int)
	Output: None; `model` is updated in place
	"""
	# Local import so the function does not depend on what the star imports
	# of the surrounding cell happen to expose.
	import numpy as np

	# partial_fit needs the complete set of labels on its first call.
	classes = np.unique(y)
	# Iterating over explicit batch starts ends the loop at the end of the
	# data, so real training errors are no longer swallowed by a bare except.
	for start in range(0, len(x), batch_size):
		stop = start + batch_size
		if start == 0:
			model.partial_fit(x[start:stop], y[start:stop], classes)
		else:
			model.partial_fit(x[start:stop], y[start:stop])

def offlineNaiveBayes():
	"""
	Fit the global `model` on the global `x` / `y` in a single call.

	The labels are flattened with ravel() before they are passed to fit.
	"""
	flat_labels = y.ravel()
	model.fit(x, flat_labels)

# Cross-validated evaluation of a Gaussian Naive Bayes classifier.
# Number of cross-validation runs (one model is trained and scored per run).
cross_validation_runs = 10
# Rows per partial_fit call; only used by the (disabled) online training path.
batch_size = 100

# One entry per run; each entry is handed to get_training_eval_set below.
training_eval_index = get_cross_validation_file_indices(cross_validation_runs)

# Accumulators over all runs. confusion_matrix starts as the scalar 0 and has
# each run's get_conf_matrix() result added to it.
mean_accuracy = 0
confusion_matrix = 0
for i in range(cross_validation_runs):
	# Element 0 is the training part, element 1 the evaluation part.
	training_eval_instances = get_training_eval_set(training_eval_index[i])
	training_instances = training_eval_instances[0]
	# x, y and model are module-level names on purpose: onlineNaiveBayes and
	# offlineNaiveBayes read them as globals.
	x = training_instances[0]
	y = training_instances[1]

	model = GaussianNB()
	# Alternative incremental training path, currently disabled:
	#onlineNaiveBayes(batch_size)
	offlineNaiveBayes()

	eval_instances = training_eval_instances[1]
	eval_x = eval_instances[0]
	eval_y = eval_instances[1]
	predicted = model.predict(eval_x)

	# NOTE(review): `evaluation` is not defined in this view - presumably it
	# comes from the `startEvaluation` star import; confirm.
	test_eval = evaluation(eval_y.ravel(), predicted)
	mean_accuracy += test_eval.get_accuracy()
	confusion_matrix += test_eval.get_conf_matrix()
# Turn the summed accuracies into their mean; the confusion matrix stays a sum.
mean_accuracy /= cross_validation_runs
print("mean_accuracy: ", mean_accuracy)
print("confusion_matrix: ", confusion_matrix)

mean_accuracy:  0.467830646393
confusion_matrix:  [[ 174116   55085  128905  210416   51633   22203  642358]
 [  25166  692779  283823     210    5132   38331 1045441]
 [  24501   74560  896430   24020    5968    3211 1028690]
 [  70017  191574  237291  514707  121558   73865 1209012]
 [  41800   64163  150101  313976  165689   58463  794192]
 [  50802   77630  115821  275309  101505  103074  724141]
 [ 386402 1155791 1812371 1338638  451485  299147 5443834]]


  exec(code_obj, self.user_global_ns, self.user_ns)
