In [0]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.metrics import make_scorer


In [0]:
def read(a):
  f = open(a,"r")
  c = StringIO(f.read())
  data, meta = arff.loadarff(c)
  return data

In [0]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        j = 0
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            if j > 1:
                break
            j+=1


def remove_field(data, name):
	names = list(data.dtype.names)
	if name in names:
		names.remove(name)
	return data[names]


def randomCV(clf, X, y, param_grid, n_iter, cv):
	random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
					n_iter = n_iter, cv = cv, iid = False, n_jobs = -1)
	random_search.fit(X, y)
	report(random_search.cv_results_)
	return random_search.best_params_	

def KNN(X, y):
	
	neigh = KNeighborsClassifier()
	param_grid = {
		"n_neighbors" : np.arange(1,20),
		"algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute'],
		"weights" : ['uniform', 'distance'],
		"leaf_size" : np.arange(1,60)
	}
	return randomCV(neigh, X, y, param_grid, 400, 6)

def SVM(X, y):

#        C_grid = [0.1, 1, 10]
#        gamma_grid = np.logspace(-2, 1, 4)[0:3]
#        svm_C = svm.SVC(kernel='poly')
#        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, "kernel" : ['poly', 'rbf', 'sigmoid'], }
#        gridcv = GridSearchCV(svm_C, param_grid, verbose=1, cv=3)
#        gridcv.fit(X, y)
#        print("best parameters:", gridcv.best_params_)
#        print("%.1f%% accuracy on validation sets (average)" % (gridcv.best_score_*100))

	svm_C = svm.SVC()
	param_grid = {
		"kernel" : ['linear', 'rbf', 'sigmoid'],
		"gamma" : ['scale', 'auto'],
		"degree" : np.arange(10),
		"coef0" : np.random.rand(60)*10,
		"shrinking" : [False, True],
		"decision_function_shape" : ['ovo','ovr']
	}
	return randomCV(svm_C, X, y, param_grid, 4, 6)

def DT(X, y):
	dt = DecisionTreeClassifier()
	param_grid = {
		"criterion" : ['gini', 'entropy'],
		"splitter" : ['best', 'random'],
		"min_samples_split" : np.random.random_sample((100,)),
		"max_features" : ['auto', 'sqrt', 'log2', None],
		"class_weight" : [None, 'balanced'],
		"presort" : [True, False],
		"min_samples_leaf" : np.arange(1,6)
	}
	return randomCV(dt, X, y, param_grid, 400, 6)

def RF(X, y):
	rf = RandomForestClassifier()
	param_grid = {
		"n_estimators" : [10*x for x in np.arange(1,50)],
		"criterion" : ['gini', 'entropy'],
		"min_samples_split" : np.random.random_sample((100,)),
		"max_features" : ['auto', 'sqrt', 'log2', None],
#		"class_weight" : [None, 'balanced'],
		"min_samples_leaf" : np.arange(1,6),
#		"bootstrap" : [True, False],
#		"oob_score" : [True, False],
		"warm_start" : [True, False],
	}
	return randomCV(rf, X, y, param_grid, 40, 6)

def Ada(X, y):
	ada = AdaBoostClassifier(algorithm = "SAMME")
	param_grid = {
#		"base_estimator" : ['classes', 'n_classes_', None],
		"n_estimators" : [10*x for x in np.arange(1,50)]
#		"learning_rate" : [10*x for x in np.random.random_sample((100,))]
#		"algorithm" : ['SAMME']
	}
	return randomCV(ada, X, y, param_grid, 40, 6)

def LR(X, y):
	lr = LogisticRegression()
	param_grid = {
		"penalty" : ['l1', 'l2'],
#		"dual" : [True, False],
		"C" : np.random.rand(60),
		"fit_intercept" : [True, False],
		"warm_start" : [True, False],
		"multi_class" : ['ovr', 'auto'],
		"solver" : [ 'liblinear']
	}
	return randomCV(lr, X, y, param_grid, 400, 6)

def GNB(X, y):
	gnb = GaussianNB()
	param_grid = {
		"var_smoothing" : np.random.random_sample((100,))
	}
	return randomCV(gnb, X, y, param_grid, 100, 6)

def NN(X, y):
	nn = MLPClassifier()
	param_grid = {
		"hidden_layer_sizes" : np.arange(2,200),
		"activation" : ['identity', 'logistic', 'tanh', 'relu'],
		"solver" : ['lbfgs', 'sgd', 'adam'],
#		"verbose" : [True, False],
		"warm_start" : [False, True]
	}
	return randomCV(nn, X, y, param_grid, 200, 6)


In [0]:

data = read('9.arff')

np.random.shuffle(data)

y = data[data.dtype.names[-1]]														# Separate target variable
X = remove_field(data, data.dtype.names[-1])

le = preprocessing.LabelEncoder()													# Preprocessing

x = np.empty_like(X[X.dtype.names[1]], dtype = 'float64')

for i in X.dtype.names:
	if X[i].dtype != np.float64:
		X[i] = le.fit_transform(X[i])
		x = np.vstack((x, X[i].astype(np.float64)))
	else:
		x = np.vstack((x, X[i]))

x = x[1:].T

x_test, x_train = np.split(x, [70])
y_test, y_train = np.split(y, [70])

scaler = StandardScaler()                         # scaling features
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [36]:
# --------------------> KNN

param = KNN(x_train,y_train)
knn_c = KNeighborsClassifier().set_params(**param)
knn_c.fit(x_train, y_train)

print("Score with test data",knn_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'weights': 'uniform', 'n_neighbors': 9, 'leaf_size': 12, 'algorithm': 'brute'}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'weights': 'uniform', 'n_neighbors': 8, 'leaf_size': 6, 'algorithm': 'ball_tree'}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'weights': 'uniform', 'n_neighbors': 13, 'leaf_size': 16, 'algorithm': 'ball_tree'}

Score with test data 0.8714285714285714


In [37]:
# --------------------> Decision Tree

param = DT(x_train,y_train)
dt_c = DecisionTreeClassifier().set_params(**param)
dt_c.fit(x_train, y_train)

print("Score with test data",dt_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.853 (std: 0.012)
Parameters: {'splitter': 'random', 'presort': True, 'min_samples_split': 0.25827862455637507, 'min_samples_leaf': 3, 'max_features': 'auto', 'criterion': 'gini', 'class_weight': None}

Model with rank: 2
Mean validation score: 0.850 (std: 0.011)
Parameters: {'splitter': 'best', 'presort': False, 'min_samples_split': 0.37032848533129137, 'min_samples_leaf': 1, 'max_features': 'log2', 'criterion': 'entropy', 'class_weight': None}

Model with rank: 3
Mean validation score: 0.850 (std: 0.002)
Parameters: {'splitter': 'best', 'presort': True, 'min_samples_split': 0.8119781696359616, 'min_samples_leaf': 1, 'max_features': 'log2', 'criterion': 'entropy', 'class_weight': None}

Model with rank: 3
Mean validation score: 0.850 (std: 0.002)
Parameters: {'splitter': 'best', 'presort': False, 'min_samples_split': 0.15757383924304402, 'min_samples_leaf': 4, 'max_features': 'log2', 'criterion': 'entropy', 'class_weight': None}

Model with r

In [38]:
# --------------------> Random Forest

param = RF(x_train,y_train)
rf_c = RandomForestClassifier().set_params(**param)
rf_c.fit(x_train, y_train)

print("Score with test data",rf_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': False, 'n_estimators': 390, 'min_samples_split': 0.22946752813048943, 'min_samples_leaf': 3, 'max_features': 'auto', 'criterion': 'entropy'}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': False, 'n_estimators': 70, 'min_samples_split': 0.47269581735621313, 'min_samples_leaf': 3, 'max_features': 'auto', 'criterion': 'entropy'}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': True, 'n_estimators': 50, 'min_samples_split': 0.0400753388780859, 'min_samples_leaf': 4, 'max_features': 'auto', 'criterion': 'entropy'}

Score with test data 0.8714285714285714


In [39]:
# --------------------> Adaboost

param = Ada(x_train,y_train)
ada_c = AdaBoostClassifier().set_params(**param)
ada_c.fit(x_train, y_train)

print("Score with test data",ada_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.845 (std: 0.006)
Parameters: {'n_estimators': 10}

Model with rank: 2
Mean validation score: 0.840 (std: 0.015)
Parameters: {'n_estimators': 70}

Model with rank: 2
Mean validation score: 0.840 (std: 0.015)
Parameters: {'n_estimators': 60}

Score with test data 0.8285714285714286


In [40]:
# ---------------------> Logistic regression

param = LR(x_train,y_train)
lr_c = LogisticRegression().set_params(**param)
lr_c.fit(x_train, y_train)

print("Score with test data",lr_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': True, 'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'ovr', 'fit_intercept': True, 'C': 0.10341090010073939}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': True, 'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'auto', 'fit_intercept': True, 'C': 0.19092318858371737}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': True, 'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'auto', 'fit_intercept': True, 'C': 0.07972252148363357}

Score with test data 0.8714285714285714


In [41]:
# ---------------------> Gaussian NB

param = GNB(x_train,y_train)
gnb_c = GaussianNB().set_params(**param)
gnb_c.fit(x_train, y_train)

print("Score with test data",gnb_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.805 (std: 0.030)
Parameters: {'var_smoothing': 0.9877278745637731}

Model with rank: 1
Mean validation score: 0.805 (std: 0.030)
Parameters: {'var_smoothing': 0.9821536153589023}

Model with rank: 3
Mean validation score: 0.805 (std: 0.029)
Parameters: {'var_smoothing': 0.9646156288740352}

Model with rank: 3
Mean validation score: 0.805 (std: 0.029)
Parameters: {'var_smoothing': 0.9748647925118005}

Model with rank: 3
Mean validation score: 0.805 (std: 0.029)
Parameters: {'var_smoothing': 0.9708632605710694}

Score with test data 0.8428571428571429


In [42]:
# ---------------------> Neural Network

param = NN(x_train,y_train)
nn_c = MLPClassifier().set_params(**param)
nn_c.fit(x_train, y_train)

print("Score with test data",nn_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.850 (std: 0.007)
Parameters: {'warm_start': False, 'solver': 'sgd', 'hidden_layer_sizes': 21, 'activation': 'relu'}

Model with rank: 2
Mean validation score: 0.850 (std: 0.008)
Parameters: {'warm_start': False, 'solver': 'sgd', 'hidden_layer_sizes': 14, 'activation': 'relu'}

Model with rank: 3
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': False, 'solver': 'sgd', 'hidden_layer_sizes': 47, 'activation': 'logistic'}

Model with rank: 3
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': True, 'solver': 'sgd', 'hidden_layer_sizes': 182, 'activation': 'relu'}

Model with rank: 3
Mean validation score: 0.848 (std: 0.004)
Parameters: {'warm_start': True, 'solver': 'sgd', 'hidden_layer_sizes': 138, 'activation': 'tanh'}

Score with test data 0.8714285714285714


In [43]:

## --------------------> SVM

param = SVM(x_train,y_train)
svm_c = svm.SVC().set_params(**param)
svm_c.fit(x_train, y_train)

print("Score with test data",svm_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'shrinking': False, 'kernel': 'sigmoid', 'gamma': 'auto', 'degree': 5, 'decision_function_shape': 'ovo', 'coef0': 8.380094381471944}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'shrinking': False, 'kernel': 'sigmoid', 'gamma': 'scale', 'degree': 2, 'decision_function_shape': 'ovo', 'coef0': 4.234195903371953}

Model with rank: 1
Mean validation score: 0.848 (std: 0.004)
Parameters: {'shrinking': True, 'kernel': 'sigmoid', 'gamma': 'auto', 'degree': 5, 'decision_function_shape': 'ovo', 'coef0': 2.881966300202489}

Score with test data 0.8714285714285714
