In [0]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.metrics import make_scorer
import time

In [0]:
def read(a):
  f = open(a,"r")
  c = StringIO(f.read())
  return np.loadtxt(c)

In [0]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        j = 0
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            if j > 1:
                break
            j+=1


def randomCV(clf, X, y, param_grid, n_iter, cv):
	random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
					n_iter = n_iter, cv = cv, iid = False, n_jobs = -1)
	random_search.fit(X, y)
	report(random_search.cv_results_)
	return random_search.best_params_		

def KNN(X, y):
	
	neigh = KNeighborsClassifier()
	param_grid = {
		"n_neighbors" : np.arange(1,20),
		"algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute'],
		"weights" : ['uniform', 'distance'],
		"leaf_size" : np.arange(1,60)
	}
	return randomCV(neigh, X, y, param_grid, 400, 6)

def SVM(X, y):

#        C_grid = [0.1, 1, 10]
#        gamma_grid = np.logspace(-2, 1, 4)[0:3]
#        svm_C = svm.SVC(kernel='poly')
#        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, "kernel" : ['poly', 'rbf', 'sigmoid'], }
#        gridcv = GridSearchCV(svm_C, param_grid, verbose=1, cv=3)
#        gridcv.fit(X, y)
#        print("best parameters:", gridcv.best_params_)
#        print("%.1f%% accuracy on validation sets (average)" % (gridcv.best_score_*100))

	svm_C = svm.SVC()
	param_grid = {
		"kernel" : ['linear', 'rbf', 'sigmoid'],
		"gamma" : ['scale', 'auto'],
		"degree" : np.arange(10),
		"coef0" : np.random.rand(60)*10,
		"shrinking" : [False, True],
		"decision_function_shape" : ['ovo','ovr']
	}
	return randomCV(svm_C, X, y, param_grid, 4, 6)

def DT(X, y):
	dt = DecisionTreeClassifier()
	param_grid = {
		"criterion" : ['gini', 'entropy'],
		"splitter" : ['best', 'random'],
		"min_samples_split" : np.random.random_sample((100,)),
		"max_features" : ['auto', 'sqrt', 'log2', None],
		"class_weight" : [None, 'balanced'],
		"presort" : [True, False],
		"min_samples_leaf" : np.arange(1,6)
	}
	return randomCV(dt, X, y, param_grid, 400, 6)

def RF(X, y):
	rf = RandomForestClassifier()
	param_grid = {
		"n_estimators" : [10*x for x in np.arange(1,50)],
		"criterion" : ['gini', 'entropy'],
		"min_samples_split" : np.random.random_sample((100,)),
		"max_features" : ['auto', 'sqrt', 'log2', None],
#		"class_weight" : [None, 'balanced'],
		"min_samples_leaf" : np.arange(1,6),
#		"bootstrap" : [True, False],
#		"oob_score" : [True, False],
		"warm_start" : [True, False],
	}
	return randomCV(rf, X, y, param_grid, 40, 6)

def Ada(X, y):
	ada = AdaBoostClassifier(algorithm = "SAMME")
	param_grid = {
#		"base_estimator" : ['classes', 'n_classes_', None],
		"n_estimators" : [10*x for x in np.arange(1,50)]
#		"learning_rate" : [10*x for x in np.random.random_sample((100,))]
#		"algorithm" : ['SAMME']
	}
	return randomCV(ada, X, y, param_grid, 40, 6)

def LR(X, y):
	lr = LogisticRegression()
	param_grid = {
		"penalty" : ['l1', 'l2'],
#		"dual" : [True, False],
		"C" : np.random.rand(60),
		"fit_intercept" : [True, False],
		"warm_start" : [True, False],
		"multi_class" : ['ovr', 'auto'],
		"solver" : [ 'liblinear']
	}
	return randomCV(lr, X, y, param_grid, 400, 6)

def GNB(X, y):
	gnb = GaussianNB()
	param_grid = {
		"var_smoothing" : np.random.random_sample((100,))
	}
	return randomCV(gnb, X, y, param_grid, 100, 6)

def NN(X, y):
	nn = MLPClassifier()
	param_grid = {
		"hidden_layer_sizes" : np.arange(2,200),
		"activation" : ['identity', 'logistic', 'tanh', 'relu'],
		"solver" : ['lbfgs', 'sgd', 'adam'],
#		"verbose" : [True, False],
		"warm_start" : [False, True]
	}
	return randomCV(nn, X, y, param_grid, 200, 6)


In [0]:
data = read('Faults.NNA')
print(data.shape)

np.random.shuffle(data)

y = data[:,-1]
x = data[:,0:data.shape[1] -1]

x_test, x_train = np.split(x, [290])
y_test, y_train = np.split(y, [290])

scaler = StandardScaler()                         # scaling features
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

(1941, 34)


In [0]:
# --------------------> KNN

param = KNN(x_train,y_train)
knn_c = KNeighborsClassifier().set_params(**param)
knn_c.fit(x_train, y_train)

print("Score with test data",knn_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.988 (std: 0.004)
Parameters: {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 38, 'algorithm': 'auto'}

Model with rank: 1
Mean validation score: 0.988 (std: 0.004)
Parameters: {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 17, 'algorithm': 'kd_tree'}

Model with rank: 1
Mean validation score: 0.988 (std: 0.004)
Parameters: {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 56, 'algorithm': 'brute'}

Score with test data 0.993103448275862


In [0]:
# --------------------> Decision Tree

param = DT(x_train,y_train)
dt_c = DecisionTreeClassifier().set_params(**param)
dt_c.fit(x_train, y_train)

print("Score with test data",dt_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'splitter': 'best', 'presort': False, 'min_samples_split': 0.22500364645016824, 'min_samples_leaf': 5, 'max_features': None, 'criterion': 'entropy', 'class_weight': None}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'splitter': 'best', 'presort': False, 'min_samples_split': 0.029074150117447295, 'min_samples_leaf': 4, 'max_features': None, 'criterion': 'entropy', 'class_weight': 'balanced'}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'splitter': 'best', 'presort': False, 'min_samples_split': 0.033389156254798436, 'min_samples_leaf': 5, 'max_features': None, 'criterion': 'entropy', 'class_weight': None}

Score with test data 1.0


In [0]:
# --------------------> Random Forest

param = RF(x_train,y_train)
rf_c = RandomForestClassifier().set_params(**param)
rf_c.fit(x_train, y_train)

print("Score with test data",rf_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'warm_start': False, 'n_estimators': 20, 'min_samples_split': 0.01986813096009843, 'min_samples_leaf': 2, 'max_features': None, 'criterion': 'gini'}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'warm_start': False, 'n_estimators': 260, 'min_samples_split': 0.01986813096009843, 'min_samples_leaf': 3, 'max_features': None, 'criterion': 'entropy'}

Model with rank: 3
Mean validation score: 0.994 (std: 0.003)
Parameters: {'warm_start': False, 'n_estimators': 150, 'min_samples_split': 0.02124718871422049, 'min_samples_leaf': 1, 'max_features': 'auto', 'criterion': 'gini'}

Score with test data 1.0


In [0]:
# --------------------> Adaboost

param = Ada(x_train,y_train)
ada_c = AdaBoostClassifier().set_params(**param)
ada_c.fit(x_train, y_train)

print("Score with test data",ada_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 280}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 460}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 240}

Score with test data 1.0


In [0]:
# ---------------------> Logistic regression

param = LR(x_train,y_train)
lr_c = LogisticRegression().set_params(**param)
lr_c.fit(x_train, y_train)

print("Score with test data",lr_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'warm_start': True, 'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'ovr', 'fit_intercept': False, 'C': 0.43907250236114026}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'warm_start': False, 'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'ovr', 'fit_intercept': True, 'C': 0.249902819355884}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'warm_start': True, 'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'auto', 'fit_intercept': False, 'C': 0.8539782134068196}

Score with test data 1.0


In [0]:
# ---------------------> Gaussian NB

param = GNB(x_train,y_train)
gnb_c = GaussianNB().set_params(**param)
gnb_c.fit(x_train, y_train)

print("Score with test data",gnb_c.score(x_test, y_test))

Model with rank: 1
Mean validation score: 0.994 (std: 0.005)
Parameters: {'var_smoothing': 0.0058240255230948}

Model with rank: 1
Mean validation score: 0.994 (std: 0.005)
Parameters: {'var_smoothing': 0.004623293763277192}

Model with rank: 3
Mean validation score: 0.993 (std: 0.005)
Parameters: {'var_smoothing': 0.03142584337157728}

Model with rank: 3
Mean validation score: 0.993 (std: 0.005)
Parameters: {'var_smoothing': 0.0394842325395619}

Model with rank: 3
Mean validation score: 0.993 (std: 0.005)
Parameters: {'var_smoothing': 0.019793138660693432}

Score with test data 0.9862068965517241


In [0]:
# ---------------------> Neural Network

param = NN(x_train,y_train)
nn_c = MLPClassifier().set_params(**param)
nn_c.fit(x_train, y_train)

print("Score with test data",nn_c.score(x_test, y_test))

# Did not converge



In [0]:

## --------------------> SVM

param = SVM(x_train,y_train)
svm_c = svm.SVC().set_params(**param)
svm_c.fit(x_train, y_train)

print("Score with test data",svm_c.score(x_test, y_test))

NameError: ignored