In [1]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize,scale,LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
X_train=pd.read_table('data\X_train.txt', sep='\s+',header=None)
X_test=pd.read_table('data\X_test.txt', sep='\s+',header=None)
Y_train=pd.read_table('data\Y_train.txt',header=None)
Y_test=pd.read_table('data\Y_test.txt',header=None)

In [3]:
## Check for missing values
print(X_train.isnull().any().any())
print(Y_train.isnull().any())
print(X_test.isnull().any().any())
print(Y_test.isnull().any())

False
0    False
dtype: bool
False
0    False
dtype: bool


In [4]:
## Normalize and scale the data
X_train=normalize(X_train)
X_train=scale(X_train)
X_test=normalize(X_test)
X_test=scale(X_test)

In [5]:
## Label encode the target columns
le = LabelEncoder()
Y_train = le.fit_transform(Y_train.values.ravel())
Y_test=le.transform(Y_test.values.ravel())

In [6]:
def hyperopt_train_test(params):
    t = params['type']
    del params['type']
    if t == 'naive_bayes':
        clf = BernoulliNB(**params)
    elif t == 'svm':
        clf = SVC(**params)
    elif t == 'dtree':
        clf = DecisionTreeClassifier(**params)
    elif t == 'knn':
        clf = KNeighborsClassifier(**params)
    else:
        return 0
    return cross_val_score(clf, X_train, Y_train).mean()

In [7]:
space = hp.choice('classifier_type', [
    {
        'type': 'naive_bayes',
        'alpha': hp.uniform('alpha', 0.0, 2.0)
    },
    {
        'type': 'svm',
        'C': hp.uniform('C', 0, 10.0),
        'kernel': hp.choice('kernel', ['linear', 'rbf']),
        'gamma': hp.uniform('gamma', 0, 20.0)
    },
    {
        'type': 'randomforest',
        'max_depth': hp.choice('max_depth', range(1,20)),
        'max_features': hp.choice('max_features', range(1,5)),
        'n_estimators': hp.choice('n_estimators', range(1,20)),
        'criterion': hp.choice('criterion', ["gini", "entropy"]),
        'scale': hp.choice('scale', [0, 1]),
        'normalize': hp.choice('normalize', [0, 1])
    },
    {
        'type': 'knn',
        'n_neighbors': hp.choice('knn_n_neighbors', range(1,50))
    }
])

In [8]:
count = 0
best = 0
def f(params):
    global best, count
    count += 1
    acc = hyperopt_train_test(params.copy())
    if acc > best:
        print('new best:', acc, 'using', params['type'])
        best = acc
    if count % 10 == 0:
        print('iters:', count, ', acc:', acc, 'using', params)
    return {'loss': -acc, 'status': STATUS_OK}

In [9]:
trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=50, trials=trials)
print('best:')
print(best)

new best: 0.885204643757 using knn
iters: 10 , acc: 0.881941556757 using {'type': 'knn', 'n_neighbors': 44}
new best: 0.88982983079 using knn
new best: 0.93798379874 using svm
iters: 20 , acc: 0.882213443652 using {'type': 'knn', 'n_neighbors': 38}
new best: 0.939344788124 using svm
new best: 0.940296836516 using svm
iters: 30 , acc: 0.939752840596 using {'gamma': 2.9277750837102703, 'C': 1.3175677714196605, 'type': 'svm', 'kernel': 'linear'}
iters: 40 , acc: 0.937576412657 using {'gamma': 2.1122991379370877, 'C': 3.417073639309813, 'type': 'svm', 'kernel': 'linear'}
iters: 50 , acc: 0.884252595365 using {'type': 'knn', 'n_neighbors': 12}
best:
{'gamma': 0.1177375984714764, 'classifier_type': 1, 'C': 1.2932547321619117, 'kernel': 0}


In [10]:
t=best['classifier_type']
del best['classifier_type']

In [11]:
params = {key.split('_', 1)[-1]: item for key, item in best.items()}
print(params)

{'gamma': 0.1177375984714764, 'C': 1.2932547321619117, 'kernel': 0}


In [12]:
if t == 0:
    clf = BernoulliNB(**params)
elif t == 1:
    p=params['kernel']
    if(p==0):
        params['kernel']='linear'
    else:
        params['kernel']='rbf'
    clf = SVC(**params)
elif t == 2:
    p=params['criterion']
    if(p==0):
        params['criterion']='gini'
    else:
        params['criterion']='entropy'
    clf = DecisionTreeClassifier(**params)
elif t == 3:
    clf = KNeighborsClassifier(**params)

In [13]:
clf

SVC(C=1.2932547321619117, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1177375984714764,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
clf.fit(X_train,Y_train)
ypred=clf.predict(X_test)

In [15]:
accuracy_score(Y_test, ypred)

0.96572785883949774