In [32]:
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [2]:
# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it produces the same numbers on every run:
np.random.seed(1)

def load_data(filename, skiprows = 1):
    """
    Read a space-delimited numeric text file into a numpy ndarray.

    Inputs:
        filename: path of the file to read, given as a string.
        skiprows: number of leading lines to skip before parsing
            (defaults to 1).

    Outputs:
        The parsed contents of the file, returned as a numpy ndarray.
    """
    parsed = np.loadtxt(filename, delimiter=' ', skiprows=skiprows)
    return parsed

In [3]:
# Load the training table, then split it: column 0 becomes the target vector y
# and the remaining columns become the feature matrix X. Both slices on the
# right-hand side are taken from the full table before X is rebound.
X = load_data('training_data.txt')
y, X = X[:, 0], X[:, 1:]

In [33]:
# Base learners for the ensemble. Every model gets an explicit random_state so
# that a fit does not depend on the state of NumPy's global random number
# generator (and therefore not on the order in which cells were executed).
clf1 = LogisticRegression(random_state=1)
clf2 = ExtraTreesClassifier(random_state=1, criterion='entropy',
                            max_depth=None, max_features=10,
                            min_samples_leaf=2, min_samples_split=10,
                            n_estimators=400)
# probability=True is needed so the SVC exposes predict_proba for soft voting.
# Those probability estimates come from an internal, randomised
# cross-validation, so the SVC is seeded like the other two models.
clf3 = SVC(kernel='rbf', C=10, gamma=0.001, probability=True, random_state=1)
# Soft voting: the ensemble predicts from the averaged class probabilities of
# the three base learners.
# NOTE: the key 'rf' is kept unchanged for compatibility even though clf2 is an
# ExtraTreesClassifier, not a random forest.
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')

In [None]:
# Report the 5-fold cross-validated accuracy of each base model and of the
# voting ensemble: mean and standard deviation over the folds.
for label, clf in (('Logistic Regression', clf1),
                   ('Extra Forest', clf2),
                   ('SVC', clf3),
                   ('Ensemble', eclf)):
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (np.mean(scores), np.std(scores), label))

Accuracy: 0.84 (+/- 0.00) [Logistic Regression]
Accuracy: 0.85 (+/- 0.01) [Extra Forest]
Accuracy: 0.85 (+/- 0.01) [SVC]


In [27]:
# Fit the voting ensemble on all of the training data (X, y).
eclf.fit(X,y)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFore...='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [28]:
# Load the test data. The whole array is used as the feature matrix: unlike
# the training data, no column is split off here.
X_test = load_data('test_data.txt')

In [29]:
# Predict a label for every row of X_test with the fitted ensemble.
pred = eclf.predict(X_test)

In [30]:
def format_pred(pred, filename="result.txt"):
    """
    Write predictions to a CSV submission file.

    The file starts with the header line "Id,Prediction", followed by one
    "<id>,<prediction>" line per element of pred. Ids count up from 1 in the
    order of pred.

    Inputs:
        pred: iterable of predicted labels; each one is converted with int()
            before being written.
        filename: path of the file to write, given as a string. Defaults to
            "result.txt". An existing file at that path is overwritten.

    Outputs:
        None. The result is the file written to filename.
    """
    # Convert everything up front so that a label that cannot be converted
    # raises before the output file is created or truncated.
    labels = [int(value) for value in pred]
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        # Ids are 1-based, hence start=1.
        for row_id, label in enumerate(labels, start=1):
            f.write("%d,%d\n" % (row_id, label))

In [31]:
# Write the predictions to disk in "Id,Prediction" format.
format_pred(pred)