In [4]:
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

%matplotlib inline

Try a range of parameters for several common machine learning methods to select the best parameters by validation accuracy.

In [21]:
def set_param(clf_id, clf, v):
    """Write the tuning value ``v`` onto the attribute swept for ``clf_id``.

    Mapping from classifier ID to the attribute that is assigned:
      * 1, 2, 4, 5 -> ``alpha``
      * 3, 6       -> ``C``
      * 7          -> ``min_samples_leaf``
      * 8          -> ``n_neighbors``
    Any other ID leaves ``clf`` untouched.

    The object is modified in place and the same object is returned.
    """
    tuned_attribute = (
        ((1, 2, 4, 5), 'alpha'),
        ((3, 6), 'C'),
        ((7,), 'min_samples_leaf'),
        ((8,), 'n_neighbors'),
    )
    for ids, attr_name in tuned_attribute:
        if clf_id in ids:
            setattr(clf, attr_name, v)
            break
    return clf

R = 100  # arbitrary random state, passed to every estimator that accepts one

# Candidate estimators keyed by an integer ID. The trailing comment names the
# attribute that set_param() sweeps for that ID; for IDs 10-13 it is the
# author's note on a default that is left untouched.
# These instances are shared: set_param() mutates them in place, so later
# cells see whatever value was assigned last.
CLASSIFIERS = {
    0: GaussianNB(),
    1: MultinomialNB(),  # alpha
    2: Lasso(),  # alpha (a regressor, not a classifier)
    3: LogisticRegression(random_state=R),  # C
    # NOTE(review): Perceptron's alpha only matters when a penalty is set;
    # with the default penalty this sweep may be a no-op -- confirm.
    4: Perceptron(random_state=R),  # alpha
    5: RidgeClassifier(),  # alpha
    6: SVC(kernel='linear'),  # C
    7: DecisionTreeClassifier(random_state=R),  # min_samples_leaf
    8: KNeighborsClassifier(),  # n_neighbors
    9: NearestCentroid(),
    10: AdaBoostClassifier(random_state=R),  # n_estimators 50
    # The three ensembles below were previously unseeded, which made their
    # scores change from run to run.
    11: BaggingClassifier(random_state=R),  # n_estimators 10
    12: GradientBoostingClassifier(random_state=R),  # learning_rate 0.1
    13: RandomForestClassifier(random_state=R),  # n_estimators 10
    14: LDA(),
    15: QDA()
    }

# Powers of ten from 1e-3 to 1e2; used as the grid for both C and alpha.
C_VALUES = np.power(float(10), range(-3, 3))

# Values tried per classifier ID. A single-element [0] means "nothing to
# sweep": set_param() ignores those IDs, so the estimator is fitted once
# with its constructor settings.
PARAMS = {
    0: [0],
    1: C_VALUES,
    2: C_VALUES,
    3: C_VALUES,
    4: C_VALUES,
    5: C_VALUES,
    6: C_VALUES,
    # Materialised as lists so indexing behaves the same on Python 2 and 3.
    7: list(range(4, 30, 2)),
    8: list(range(3, 16, 2)),
    9: [0],
    10: [0],
    11: [0],
    12: [0],
    13: [0],
    14: [0],
    15: [0],
    }

Load training data and try different feature representations.

In [7]:
# training data
# The file is '|'-separated; its first line is skipped as a header and the
# last field of every remaining line is the label.
M = 1000 # number of features
results = []
with open('training_data.txt') as inputfile:
    for line in inputfile:
        stripped = line.strip()
        if stripped:  # ignore blank lines (e.g. a trailing newline at EOF)
            results.append(stripped.split('|'))
N_tr = len(results)-1 # number of training examples
# Explicit lists instead of map(): on Python 3 map() returns an iterator and
# np.array() would build an object array of iterators rather than numbers.
X_train = np.array([[float(tok) for tok in row[:-1]] for row in results[1:]])
Y_tr = np.array([float(row[-1]) for row in results[1:]])

# feature processing
# X_fea[0]    : raw features
# X_fea[1..5] : LDA topic features with 5/10/20/50/100 components, standardised
# X_fea[6]    : tf-idf features
X_fea = {0: X_train}
for fea_id, n_topics in enumerate((5, 10, 20, 50, 100), start=1):
    lda = LatentDirichletAllocation(random_state=R, n_components=n_topics).fit(X_train, Y_tr)
    X_fea[fea_id] = lda.transform(X_train)
tfidf = TfidfTransformer(norm='l2', sublinear_tf=True)
# toarray() gives a plain ndarray; todense() gives np.matrix, which newer
# scikit-learn releases refuse as estimator input.
X_fea[6] = tfidf.fit_transform(X_train).toarray()
scaled = StandardScaler()
for i in range(1, 6):
    # NOTE(review): the scaler (and LDA above) is fitted on all training rows,
    # including those later held out for validation.
    X_fea[i] = scaled.fit_transform(X_fea[i])

Try a few simple classifiers with different feature representations.

In [None]:
VAL = 0.3  # share of the training rows held out for validation

# best_score[fea_id] lists, in the order of the classifier IDs below, the
# highest validation accuracy reached over that classifier's parameter grid.
best_score = {}
for fea_id, X_tr in X_fea.items():
    # training and validation split
    Xr, Xv, Yr, Yv = train_test_split(X_tr, Y_tr, test_size=VAL, random_state=R)

    best_score[fea_id] = []
    for clf_id in [3, 6, 7, 4, 8]:
        # The estimator object is shared via CLASSIFIERS and mutated in place.
        clf = CLASSIFIERS[clf_id]
        score = []
        for v in PARAMS[clf_id]:
            clf = set_param(clf_id, clf, v)
            clf.fit(Xr, Yr)
            hits = clf.predict(Xv) == Yv
            score.append(np.mean(hits))
        best_score[fea_id].append(max(score))

# One string of ' & x.xxx' cells per classifier, with the feature
# representations laid out in the column order 1, 2, 3, 4, 5, 6, 0.
lines = {}
for classifier in range(5):
    cells = []
    for x in [1, 2, 3, 4, 5, 6, 0]:
        cells.append(' & %.3f' % (best_score[x][classifier]))
    lines[classifier] = ''.join(cells)

After finding that the tf-idf representation provides better scores than the LDA representation, proceed with using the tf-idf representation to train and test all the individual classifiers.

In [None]:
# Sweep every classifier on the tf-idf features (X_fea[6]) and keep, per
# classifier ID, the parameter value with the highest validation accuracy.
X_tr = X_fea[6]
Xr, Xv, Yr, Yv = train_test_split(X_tr, Y_tr, test_size=VAL, random_state=R)

best_output = {}   # clf_id -> validation predictions at the best parameter
best_param = {}    # clf_id -> best parameter value from PARAMS[clf_id]
best_err_val = {}  # clf_id -> validation accuracy (an accuracy despite the name)
best_err_tr = {}   # clf_id -> training accuracy at the best parameter
for clf_id in sorted(CLASSIFIERS):  # fixed order, independent of dict ordering
    clf = CLASSIFIERS[clf_id]
    score = []
    clf_output = []
    err_tr = []
    for v in PARAMS[clf_id]:
        clf = set_param(clf_id, clf, v)
        clf.fit(Xr, Yr)
        # Predict on the validation set once and reuse the result for both
        # the stored output and the score.
        val_pred = clf.predict(Xv)
        clf_output.append(val_pred)
        score.append(np.mean(val_pred == Yv))
        err_tr.append(np.mean(clf.predict(Xr) == Yr))
    ind = score.index(max(score))  # first grid point reaching the best score
    best_output[clf_id] = clf_output[ind]
    best_param[clf_id] = PARAMS[clf_id][ind]
    best_err_tr[clf_id] = err_tr[ind]
    best_err_val[clf_id] = max(score)
    print(clf_id)

Test combinations of individual classifiers.

In [None]:
# ensemble methods
# Majority-vote search over subsets of the individual classifiers' validation
# predictions. The comparison against n_votes/2 assumes labels are coded
# 0/1 -- TODO confirm against the data.

# Classifier ID 2 is left out of the vote pool. ens_clf_ids[row] is the
# classifier ID whose predictions sit in row `row` of P, so that row numbers
# can be mapped back to CLASSIFIERS keys.
ens_clf_ids = [cid for cid in sorted(best_output) if cid != 2]
P = np.array([best_output[cid] for cid in ens_clf_ids])

N_CLASSIFIERS = len(CLASSIFIERS)
# Odd subset sizes only, so a vote can never tie.
N_VOTES = range(3, len(ens_clf_ids) + 1, 2)

ens_best_param = {}  # n_votes -> tuple of classifier IDs with the best score
ens_best_score = {}  # n_votes -> best validation accuracy for that size
ens_err_train = {}

for n_votes in N_VOTES:
    combinations = list(itertools.combinations(range(len(ens_clf_ids)), n_votes))
    ens_best_score[n_votes] = 0
    for combo in combinations:
        o = P[list(combo), :].sum(axis=0) > float(n_votes)/2
        ens_score = np.mean(o == Yv)
        if ens_score > ens_best_score[n_votes]:
            ens_best_score[n_votes] = ens_score
            # Store classifier IDs, not row numbers of P: the two differ for
            # every row at or after the excluded ID.
            ens_best_param[n_votes] = tuple(ens_clf_ids[row] for row in combo)
    print(n_votes)

In [None]:
# bagging
# Training and validation accuracy of a bagging ensemble as the number of
# estimators grows. Seeded with R so the curve is reproducible between runs.
clf = BaggingClassifier(random_state=R)
bag_err_tr = []   # training accuracy per setting (an accuracy despite the name)
bag_err_val = []  # validation accuracy per setting
N_ESTIMATORS = range(5, 101, 5)
for v in N_ESTIMATORS:
    clf.n_estimators = v
    clf.fit(Xr, Yr)
    bag_err_val.append(np.mean(clf.predict(Xv) == Yv))
    bag_err_tr.append(np.mean(clf.predict(Xr) == Yr))
    print(v)

In [None]:
# Accuracy against ensemble size for the bagging sweep; the training curve is
# drawn first so it matches the first legend entry.
fig, ax = plt.subplots()
for curve in (bag_err_tr, bag_err_val):
    ax.plot(N_ESTIMATORS, curve)
ax.set_ylim([0.65, 1])
ax.legend(['Training', 'Validation'])
ax.set_xlabel('Number of Estimators')
ax.set_ylabel('Accuracy Rate')
plt.show()

Prepare a final classifier for the test data based on previous experiments. 

In [34]:
# test data
# Same '|'-separated layout as the training file; the first line is skipped
# as a header and every field of the remaining lines is read as a feature.
results = []
with open('test_data.txt') as inputfile:
    for line in inputfile:
        stripped = line.strip()
        if stripped:  # ignore blank lines (e.g. a trailing newline at EOF)
            results.append(stripped.split('|'))
N_ts = len(results)-1
# Explicit lists instead of map(): on Python 3 map() returns an iterator and
# np.array() would build an object array of iterators rather than numbers.
X_test = np.array([[float(tok) for tok in row] for row in results[1:]])

# feature processing
# The tf-idf weights are fitted on the full training set and reused for the
# test set. toarray() yields a plain ndarray rather than np.matrix.
tfidf = TfidfTransformer(norm='l2', sublinear_tf=True)
X_tr = tfidf.fit_transform(X_train).toarray()
X_ts = tfidf.transform(X_test).toarray()

# train final ensemble classifier
# Pick the subset size with the highest validation score (first one on ties).
best_n_votes = max(ens_best_score, key=ens_best_score.get)
# These values are used directly as keys into CLASSIFIERS and best_param, so
# ens_best_param must hold classifier IDs.
selected_classifiers = list(ens_best_param[best_n_votes])
final_output = []
for clf_id in selected_classifiers:
    clf = CLASSIFIERS[clf_id]
    clf = set_param(clf_id, clf, best_param[clf_id])
    clf.fit(X_tr, Y_tr)
    final_output.append(clf.predict(X_ts))
# Majority vote across the selected classifiers.
final_prediction = np.sum(np.array(final_output), axis=0) > float(best_n_votes)/2

# write predictions to text file
test_id = 0
with open('prediction.txt', 'w') as outputfile:
    outputfile.write('Id,Prediction\n')
    # Ids are 1-based and follow the row order of the test file.
    for test_id, item in enumerate(final_prediction, start=1):
        outputfile.write('%s,%.0f\n' % (test_id,item))