In [21]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, cross_val_score, cross_val_predict

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
data_train = pd.read_table('training.txt', sep = ' ', header = None, names = ['doc_id','feature_index','tf-idf'])

In [4]:
len_data_train = data_train['doc_id'].max()
len_data_train

1842

In [5]:
data_test = pd.read_table('testing.txt', sep = ' ', header = None, names = ['doc_id','feature_index','tf-idf'])

In [6]:
data_test['doc_id'] = data_test['doc_id'].map(lambda x : x + len_data_train)

In [7]:
merged_train_test_data = data_train.copy()
merged_train_test_data = merged_train_test_data.append(data_test.copy(), ignore_index=True)
#merged_train_test_data # max(merged_train_test_data['doc_id']) = 2794 (1842 + 952)

In [8]:
merged_doc_feature_train_test = merged_train_test_data.pivot('doc_id', columns = 'feature_index', values = 'tf-idf')
merged_doc_feature_train_test = merged_doc_feature_train_test.fillna(0.0)
#merged_doc_feature_train_test

In [9]:
doc_feature_df_train = merged_doc_feature_train_test[0:len_data_train]
#doc_feature_df_train

In [10]:
doc_feature_df_test = merged_doc_feature_train_test[len_data_train:]
#doc_feature_df_test

In [11]:
doc_class_df_train = pd.read_table('label_training.txt', sep = ' ', header = None, names = ['label'])

In [12]:
skf = StratifiedKFold(doc_class_df_train.label, n_folds=10, random_state=123)

In [13]:
def benchmark(clf, clf_name):
    print('_' * 80)
    print("Training: ")
    print(clf_name)
    
    scores = cross_val_score(estimator=clf, 
                             X=doc_feature_df_train, 
                             y=doc_class_df_train.label,
                             cv=skf,
                             scoring='accuracy')
    print('Accuracy: %0.6f (+/- %0.6f) [%s]' %(scores.mean(), scores.std(), clf))

In [14]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=123)
bnb = BernoulliNB(alpha=.089)

In [15]:
# BaggingClassifier with BaseEstimators of 50 BernoulliNB
bag = BaggingClassifier(base_estimator=bnb,
                        n_estimators=50, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        n_jobs=1)
benchmark(bag, 'Bagging')

________________________________________________________________________________
Training: 
Bagging
Accuracy: 0.994568 (+/- 0.004875) [BaggingClassifier(base_estimator=BernoulliNB(alpha=0.089, binarize=0.0, class_prior=None, fit_prior=True),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)]


In [16]:
# AdaBoostClassifier with BaseEstimators of 50 DecisionTreeClassifiers
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=50, 
                         learning_rate=0.1)
benchmark(ada, 'AdaBoost')

________________________________________________________________________________
Training: 
AdaBoost
Accuracy: 0.972347 (+/- 0.013751) [AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=123, splitter='best'),
          learning_rate=0.1, n_estimators=50, random_state=None)]


In [17]:
# RandomForestClassifier with BaseEstimators of 50 DecisionTreeClassifiers
rnf = RandomForestClassifier(n_estimators=50)
benchmark(rnf, 'RandomForest')

________________________________________________________________________________
Training: 
RandomForest
Accuracy: 0.978299 (+/- 0.012833) [RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)]


In [25]:
y_pred = cross_val_predict(estimator=ada, 
                             X=doc_feature_df_train, 
                             y=doc_class_df_train.label,
                             cv=skf, n_jobs=1)

print(metrics.classification_report(doc_class_df_train.label, y_pred))

             precision    recall  f1-score   support

         -1       0.98      0.94      0.96       655
          1       0.97      0.99      0.98      1187

avg / total       0.97      0.97      0.97      1842



In [26]:
y_pred = cross_val_predict(estimator=rnf, 
                             X=doc_feature_df_train, 
                             y=doc_class_df_train.label,
                             cv=skf, n_jobs=1)

print(metrics.classification_report(doc_class_df_train.label, y_pred))

             precision    recall  f1-score   support

         -1       1.00      0.94      0.97       655
          1       0.97      1.00      0.98      1187

avg / total       0.98      0.98      0.98      1842



In [22]:
y_pred = cross_val_predict(estimator=bag, 
                             X=doc_feature_df_train, 
                             y=doc_class_df_train.label,
                             cv=skf, n_jobs=1)

In [24]:
print(metrics.classification_report(doc_class_df_train.label, y_pred))

             precision    recall  f1-score   support

         -1       1.00      0.99      0.99       655
          1       0.99      1.00      1.00      1187

avg / total       1.00      1.00      1.00      1842



In [19]:
np.savetxt('720004946-3.txt', y_test_pred, delimiter='\n', fmt='%d')