In [33]:
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, cross_val_score

import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
data_train = pd.read_table('training.txt', sep = ' ', header = None, names = ['doc_id','feature_index','tf-idf'])

In [35]:
len_data_train = data_train['doc_id'].max()
len_data_train

1842

In [36]:
data_test = pd.read_table('testing.txt', sep = ' ', header = None, names = ['doc_id','feature_index','tf-idf'])

In [37]:
data_test['doc_id'] = data_test['doc_id'].map(lambda x : x + len_data_train)

In [38]:
merged_train_test_data = data_train.copy()
merged_train_test_data = merged_train_test_data.append(data_test.copy(), ignore_index=True)
#merged_train_test_data # max(merged_train_test_data['doc_id']) = 2794 (1842 + 952)

In [39]:
merged_doc_feature_train_test = merged_train_test_data.pivot('doc_id', columns = 'feature_index', values = 'tf-idf')
merged_doc_feature_train_test = merged_doc_feature_train_test.fillna(0.0)
#merged_doc_feature_train_test

In [40]:
doc_feature_df_train = merged_doc_feature_train_test[0:len_data_train]
#doc_feature_df_train

In [41]:
doc_feature_df_test = merged_doc_feature_train_test[len_data_train:]
#doc_feature_df_test

In [42]:
doc_class_df_train = pd.read_table('label_training.txt', sep = ' ', header = None, names = ['label'])

In [43]:
skf = StratifiedKFold(doc_class_df_train.label, n_folds=10, random_state=123)

In [44]:
def benchmark(clf, clf_name):
    print('_' * 80)
    print("Training: ")
    print(clf_name)
    
    scores = cross_val_score(estimator=clf, 
                             X=doc_feature_df_train, 
                             y=doc_class_df_train.label,
                             cv=skf,
                             scoring='accuracy')
    print('Accuracy: %0.6f (+/- %0.6f) [%s]' %(scores.mean(), scores.std(), clf))

In [45]:
svm = LinearSVC(loss='squared_hinge', penalty='l1', dual=False, tol=1e-3)
benchmark(svm, 'LinearSVC with L1-based feature selection')

________________________________________________________________________________
Training: 
LinearSVC with L1-based feature selection
Accuracy: 0.974521 (+/- 0.012325) [LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
     verbose=0)]


In [46]:
sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")
benchmark(sgd, 'SGD Classifier with Elastic-Net penalty')

________________________________________________________________________________
Training: 
SGD Classifier with Elastic-Net penalty
Accuracy: 0.822541 (+/- 0.091048) [SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)]


In [50]:
log = LogisticRegression()
benchmark(log, 'Logistic Regression')

________________________________________________________________________________
Training: 
Logistic Regression
Accuracy: 0.986454 (+/- 0.007329) [LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)]


In [47]:
clf_final = LinearSVC(loss='squared_hinge', penalty='l1', dual=False, tol=1e-3)
clf_final.fit(doc_feature_df_train, doc_class_df_train.label)
y_test_pred = clf_final.predict(doc_feature_df_test)

In [49]:
np.savetxt('720004946-2.txt', y_test_pred, delimiter='\n', fmt='%d')