In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20; kept as a fallback for old installs.
    from sklearn.cross_validation import train_test_split

%matplotlib inline

In [2]:
# Load the training data: a space-separated file without a header row whose three
# columns are named doc_id, feature_index and tf-idf.
data_train = pd.read_table(
    'training.txt',
    sep=' ',
    header=None,
    names=['doc_id', 'feature_index', 'tf-idf'],
)

In [3]:
# Largest doc_id present in the training data. Later cells use it both as the offset
# added to the test doc_ids and as the boundary between training and test rows.
# NOTE(review): this equals the number of training documents only if the ids are
# contiguous and start at 1 - confirm against the data set description.
len_data_train = data_train.loc[:, 'doc_id'].max()
len_data_train

1842

In [4]:
# Load the test data; same layout as the training file (space-separated, no header,
# columns doc_id / feature_index / tf-idf).
data_test = pd.read_table(
    'testing.txt',
    sep=' ',
    header=None,
    names=['doc_id', 'feature_index', 'tf-idf'],
)

In [5]:
# Shift every test doc_id past the largest training doc_id so the two id ranges do not
# collide once the frames are stacked.
# NOTE: this overwrites data_test in place, so running the cell a second time without
# reloading data_test shifts the ids again.
data_test['doc_id'] = data_test['doc_id'] + len_data_train

In [6]:
# Stack the training rows and the (id-shifted) test rows into one long table with a
# fresh 0..n-1 row index. pd.concat replaces DataFrame.append, which is deprecated and
# no longer exists in pandas 2.x; concat builds a new frame, so data_train and
# data_test are left untouched without explicit .copy() calls.
merged_train_test_data = pd.concat([data_train, data_test], ignore_index=True)
merged_train_test_data # max(merged_train_test_data['doc_id']) = 2794 (1842 + 952)

Unnamed: 0,doc_id,feature_index,tf-idf
0,1,6,1.00
1,1,160,31.00
2,1,438,1.00
3,1,479,1.00
4,1,618,1.00
5,1,958,1.00
6,1,1195,52.00
7,1,1228,0.89
8,1,1516,1.00
9,1,1551,1.00


In [7]:
# Reshape the long (doc_id, feature_index, tf-idf) table into a dense document x feature
# matrix: one row per doc_id, one column per feature_index. Pairs that never occur in
# the input come out of the pivot as NaN and are replaced by 0.0.
# All pivot arguments are passed by keyword because recent pandas releases no longer
# accept `index` positionally.
# Pivoting train and test together gives both parts the same set of feature columns.
merged_doc_feature_train_test = (
    merged_train_test_data
    .pivot(index='doc_id', columns='feature_index', values='tf-idf')
    .fillna(0.0)
)
merged_doc_feature_train_test

feature_index,2,3,4,5,6,8,10,12,15,17,...,26350,26351,26352,26354,26355,26356,26357,26360,26362,26364
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Training rows are the documents whose doc_id does not exceed the largest training id.
# Selecting by doc_id label (the pivot's index) rather than by row position keeps test
# documents out of the training matrix even when some training doc_id has no row in
# the pivot (e.g. a document with no features).
doc_feature_df_train = merged_doc_feature_train_test.loc[
    merged_doc_feature_train_test.index <= len_data_train
]
doc_feature_df_train

feature_index,2,3,4,5,6,8,10,12,15,17,...,26350,26351,26352,26354,26355,26356,26357,26360,26362,26364
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Test rows are the documents whose (shifted) doc_id lies above the largest training id.
# Selecting by doc_id label (the pivot's index) rather than by row position stays
# correct even when some training doc_id has no row in the pivot.
doc_feature_df_test = merged_doc_feature_train_test.loc[
    merged_doc_feature_train_test.index > len_data_train
]
doc_feature_df_test

feature_index,2,3,4,5,6,8,10,12,15,17,...,26350,26351,26352,26354,26355,26356,26357,26360,26362,26364
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
1846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Load the training labels from a headerless file into a single column named 'label'.
# NOTE(review): presumably one label per training document, in doc_id order - confirm.
doc_class_df_train = pd.read_table(
    'label_training.txt',
    sep=' ',
    header=None,
    names=['label'],
)

In [11]:
# Split the labelled training documents into a fitting part and a held-out part;
# random_state=1 makes the split reproducible.
# X_test / y_test are this held-out slice of the *training* data, not the unlabelled
# documents stored in doc_feature_df_test.
X_train, X_test, y_train, y_test = train_test_split(
    doc_feature_df_train,
    doc_class_df_train,
    random_state=1,
)

In [12]:
# Fit a Gaussian naive Bayes classifier on the fitting split.
clf = GaussianNB()
print('_' * 80)
print("Training: ")
print(clf)
clf.fit(X_train, y_train['label'])

# Measure accuracy on the held-out split of the labelled data.
y_pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test['label'], y_pred)
print("type:   %s" % type(y_pred))
print("accuracy:   %0.6f" % score)

________________________________________________________________________________
Training: 
GaussianNB()
type:   <class 'numpy.ndarray'>
accuracy:   0.980477


In [13]:
# Predict a label for every row of the test feature matrix, then show how many
# predictions were produced.
y_test_pred = clf.predict(doc_feature_df_test)
y_test_pred.shape[0]

952

In [14]:
# Display the predicted labels for the test documents.
y_test_pred

array([ 1, -1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,
        1,  1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,
        1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,
        1,  1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1,
       -1, -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1,
        1, -1, -1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1, -1, -1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,
        1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1

In [15]:
# Write the predictions to a text file, one integer label per line.
# For a 1-D array every row holds a single value, so the row separator (`newline`) is
# the only separator that ends up in the file.
np.savetxt('720004946-1.txt', y_test_pred, fmt='%d', newline='\n')