In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics



In [2]:
# Training inputs and their labels are stored in two separate CSV files.
raw_traininput, raw_trainoutput = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_input.csv', '../data/train_output.csv')
)

# Unlabelled inputs that predictions are produced for later on.
raw_testinput = pd.read_csv('../data/test_input.csv')

# Row counts of the two input frames.
traininput_size = len(raw_traininput)
testinput_size = len(raw_testinput)

In [3]:
import re

def clean_data(data):
    """Strip markup tags and newline characters from every conversation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'conversation' column of strings. Any index is accepted;
        rows are processed in their positional order.

    Returns
    -------
    pandas.DataFrame
        A new frame with a single 'conversation' column and a fresh 0..n-1
        index. ``data`` itself is left untouched.
    """
    # Non-greedy match of anything between angle brackets, or a newline.
    tag_pattern = re.compile('<.*?>|\n')

    # Walk the column's values instead of looking rows up by the labels
    # 0..n-1, so frames with a non-default index (e.g. after filtering or a
    # train/test split) are handled correctly.
    cleaned = [tag_pattern.sub('', text) for text in data['conversation']]

    # Name the column through a dict key: a set is unordered and is not a
    # valid way to pass column labels.
    return pd.DataFrame({'conversation': cleaned})

In [4]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
# train_test_split is taken from model_selection (the same module that
# provides GridSearchCV) rather than the deprecated sklearn.cross_validation.
from sklearn.model_selection import GridSearchCV, train_test_split

# Candidate values of the SVM regularisation parameter C for the grid search.
tuned_parameters = [{'C': [1, 2, 3, 4, 5, 6]}]

# NOTE(review): the raw frame is used as-is here; clean_data() is not
# applied -- confirm that skipping the cleaning step is intentional.
clean_traininput = raw_traininput
print(clean_traininput.shape)

# Bag of word 1- to 5-grams with English stop words removed.
cvec = CountVectorizer(analyzer='word',
                       stop_words='english',
                       ngram_range=(1, 5))

# The grid search wraps only the classifier step, so the vectorizer and the
# TF-IDF transformer are fitted once on the whole training split and the
# 3-fold search over C runs on the already-transformed features.
classification = Pipeline([('vectorizer', cvec),
                           ('transformer', TfidfTransformer()),
                           ('classifier', GridSearchCV(LinearSVC(loss='hinge'),
                                                       tuned_parameters, cv=3,
                                                       scoring='precision_macro'))])

# Hold out 20% of the labelled data for evaluation; the split is seeded.
# Inputs and labels are paired purely by row position.
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_traininput['conversation'],
    raw_trainoutput['category'],
    test_size=0.2,
    random_state=105)

classification = classification.fit(train_data, train_labels)

predicted = classification.predict(test_data)
print("Detailed classification report:")
# Overall accuracy on the held-out split, then per-class metrics and the
# confusion matrix.
print(np.mean(predicted == test_labels))
print(metrics.classification_report(test_labels, predicted))
print(metrics.confusion_matrix(test_labels, predicted))

(165000, 2)
Detailed classification report:
0.966393939394
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.99      4510
        nba       0.99      0.97      0.98      3696
       news       0.94      0.91      0.92      4256
        nfl       0.98      0.98      0.98      3978
   politics       0.93      0.95      0.94      3943
     soccer       0.99      0.99      0.99      4299
  worldnews       0.95      0.96      0.95      4148

avg / total       0.97      0.97      0.97     33000

[[4085   15   24    3   25    1   14    3]
 [   6 4473    1   11    1    6    3    9]
 [  40   18 3576    8   31    3   18    2]
 [   1   17    3 3874   13  183    2  163]
 [  28    5   18    7 3910    3    6    1]
 [   2    1    1  150    1 3749    1   38]
 [  16    5    5    4    8    2 4248   11]
 [   2    8    0   85    0   67   10 3976]]


In [6]:
# Report what the grid search (the pipeline's 'classifier' step) found.
print("Best parameters set found on development set:")
grid_search = classification.named_steps['classifier']
print(grid_search.best_params_)

print("Grid scores on development set:")
cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    # The spread is shown as twice the standard deviation of the test score.
    print("%0.5f (+/-%0.03f) for %r" % (mean, 2 * std, params))

Best parameters set found on development set:
{'C': 2}
Grid scores on development set:
0.95645 (+/-0.001) for {'C': 1}
0.95875 (+/-0.001) for {'C': 2}
0.95867 (+/-0.001) for {'C': 3}
0.95867 (+/-0.001) for {'C': 4}
0.95867 (+/-0.001) for {'C': 5}
0.95866 (+/-0.001) for {'C': 6}


In [5]:
# The raw test frame is used directly (no cleaning step is applied).
clean_testinput = raw_testinput
print(clean_testinput.shape)

# Bag of word 1- to 3-grams with English stop words removed.
cvec2 = CountVectorizer(analyzer='word',
                        stop_words='english',
                        ngram_range=(1, 3))

# NOTE(review): this final model uses LinearSVC's default C and 1-3 grams,
# whereas the grid search in this notebook tunes C over 1..6 on 1-5 grams --
# confirm that the difference is intentional.
classification2 = Pipeline([('vectorizer', cvec2),
                            ('transformer', TfidfTransformer()),
                            ('classifier', LinearSVC(loss='hinge'))])

# Fit on every labelled training row (no hold-out), then label the test set.
classification2 = classification2.fit(clean_traininput['conversation'],
                                      raw_trainoutput['category'])
predicted2 = classification2.predict(clean_testinput['conversation'])

(53218, 2)


In [6]:
# Put the predicted labels in a one-column frame and write it out; the
# frame's row index is written as the 'id' column next to 'category'.
result = pd.DataFrame(data={'category': predicted2})
result.to_csv(
    '../data/test_predict.csv',
    index_label='id',
    header=True,
    index=True,
)

In [7]:
# Sanity check: display the (rows, columns) shape of the test frame.
clean_testinput.shape

(53218, 2)