In [8]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [9]:
# Training conversations and their category labels are stored in separate CSVs.
raw_traininput = pd.read_csv('../data/train_input.csv')
raw_trainoutput = pd.read_csv('../data/train_output.csv')

# Conversations to be classified (no labels are loaded for these).
raw_testinput = pd.read_csv('../data/test_input.csv')

# Row counts of the two input frames.
traininput_size, testinput_size = len(raw_traininput.index), len(raw_testinput.index)

In [10]:
import re

def clean_data(data):
    """Strip markup tags and newline characters from every conversation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'conversation' column of strings. Its index may be
        anything; rows are read by position, not by label.

    Returns
    -------
    pandas.DataFrame
        A new frame with a single 'conversation' column holding the cleaned
        text, in the same row order as ``data`` and with a fresh 0..n-1
        index. Other columns of ``data`` are not carried over.
    """
    # '<.*?>' removes the shortest '<...>' run on a single line ('.' does not
    # match a newline); the '\n' alternative removes the line breaks themselves.
    tag_pattern = re.compile('<.*?>|\n')

    # Iterate over the underlying values rather than data['conversation'][i]:
    # the latter is a label lookup and fails on a non-default index.
    cleaned_text = [tag_pattern.sub('', text) for text in data['conversation'].values]

    # Column labels are passed as a list; a set is unordered and is not an
    # accepted way to specify columns on newer pandas releases.
    return pd.DataFrame({'conversation': cleaned_text}, columns=['conversation'])

In [13]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# The raw frame is used as-is (this is an alias, not a copy); clean_data()
# is not applied here.
clean_traininput = raw_traininput
print(clean_traininput.shape)

# Token counts over word 1- to 3-grams with English stop words removed.
cvec = CountVectorizer(analyzer='word',
                       stop_words='english',
                       ngram_range=(1, 3))

# counts -> TF-IDF weighting -> linear SVM trained with hinge loss
classification = Pipeline([('vectorizer', cvec),
                           ('transformer', TfidfTransformer()),
                           ('classifier', LinearSVC(loss='hinge'))])

# Hold out 20% of the labelled rows for evaluation; the split is seeded so
# it is the same on every run.
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_traininput['conversation'],
    raw_trainoutput['category'],
    test_size=0.2,
    random_state=105)

classification = classification.fit(train_data, train_labels)
predicted = classification.predict(test_data)

# Single-argument print(...) produces the same output on Python 2 and, unlike
# the bare print statement, is also valid on Python 3.
print(np.mean(predicted == test_labels))  # fraction of held-out rows predicted correctly
print(metrics.classification_report(test_labels, predicted))
print(metrics.confusion_matrix(test_labels, predicted))

(165000, 2)
0.964
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.99      4510
        nba       0.99      0.97      0.98      3696
       news       0.93      0.90      0.92      4256
        nfl       0.98      0.98      0.98      3978
   politics       0.93      0.94      0.94      3943
     soccer       0.99      0.99      0.99      4299
  worldnews       0.94      0.96      0.95      4148

avg / total       0.96      0.96      0.96     33000

[[4085   15   19    5   25    1   16    4]
 [   2 4472    1   11    0    6    4   14]
 [  40   18 3573    8   32    3   21    1]
 [   3   21    3 3846   15  191    2  175]
 [  27    5   17    9 3906    4    8    2]
 [   2    1    1  179    1 3715    1   43]
 [  14    7    6    6    9    2 4242   13]
 [   3    7    0   86    0   67   12 3973]]


In [5]:
# The raw test frame is used as-is (alias, not a copy).
clean_testinput = raw_testinput
# Single-argument print(...) gives the same output on Python 2 and is also
# valid on Python 3, unlike the bare print statement.
print(clean_testinput.shape)

# Same feature extraction and classifier settings as the evaluation pipeline:
# word 1- to 3-gram counts without English stop words, TF-IDF weighting,
# then a hinge-loss linear SVM.
cvec2 = CountVectorizer(analyzer='word',
                        stop_words='english',
                        ngram_range=(1, 3))

classification2 = Pipeline([('vectorizer', cvec2),
                            ('transformer', TfidfTransformer()),
                            ('classifier', LinearSVC(loss='hinge'))])

# Fit on every labelled conversation (no hold-out), then label the test set.
classification2 = classification2.fit(clean_traininput['conversation'],
                                      raw_trainoutput['category'])
predicted2 = classification2.predict(clean_testinput['conversation'])

(53218, 2)


In [6]:
# Save the predicted categories. The frame's row index is written out as the
# 'id' column, so each id is the 0-based position of the test row.
# NOTE(review): assumes positions match the ids expected for test_input.csv -- confirm.
result = pd.DataFrame(data={'category': predicted2})
result.to_csv(
    '../data/test_predict.csv',
    header=True,
    index=True,
    index_label='id',
)

In [7]:
# Display the (rows, columns) shape of the test frame as the cell output.
clean_testinput.shape

(53218, 2)