In [95]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [96]:
# Load the raw CSVs: training conversations, their category labels, and the
# unlabelled test conversations that predictions are generated for later.
raw_traininput = pd.read_csv('../data/train_input.csv')
raw_trainoutput = pd.read_csv('../data/train_output.csv')
raw_testinput = pd.read_csv('../data/test_input.csv')

# Row counts of the two input frames.
traininput_size, testinput_size = len(raw_traininput), len(raw_testinput)

In [97]:
import re

def clean_data(data):
    """Strip markup and noise tokens from the ``conversation`` column.

    Every match of the cleaning pattern is deleted from each text:
    ``<...>`` tags (non-greedy, not spanning newlines), newline characters,
    hyphens, an apostrophe with an optional following ``s``, and the literal
    substring ``com``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``conversation`` column of strings. Its index may
        be anything; rows are read in positional order.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``conversation``) holding the cleaned text in the
        same row order as ``data``, with a fresh default 0..n-1 index.
    """
    # NOTE(review): the bare `com` alternative also removes "com" inside
    # ordinary words (e.g. "comment" -> "ment") -- confirm this is intended.
    tag_regex = re.compile('<.*?>|\n|-|\'s?|com')

    # Iterate over the column's values rather than looking rows up by the
    # integer labels 0..n-1, which raises KeyError for any frame whose index
    # is not the default range (filtered, shuffled or re-indexed data).
    cleaned_texts = [tag_regex.sub('', text) for text in data['conversation'].tolist()]

    # A list (not a set) keeps the column specification explicit and ordered.
    return pd.DataFrame(cleaned_texts, columns=['conversation'])

In [98]:
from sklearn.svm import LinearSVC

# Experiment 1: word 1-2 gram counts -> TF-IDF -> linear SVM, evaluated on a
# held-out 20% of the training data.
clean_traininput = clean_data(raw_traininput)
print(clean_traininput.shape)

cvec = CountVectorizer(analyzer='word', ngram_range=(1, 2))

classification = Pipeline([
    ('vectorizer', cvec),
    ('transformer', TfidfTransformer()),
    ('classifier', LinearSVC()),
])

# A fixed random_state makes the 80/20 split reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_traininput['conversation'],
    raw_trainoutput['category'],
    test_size=0.2,
    random_state=105)

classification = classification.fit(train_data, train_labels)
predicted = classification.predict(test_data)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(np.mean(predicted == test_labels))  # fraction of correct predictions
print(metrics.classification_report(test_labels, predicted))
print(metrics.confusion_matrix(test_labels, predicted))

(165000, 1)
0.957333333333
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.99      4510
        nba       0.98      0.97      0.97      3696
       news       0.91      0.89      0.90      4256
        nfl       0.97      0.98      0.98      3978
   politics       0.92      0.93      0.93      3943
     soccer       0.98      0.98      0.98      4299
  worldnews       0.93      0.95      0.94      4148

avg / total       0.96      0.96      0.96     33000

[[4073   17   23    5   31    0   17    4]
 [   2 4462    1   16    2    9    2   16]
 [  40   17 3570   10   34    2   19    4]
 [   3   20    6 3772   17  222    3  213]
 [  30    7   26   15 3887    1    9    3]
 [   2    5    1  217    2 3654    3   59]
 [  11    7    8    8   13    4 4232   16]
 [   6   13    0  105    1   64   17 3942]]


In [99]:
# Experiment 2: same pipeline with word 1-3 grams.
# `LinearSVC` and `clean_traininput` are already in scope from the first
# experiment cell, so the import and the full re-cleaning of the training
# set are not repeated here.
print(clean_traininput.shape)

cvec = CountVectorizer(analyzer='word', ngram_range=(1, 3))

classification = Pipeline([
    ('vectorizer', cvec),
    ('transformer', TfidfTransformer()),
    ('classifier', LinearSVC()),
])

# A fixed random_state makes the 80/20 split reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_traininput['conversation'],
    raw_trainoutput['category'],
    test_size=0.2,
    random_state=105)

classification = classification.fit(train_data, train_labels)
predicted = classification.predict(test_data)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(np.mean(predicted == test_labels))  # fraction of correct predictions
print(metrics.classification_report(test_labels, predicted))
print(metrics.confusion_matrix(test_labels, predicted))

(165000, 1)
0.958060606061
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.98      4510
        nba       0.98      0.96      0.97      3696
       news       0.92      0.88      0.90      4256
        nfl       0.97      0.98      0.98      3978
   politics       0.93      0.93      0.93      3943
     soccer       0.98      0.98      0.98      4299
  worldnews       0.92      0.96      0.94      4148

avg / total       0.96      0.96      0.96     33000

[[4074   15   23    6   32    0   16    4]
 [   2 4465    1   12    2    9    1   18]
 [  47   19 3556    9   36    3   22    4]
 [   2   25    6 3764   18  214    3  224]
 [  24    7   21   16 3900    1    8    1]
 [   3    4    1  204    2 3671    3   55]
 [  15    9    8    9   11    6 4224   17]
 [   6   12    0   90    1   61   16 3962]]


In [100]:
# Experiment 3: word 1-3 grams with English stop words removed.
# `LinearSVC` and `clean_traininput` are already in scope from the first
# experiment cell, so the import and the full re-cleaning of the training
# set are not repeated here.
print(clean_traininput.shape)

cvec = CountVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english')

classification = Pipeline([
    ('vectorizer', cvec),
    ('transformer', TfidfTransformer()),
    ('classifier', LinearSVC()),
])

# A fixed random_state makes the 80/20 split reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_traininput['conversation'],
    raw_trainoutput['category'],
    test_size=0.2,
    random_state=105)

classification = classification.fit(train_data, train_labels)
predicted = classification.predict(test_data)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(np.mean(predicted == test_labels))  # fraction of correct predictions
print(metrics.classification_report(test_labels, predicted))
print(metrics.confusion_matrix(test_labels, predicted))

(165000, 1)
0.960545454545
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.99      4510
        nba       0.98      0.97      0.97      3696
       news       0.92      0.89      0.91      4256
        nfl       0.98      0.98      0.98      3978
   politics       0.93      0.94      0.93      3943
     soccer       0.99      0.98      0.99      4299
  worldnews       0.93      0.96      0.94      4148

avg / total       0.96      0.96      0.96     33000

[[4067   19   24    7   31    0   16    6]
 [   2 4471    1   10    2    5    3   16]
 [  41   16 3573    9   31    3   17    6]
 [   2   23    3 3792   18  218    1  199]
 [  23    6   23   13 3899    3    8    3]
 [   1    4    1  192    2 3698    1   44]
 [  11    8   10    6   12    2 4234   16]
 [   6   13    0   87    0   61   17 3964]]


In [101]:
# Final model: clean the test conversations, fit the 1-3 gram / stop-word
# pipeline on the complete training set (no hold-out), and predict the test set.
clean_testinput = clean_data(raw_testinput)
print(clean_testinput.shape)

cvec2 = CountVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english')

classification2 = Pipeline([
    ('vectorizer', cvec2),
    ('transformer', TfidfTransformer()),
    ('classifier', LinearSVC()),
])

classification2 = classification2.fit(
    clean_traininput['conversation'],
    raw_trainoutput['category'])
predicted2 = classification2.predict(clean_testinput['conversation'])

(53218, 1)


In [102]:
# Save the predicted categories to CSV; the row index is written as an `id`
# column next to `category`.
result = pd.DataFrame({'category': predicted2})
result.to_csv(
    '../data/test_predict.csv',
    index_label='id',
    header=True,
    index=True)

In [103]:
# Display the (rows, columns) shape of the cleaned test frame as the cell output.
clean_testinput.shape

(53218, 1)