In [6]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [7]:
# Load the three competition CSVs in a fixed order: training inputs,
# training outputs (the `category` column is used as the label further down),
# and the test inputs.
raw_traininput, raw_trainoutput, raw_testinput = map(
    pd.read_csv,
    ('../data/train_input.csv',
     '../data/train_output.csv',
     '../data/test_input.csv'),
)

# Row counts of the two input tables.
traininput_size, testinput_size = raw_traininput.shape[0], raw_testinput.shape[0]

In [8]:
import re

def clean_data(data):
    """Return a new DataFrame holding the conversations with markup removed.

    Every non-greedy ``<...>`` span and every newline character is deleted
    from each entry of ``data['conversation']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``conversation`` column of strings. Its index may be
        anything; rows are processed in positional order.

    Returns
    -------
    pandas.DataFrame
        A single ``conversation`` column with a fresh 0..n-1 index. ``data``
        itself is not modified.
    """
    # Compile once instead of re-parsing the pattern for every row.
    tag_regex = re.compile('<.*?>|\n')

    # Iterate over the values rather than looking rows up by label, so a frame
    # whose index is not 0..n-1 (filtered, shuffled, ...) is handled correctly.
    stripped = [tag_regex.sub('', text) for text in data['conversation']]

    # Name the column through a dict: a set is not a valid `columns` argument
    # for the DataFrame constructor in current pandas.
    return pd.DataFrame({'conversation': stripped})

In [9]:
def add_speaker_count(data):
    """Return a copy of ``data`` with an integer ``speakers`` column.

    ``speakers`` is the number of ``<speaker_N>`` tags (N a single digit)
    found in each row's ``conversation`` text. This counts tag occurrences,
    not distinct speakers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``conversation`` column of strings. Any index is
        accepted and is preserved in the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``speakers`` appended (or overwritten if the
        column already exists, so the function can safely be re-applied).
        ``data`` itself is not modified.
    """
    speaker_tag = re.compile('<speaker_[0-9]>')

    # Count in one pass over the values; no label lookups, so the frame does
    # not need a default 0..n-1 index.
    counts = [len(speaker_tag.findall(text)) for text in data['conversation']]

    counted = data.copy()
    # Build the column on the frame's own index so assignment is positional,
    # and pin the dtype so an empty frame still gets an integer column.
    counted['speakers'] = pd.Series(counts, index=data.index, dtype=int)
    return counted

# Training inputs plus the per-conversation `speakers` tag count.
cl = add_speaker_count(raw_traininput)


In [10]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from scipy.sparse import csr_matrix, hstack

# Two-column array: column 0 is the conversation text, column 1 the speaker count.
text_and_speakers = np.array((cl['conversation'], cl['speakers'])).T

# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
(train_data, test_data,
 train_labels, test_labels) = train_test_split(text_and_speakers,
                                               raw_trainoutput['category'],
                                               test_size=0.2,
                                               random_state=105)


In [11]:
# Word-level counts over 1- to 3-grams with English stop words removed,
# followed by a TF-IDF re-weighting step. Both are fitted in a later cell.
cvec = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 3))
tfidf_transformer = TfidfTransformer()

In [12]:
# Fit the n-gram vocabulary and the IDF weights on the training split only.
X_train_counts = cvec.fit_transform(train_data[:,0])
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# A flat list becomes a 1 x n sparse row; transposing turns it into an n x 1
# column so it can be appended to the TF-IDF matrix as one extra feature.
sp_train_sparse = csr_matrix(train_data[:,1].tolist()).T
x_train = hstack([X_train_tfidf, sp_train_sparse])

# Formatted through a single-argument print(...) so the line prints the same
# text on Python 2 and is also valid syntax on Python 3.
print('{0} {1} {2}'.format(train_data.shape, X_train_tfidf.shape, x_train.shape))

(132000, 2) (132000, 5930230) (132000, 5930231)


In [13]:
# Re-use the vocabulary and IDF weights fitted on the training split
# (transform only, no refitting) so both splits share the same columns.
X_test_counts = cvec.transform(test_data[:,0])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Speaker counts as an n x 1 sparse column appended after the TF-IDF features.
sp_test_sparse = csr_matrix(test_data[:,1].tolist()).T
x_test = hstack([X_test_tfidf, sp_test_sparse])

# Single-argument print(...) prints the same text on Python 2 and is valid
# syntax on Python 3.
print('{0} {1} {2}'.format(test_data.shape, X_test_tfidf.shape, x_test.shape))

(33000, 2) (33000, 5930230) (33000, 5930231)


In [14]:
# Linear SVM with hinge loss, fitted on the 80% training split.
# NOTE(review): no random_state is passed — confirm whether run-to-run
# reproducibility of the fitted model matters here.
clf = LinearSVC(loss = 'hinge').fit(x_train, train_labels)

In [15]:
# Predicted categories for the held-out 20% split.
predicted = clf.predict(x_test)

In [16]:
# Hold-out accuracy, per-class precision/recall/F1, and the confusion matrix.
# Each print takes exactly one parenthesised argument, which prints the same
# text on Python 2 and is also valid syntax on Python 3.
print(np.mean(predicted == test_labels))
print(metrics.classification_report(test_labels, predicted))
print(metrics.confusion_matrix(test_labels, predicted))

0.96403030303
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.99      4510
        nba       0.99      0.97      0.98      3696
       news       0.93      0.90      0.91      4256
        nfl       0.98      0.98      0.98      3978
   politics       0.93      0.94      0.94      3943
     soccer       0.99      0.99      0.99      4299
  worldnews       0.94      0.96      0.95      4148

avg / total       0.96      0.96      0.96     33000

[[4082   15   22    6   25    1   15    4]
 [   2 4473    1   10    0    6    4   14]
 [  36   18 3579    8   30    3   21    1]
 [   3   21    3 3847   15  192    2  173]
 [  25    5   17   10 3907    4    8    2]
 [   2    1    1  180    1 3715    1   42]
 [  13    7    7    6    9    2 4242   13]
 [   3    7    0   90    0   68   12 3968]]


In [17]:
# Second, independent vectorizer / TF-IDF pair with the same settings as the
# validation experiment; these are fitted on the whole training set below.
cvec2 = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 3))
tfidf_transformer2 = TfidfTransformer()


In [19]:
# Fit vocabulary and IDF weights on every labelled conversation (no hold-out).
# This rebinds X_train_counts / X_train_tfidf / sp_train_sparse, replacing the
# values computed for the 80% split.
X_train_counts = cvec2.fit_transform(cl['conversation'])
X_train_tfidf = tfidf_transformer2.fit_transform(X_train_counts)

# Speaker counts as an n x 1 sparse column appended after the TF-IDF features.
sp_train_sparse = csr_matrix(cl['speakers'].tolist()).T
X_train = hstack([X_train_tfidf, sp_train_sparse])

# Single-argument print(...) prints the same text on Python 2 and is valid
# syntax on Python 3.
print('{0} {1} {2}'.format(cl['conversation'].shape, X_train_tfidf.shape, X_train.shape))

 (165000,) (165000, 7074844) (165000, 7074845)


In [21]:
# Test inputs plus their per-conversation `speakers` tag count.
cl_test = add_speaker_count(raw_testinput)

# Transform only: re-use the vocabulary and IDF weights fitted on the full
# training set so the test matrix has the same columns.
X_test_counts = cvec2.transform(cl_test['conversation'])
X_test_tfidf = tfidf_transformer2.transform(X_test_counts)

# Speaker counts as an n x 1 sparse column appended after the TF-IDF features.
sp_test_sparse = csr_matrix(cl_test['speakers'].tolist()).T
X_test = hstack([X_test_tfidf, sp_test_sparse])

# Single-argument print(...) prints the same text on Python 2 and is valid
# syntax on Python 3.
print('{0} {1} {2}'.format(cl_test['conversation'].shape, X_test_tfidf.shape, X_test.shape))

(53218,) (53218, 7074844) (53218, 7074845)


In [22]:
# Final model: same hinge-loss linear SVM, fitted on all labelled rows.
# NOTE(review): no random_state is passed — confirm whether run-to-run
# reproducibility of the fitted model matters here.
clf2 = LinearSVC(loss = 'hinge').fit(X_train,raw_trainoutput['category'])

In [23]:
# Predicted categories for the test inputs.
predicted2 = clf2.predict(X_test)

In [24]:
# Write the predictions as a two-column CSV: the row index (labelled `id`)
# and the predicted `category`. Writing the index and the header row are the
# to_csv defaults, so only the index label needs to be given.
result = pd.DataFrame({'category': predicted2})
result.to_csv('../data/test_predict.csv', index_label='id')

In [26]:
# NOTE(review): unexplained magic numbers. 53218 matches the number of test
# rows printed above; 0.96915 is presumably an accuracy score and the result
# half of the implied misclassified rows — confirm what this estimate means.
(1-0.96915)*53218/2

820.8876500000011