In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer



In [12]:
# The three news-like categories that the first-stage classifier mixes up with
# one another (see its confusion matrix); they get a dedicated second stage.
unholy_trinity = {"politics", "news", "worldnews"}

In [2]:
def split(X, y, ratio=0.2, random_state=105):
    """Split documents and labels into a training part and a held-out part.

    Parameters
    ----------
    X : sequence of documents.
    y : sequence of labels, parallel to ``X``.
    ratio : forwarded to ``train_test_split`` as ``test_size`` (default 0.2).
    random_state : seed forwarded to ``train_test_split``. The default (105)
        reproduces the split this notebook has always used; pass another
        value to draw a different split.

    Returns
    -------
    tuple
        ``(train_data, test_data, train_labels, test_labels)``.
    """
    # NOTE: this notebook imports train_test_split from
    # sklearn.cross_validation, which was removed in scikit-learn 0.20 in
    # favour of sklearn.model_selection.
    train_data, test_data, train_labels, test_labels = train_test_split(
        X, y, test_size=ratio, random_state=random_state)
    return train_data, test_data, train_labels, test_labels

In [42]:
def train(classifier, X, y):
    """Fit a word n-gram -> TF-IDF -> ``classifier`` pipeline on ``(X, y)``.

    Parameters
    ----------
    classifier : unfitted estimator placed as the last pipeline step.
    X : iterable of raw text documents.
    y : labels, parallel to ``X``.

    Returns
    -------
    The value returned by ``Pipeline.fit`` (the fitted pipeline).
    """
    steps = [
        # Word-level 1- to 3-grams with English stop words removed.
        ('vectorizer', CountVectorizer(analyzer='word',
                                       stop_words='english',
                                       ngram_range=(1, 3))),
        ('transformer', TfidfTransformer()),
        ('classifier', classifier),
    ]
    pipeline = Pipeline(steps)
    return pipeline.fit(X, y)

In [24]:
def score(pred, y):
    """Print accuracy, the per-class report and the confusion matrix.

    Parameters
    ----------
    pred : predicted labels.
    y : true labels, parallel to ``pred``.

    Returns
    -------
    None. The three results are written to standard output.
    """
    # Single-argument parenthesised calls print the same text under Python 2
    # (print statement) and Python 3 (print function); the bare statement
    # form used before is a SyntaxError on Python 3.
    print(np.mean(pred == y))
    print(metrics.classification_report(y, pred))
    print(metrics.confusion_matrix(y, pred))

In [18]:
def get_unholy_trinity(X, y, categories=None):
    """Keep only the documents whose label is one of ``categories``.

    Parameters
    ----------
    X : pandas.Series of documents.
    y : pandas.Series of labels carrying the same index labels as ``X``.
    categories : collection of labels to keep. Defaults to the module-level
        ``unholy_trinity`` set.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The selected documents (named "conversation") and their labels
        (named "category"), in ``X``'s order and re-indexed from 0. Both are
        empty when no label matches; the previous ``zip(*...)`` unpacking
        raised ValueError in that case.
    """
    if categories is None:
        categories = unholy_trinity

    # Align labels to documents by index label, not by position: after a
    # shuffled train/test split the index is no longer 0..n-1.
    labels_for_docs = y.loc[X.index]
    keep = labels_for_docs.isin(list(categories)).values

    return (pd.Series(X.values[keep], name="conversation"),
            pd.Series(labels_for_docs.values[keep], name="category"))

# Split

In [6]:
# Read the training inputs, the training labels and the test inputs, in that
# order (paths are relative to the notebook's working directory).
X, y, X_test = map(pd.read_csv, (
    '../data/train_input.csv',
    '../data/train_output.csv',
    '../data/test_input.csv',
))

In [7]:
# Hold out part of the labelled data (split()'s default ratio) for evaluation.
train_X, test_X, train_y, test_y = split(
    X["conversation"],
    y["category"],
)

In [19]:
# Restrict the training split to the three news-like categories; this subset
# is what the second-stage classifier is trained on.
unholy_train_X, unholy_train_y = get_unholy_trinity(
    X=train_X,
    y=train_y,
)

# First stage

In [9]:
# First stage: a linear SVM trained on every category of the training split.
# dual=True is required with the hinge loss: liblinear has no primal solver
# for L2-penalised hinge, so LinearSVC(loss='hinge', dual=False) raises
# "Unsupported set of arguments" as soon as fit() is called.
first_stage = LinearSVC(loss='hinge', dual=True)
first_stage = train(first_stage, train_X, train_y)

In [39]:
# Evaluate the first-stage model on the held-out split.
first_stage_prediction = first_stage.predict(test_X)
score(pred=first_stage_prediction, y=test_y)

0.964
             precision    recall  f1-score   support

     hockey       0.98      0.98      0.98      4170
     movies       0.98      0.99      0.99      4510
        nba       0.99      0.97      0.98      3696
       news       0.93      0.90      0.92      4256
        nfl       0.98      0.98      0.98      3978
   politics       0.93      0.94      0.94      3943
     soccer       0.99      0.99      0.99      4299
  worldnews       0.94      0.96      0.95      4148

avg / total       0.96      0.96      0.96     33000

[[4085   15   19    5   25    1   16    4]
 [   2 4472    1   11    0    6    4   14]
 [  40   18 3573    8   32    3   21    1]
 [   3   21    3 3846   15  191    2  175]
 [  27    5   17    9 3906    4    8    2]
 [   2    1    1  179    1 3715    1   43]
 [  14    7    6    6    9    2 4242   13]
 [   3    7    0   86    0   67   12 3973]]


# Second stage

In [51]:
from sklearn.svm import SVC

In [None]:
# Second stage: a degree-1 polynomial-kernel SVM fitted only on the documents
# from the three news-like categories.
second_stage = SVC(degree=1, kernel='poly')
second_stage = train(
    classifier=second_stage,
    X=unholy_train_X,
    y=unholy_train_y,
)

# Merge

In [None]:
def merge(X, original_prediction, second_classifier, categories=None):
    """Re-classify the first-stage predictions that fall in ``categories``.

    Parameters
    ----------
    X : pandas.Series of documents, positionally parallel to
        ``original_prediction``.
    original_prediction : first-stage labels; must support ``.copy()`` and
        integer item access/assignment (e.g. a NumPy array). Not modified.
    second_classifier : object whose ``predict(list_of_documents)`` returns
        one label per document.
    categories : labels whose predictions are handed to the second stage.
        Defaults to the module-level ``unholy_trinity`` set.

    Returns
    -------
    A copy of ``original_prediction`` in which every entry that was in
    ``categories`` is replaced by ``second_classifier``'s prediction.
    """
    if categories is None:
        categories = unholy_trinity

    prediction = original_prediction.copy()

    # Positions (not index labels) of the documents to re-classify.
    indices = [i for i in range(len(X)) if prediction[i] in categories]
    if not indices:
        # Nothing to re-classify; do not call predict() on an empty batch.
        return prediction

    # .iloc is positional, matching the layout of ``prediction``; X's index
    # labels are arbitrary after a shuffled split.
    unholy = [X.iloc[i] for i in indices]
    unholy_predictions = second_classifier.predict(unholy)

    # NOTE: if ``prediction`` is a fixed-width NumPy string array, a label
    # longer than its item size would be truncated on assignment.
    for position, label in zip(indices, unholy_predictions):
        prediction[position] = label
    return prediction

In [None]:
# Combine both stages on the held-out split: first-stage predictions in the
# news-like categories are replaced by the second stage's predictions.
prediction = merge(
    X=test_X,
    original_prediction=first_stage_prediction,
    second_classifier=second_stage,
)

In [None]:
# Evaluate the merged two-stage predictions on the held-out split.
score(pred=prediction, y=test_y)

# Output

In [6]:
# Run the two-stage pipeline on the test inputs and write the predictions.
# `predicted2` was not defined in any visible cell, so this cell raised
# NameError on a fresh kernel; it is now computed here.
# NOTE(review): assumes test_input.csv has a "conversation" column like
# train_input.csv — confirm.
test_conversations = X_test["conversation"]
predicted2 = merge(test_conversations,
                   first_stage.predict(test_conversations),
                   second_stage)

result = pd.DataFrame({'category': predicted2})
result.to_csv('../data/test_predict.csv', index=True, header=True, index_label='id')

In [7]:
# NOTE(review): `clean_testinput` is not defined in any cell visible in this
# notebook, so this fails with NameError on a fresh kernel. Presumably a
# leftover from an earlier session (X_test may be the intended frame) — confirm.
clean_testinput.shape

(53218, 2)