In [129]:
# -*- coding: utf-8 -*-

In [130]:
import itertools
import numpy as np
import pandas as pd
import MeCab

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import cross_validation, metrics

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [131]:
# Base seed; cv_loop multiplies it by the split index to derive each split's random_state.
SEED = 2016

# Reading Data

In [132]:
# Labelled training sentences; the path is relative to the notebook's working directory.
data_path = '../input/zatsudanClassifierTrainingData.csv'

In [133]:
# header=None makes pandas treat the first CSV row as data, so the two
# columns are named explicitly here.
data = pd.read_csv(data_path, header=None, names=['sentence', 'category'])

In [134]:
# Preview the first rows to check that both columns loaded as expected.
data.head()

Unnamed: 0,sentence,category
0,元気ですか,zatsudan
1,最近どう？,zatsudan
2,ロンドンの天気教えて,notzatsudan
3,最近仕事うまくいってる,zatsudan
4,火花はどう思う,zatsudan


In [135]:
# Encode the string label as a binary target: 1 = zatsudan, 0 = notzatsudan.
CATEGORY_TO_CLASS = {'zatsudan': 1, 'notzatsudan': 0}
data['class'] = data['category'].map(CATEGORY_TO_CLASS)

# Series.map leaves NaN for any label missing from the mapping (and silently
# turns the column into floats), so fail loudly if an unexpected label appears.
assert data['class'].notnull().all(), "unexpected category label in training data"

In [136]:
# Confirm the numeric 'class' column lines up with 'category'.
data.head()

Unnamed: 0,sentence,category,class
0,元気ですか,zatsudan,1
1,最近どう？,zatsudan,1
2,ロンドンの天気教えて,notzatsudan,0
3,最近仕事うまくいってる,zatsudan,1
4,火花はどう思う,zatsudan,1


In [137]:
# Check the class balance between zatsudan (1) and notzatsudan (0).
data['class'].value_counts()

1    104
0     95
Name: class, dtype: int64

# Data Parsing

In [138]:
# Testing MeCab parsing: with "-Owakati" the tagger returns the sentence as
# space-separated tokens (see the printed result).
t = MeCab.Tagger("-Owakati")
# Single .loc lookup (row label, column) instead of chained indexing.
result = t.parse(data.loc[0, 'sentence'])
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(result)

元気 です か 



In [139]:
# Tokenize every sentence with the MeCab tagger. The wakati output ends with a
# trailing space followed by a newline (visible in the parsing test output),
# so strip() both instead of slicing off only the final character.
data['parsed'] = data['sentence'].apply(lambda sentence: t.parse(sentence).strip())

In [140]:
# Inspect the new 'parsed' column of space-separated tokens.
data.head()

Unnamed: 0,sentence,category,class,parsed
0,元気ですか,zatsudan,1,元気 です か
1,最近どう？,zatsudan,1,最近 どう ？
2,ロンドンの天気教えて,notzatsudan,0,ロンドン の 天気 教え て
3,最近仕事うまくいってる,zatsudan,1,最近 仕事 うまく いっ てる
4,火花はどう思う,zatsudan,1,火花 は どう 思う


# Building Models

### Building features: X_train_counts and X_train_tfidf

In [141]:
# Bag-of-words counts and their tf-idf reweighting for every parsed sentence.
# NOTE(review): both transformers are fitted on the full dataset before
# cv_loop splits it, so the vocabulary and IDF weights have seen the
# validation rows -- consider fitting inside each split instead.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more word characters, so single-character MeCab tokens (particles,
# punctuation) are presumably dropped -- confirm that this is intended.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(list(data['parsed']))
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(X_train_counts.shape)
print(X_train_tfidf.shape)

(199, 417)
(199, 417)


## Testing Model

### CV Loop

In [149]:
def cv_loop(model, X, y, n):
    """Estimate the accuracy of `model` over `n` random 80/20 splits.

    Each round draws a fresh train/validation split with
    ``cross_validation.train_test_split`` (``test_size=.20``), refits
    ``model`` on the training part and scores its predictions on the held-out
    part with ``metrics.accuracy_score``. The per-round accuracy and the mean
    are printed. The rounds are independent random splits rather than disjoint
    k-fold partitions, even though the log line says "fold".

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : feature matrix accepted by ``train_test_split`` and ``model.fit``.
    y : labels aligned with the rows of ``X``.
    n : int
        Number of random splits to evaluate; must be at least 1.

    Returns
    -------
    float
        Mean accuracy over the ``n`` splits. (When the call is the last
        expression of a notebook cell, this value is also displayed.)

    Raises
    ------
    ValueError
        If ``n`` is less than 1, since no mean can be computed.
    """
    if n < 1:
        # Previously n == 0 ended in a ZeroDivisionError after partial output.
        raise ValueError("n must be a positive number of splits, got %r" % (n,))

    accuracies = []

    for i in range(n):
        # The split seed is derived from the round index, so reruns see the
        # same n splits (round 0 always uses random_state=0).
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        model.fit(X_train, y_train)
        preds = model.predict(X_cv)

        accuracy = metrics.accuracy_score(y_cv, preds)
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("Accuracy (fold %d/%d): %f" % (i + 1, n, accuracy))
        accuracies.append(accuracy)

    mean_accuracy = sum(accuracies) / n

    print('-------')
    print("Mean Accuracy: %f" % mean_accuracy)
    return mean_accuracy

In [153]:
# Baseline: multinomial naive Bayes on raw term counts, scored over 10 random splits.
model = MultinomialNB()
cv_loop(model, X=X_train_counts, y=data['class'], n=10)

Accuracy (fold 1/10): 0.625000
Accuracy (fold 2/10): 0.650000
Accuracy (fold 3/10): 0.625000
Accuracy (fold 4/10): 0.775000
Accuracy (fold 5/10): 0.800000
Accuracy (fold 6/10): 0.800000
Accuracy (fold 7/10): 0.775000
Accuracy (fold 8/10): 0.700000
Accuracy (fold 9/10): 0.625000
Accuracy (fold 10/10): 0.725000
-------
Mean Accuracy: 0.710000


In [154]:
# Same naive Bayes baseline, this time on the tf-idf weighted features.
model = MultinomialNB()
cv_loop(model, X=X_train_tfidf, y=data['class'], n=10)

Accuracy (fold 1/10): 0.675000
Accuracy (fold 2/10): 0.650000
Accuracy (fold 3/10): 0.650000
Accuracy (fold 4/10): 0.650000
Accuracy (fold 5/10): 0.850000
Accuracy (fold 6/10): 0.800000
Accuracy (fold 7/10): 0.775000
Accuracy (fold 8/10): 0.675000
Accuracy (fold 9/10): 0.550000
Accuracy (fold 10/10): 0.675000
-------
Mean Accuracy: 0.695000


In [155]:
# Random forest on raw term counts. The forest is a randomized estimator, so
# it is seeded with SEED to keep the reported accuracies repeatable across runs.
model = RandomForestClassifier(random_state=SEED)
cv_loop(model, X_train_counts, data['class'], 10)

Accuracy (fold 1/10): 0.700000
Accuracy (fold 2/10): 0.825000
Accuracy (fold 3/10): 0.625000
Accuracy (fold 4/10): 0.850000
Accuracy (fold 5/10): 0.725000
Accuracy (fold 6/10): 0.850000
Accuracy (fold 7/10): 0.825000
Accuracy (fold 8/10): 0.775000
Accuracy (fold 9/10): 0.700000
Accuracy (fold 10/10): 0.825000
-------
Mean Accuracy: 0.770000


In [156]:
# Random forest on tf-idf features. The forest is a randomized estimator, so
# it is seeded with SEED to keep the reported accuracies repeatable across runs.
model = RandomForestClassifier(random_state=SEED)
cv_loop(model, X_train_tfidf, data['class'], 10)

Accuracy (fold 1/10): 0.675000
Accuracy (fold 2/10): 0.750000
Accuracy (fold 3/10): 0.675000
Accuracy (fold 4/10): 0.850000
Accuracy (fold 5/10): 0.775000
Accuracy (fold 6/10): 0.775000
Accuracy (fold 7/10): 0.850000
Accuracy (fold 8/10): 0.825000
Accuracy (fold 9/10): 0.575000
Accuracy (fold 10/10): 0.800000
-------
Mean Accuracy: 0.755000


In [157]:
# Support vector classifier with scikit-learn's default settings on raw term counts.
model = SVC()
cv_loop(model, X=X_train_counts, y=data['class'], n=10)

Accuracy (fold 1/10): 0.525000
Accuracy (fold 2/10): 0.600000
Accuracy (fold 3/10): 0.550000
Accuracy (fold 4/10): 0.325000
Accuracy (fold 5/10): 0.575000
Accuracy (fold 6/10): 0.375000
Accuracy (fold 7/10): 0.375000
Accuracy (fold 8/10): 0.500000
Accuracy (fold 9/10): 0.450000
Accuracy (fold 10/10): 0.600000
-------
Mean Accuracy: 0.487500


In [158]:
# Support vector classifier with scikit-learn's default settings on tf-idf features.
# NOTE(review): the accuracies recorded for this cell are identical to the
# raw-count SVC run, which suggests the default SVC may be predicting a single
# class regardless of the features -- confirm before tuning.
model = SVC()
cv_loop(model, X=X_train_tfidf, y=data['class'], n=10)

Accuracy (fold 1/10): 0.525000
Accuracy (fold 2/10): 0.600000
Accuracy (fold 3/10): 0.550000
Accuracy (fold 4/10): 0.325000
Accuracy (fold 5/10): 0.575000
Accuracy (fold 6/10): 0.375000
Accuracy (fold 7/10): 0.375000
Accuracy (fold 8/10): 0.500000
Accuracy (fold 9/10): 0.450000
Accuracy (fold 10/10): 0.600000
-------
Mean Accuracy: 0.487500


# Next Steps: Model Hyperparameter Tuning