## 8.4 テキスト学習：単純ベイズによる自然言語処理

In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.cross_validation as cv
import sklearn.grid_search as gs
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the labelled comments; per the preview below, the "Insult" column
# flags whether a comment is insulting and "Comment" holds the raw text.
df = pd.read_csv(filepath_or_buffer="data/troll.csv")

In [3]:
# Preview the last rows of the two columns used in this analysis.
df.loc[:, ['Insult', 'Comment']].tail()

Unnamed: 0,Insult,Comment
3942,1,"""you are both morons and that is never happening"""
3943,0,"""Many toolbars include spell check, like Yahoo..."
3944,0,"""@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux F..."
3945,0,"""How about Felix? He is sure turning into one ..."
3946,0,"""You're all upset, defending this hipster band..."


In [4]:
# Target vector: the "Insult" label of every comment.
y = df.loc[:, 'Insult']

In [5]:
# Vocabulary extraction.
# Fit a TF-IDF vectorizer on the comment text and build the sparse feature
# matrix X from it, then show the matrix dimensions.
tf = text.TfidfVectorizer()
X = tf.fit_transform(raw_documents=df['Comment'])
print(X.shape)

(3947, 16469)


In [6]:
# Report the dimensions of the TF-IDF matrix and the share of non-zero entries.
# .format() has to be applied to the string *inside* print(...): calling it on
# the result of print() only works with the Python 2 print statement and
# raises AttributeError under Python 3, where print() returns None.
print("コメント数 = {0},単語数={1}".format(X.shape[0], X.shape[1]))
# X.nnz is the number of stored (non-zero) entries of the sparse matrix;
# float() keeps the division a true division on Python 2 as well.
print("Each sample has ~{0:.2f}% non-zero features.".format(
    100 * X.nnz / float(X.shape[0] * X.shape[1])))

コメント数 = 3947,単語数=16469
Each sample has ~0.15% non-zerofeatures.


In [7]:
# Split the data for training the classifier: 20% of the samples are held out
# for testing and the remaining 80% are used for training.
# random_state is fixed so the split, and therefore every downstream score
# and prediction, is reproducible after a kernel restart.
(X_train, X_test, y_train, y_test) = cv.train_test_split(
    X, y, test_size=.2, random_state=42)

In [8]:
# Bernoulli naive Bayes classifier.
# The smoothing parameter alpha is selected by grid search over 50
# log-spaced candidates between 1e-2 and 1e2, fitted on the training split.
bnb = gs.GridSearchCV(
    estimator=nb.BernoulliNB(),
    param_grid=dict(alpha=np.logspace(-2., 2., 50)),
)
bnb.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-02,   1.20679e-02,   1.45635e-02,   1.75751e-02,
         2.12095e-02,   2.55955e-02,   3.08884e-02,   3.72759e-02,
         4.49843e-02,   5.42868e-02,   6.55129e-02,   7.90604e-02,
         9.54095e-02,   1.15140e-01,   1.38950e-01,   1.67683e-01,
         2....    3.90694e+01,   4.71487e+01,   5.68987e+01,   6.86649e+01,
         8.28643e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
# Measure the performance of the tuned classifier on the held-out test data.
bnb.score(X=X_test, y=y_test)

0.73164556962025318

In [10]:
# Words with the largest coefficients in the best estimator (i.e. words seen
# frequently in insulting comments). Very common words can rank high too.
# NOTE(review): newer scikit-learn releases rename get_feature_names() and
# drop coef_ on naive Bayes models -- confirm against the installed version.
feature_names = tf.get_feature_names()
names = np.asarray(feature_names)
weights = bnb.best_estimator_.coef_[0, :]
# argsort is ascending, so reverse it and keep the 50 largest weights.
top_indices = np.argsort(weights)[::-1][:50]
print(','.join(names[top_indices]))

you,your,are,the,to,and,of,that,it,is,in,like,on,for,have,not,re,just,xa0,idiot,an,so,get,all,with,don,what,be,up,go,fuck,can,do,stupid,this,as,about,if,know,or,no,but,who,ass,bitch,back,because,my,yourself,me


In [11]:
# Run a few hand-written sentences through the fitted vectorizer and the
# tuned classifier, and print the predicted labels.
sample_sentences = [
    "I totally agree with you.",
    "You are so stupid you.",
    "I love you.",
]
sample_features = tf.transform(sample_sentences)
print(bnb.predict(sample_features))

[0 1 1]


### 参考文献

* http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
* https://ja.wikipedia.org/wiki/Tf-idf
* https://ja.wikipedia.org/wiki/単純ベイズ分類器
* https://blog.kaggle.com/2012/09/26/impermium-andreas-blog
* http://www.nltk.org/