In [28]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
path = "data/chap6/newsCorpora.csv"

### 50. データの入手・整形

In [4]:
# 2
df = pd.read_csv(path, sep='\t', header=None)
df.columns = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
selected_df = df[df['PUBLISHER'].isin(publishers)]

In [5]:
# 3
selected_df = selected_df.sample(frac=1)

In [6]:
# 4
train_df, valid_test_df = train_test_split(selected_df, train_size=0.8)
valid_df, test_df = train_test_split(valid_test_df, train_size=0.5)
columns = ['CATEGORY','TITLE']
train_df.to_csv("data/chap6/train.txt", columns=columns, sep='\t', header=False, index=False)
test_df.to_csv("data/chap6/test.txt", columns=columns, sep='\t', header=False, index=False)
valid_df.to_csv("data/chap6/valid.txt", columns=columns, sep='\t', header=False, index=False)

In [7]:
selected_df['CATEGORY'].value_counts()

b    5627
e    5279
t    1524
m     910
Name: CATEGORY, dtype: int64

### 51. 特徴量抽出

In [8]:
vec_count = CountVectorizer()
x_train = vec_count.fit_transform(train_df['TITLE'])
x_test = vec_count.transform(test_df['TITLE'])
x_valid = vec_count.transform(valid_df['TITLE'])

np.savetxt('data/chap6/train.feature.txt', x_train.toarray(), fmt='%d') # スパース行列から密行列に変換
np.savetxt('data/chap6/valid.feature.txt', x_valid.toarray(), fmt='%d')
np.savetxt('data/chap6/test.feature.txt', x_test.toarray(), fmt='%d')

### 52. 学習

In [9]:
clf = LogisticRegression()
clf.fit(x_train, train_df['CATEGORY'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### 53. 予測

In [10]:
dic = {'b':'business', 't':'science and technology', 'e' : 'entertainment', 'm' : 'health'}
def predict(text):
    text = [text]
    x = vec_count.transform(text)
    ls_proba = clf.predict_proba(x)
    for proba in ls_proba:
        for c, p in zip(clf.classes_, proba):
            print (dic[c]+':',p)
s = train_df.iloc[0]['TITLE']
print(s)
predict(s)

Teenage Mutant Ninja Turtles: How Not To Market A Movie
business: 0.12069455687130613
entertainment: 0.854277946925457
health: 0.007477001523512991
science and technology: 0.01755049467972399


### 54. 正解率の計測

In [11]:
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)
y_train = train_df['CATEGORY']
y_test = test_df['CATEGORY']
print (accuracy_score(y_train, y_train_pred))
print (accuracy_score(y_test, y_test_pred))

0.9962518740629686
0.9047976011994003


### 55. 混同行列の作成

In [25]:
labels = ['b','t','e','m']
train_cm = confusion_matrix(y_train, y_train_pred, labels=labels)
test_cm = confusion_matrix(y_test, y_test_pred, labels=labels)
train_cm_labeled = pd.DataFrame(train_cm, columns=labels, index=labels)
test_cm_labeled = pd.DataFrame(test_cm, columns=labels, index=labels)
print(train_cm_labeled)
print(test_cm_labeled)

      b     t     e    m
b  4472     6     1    0
t    14  1225     3    0
e     8     2  4220    0
m     4     0     2  715
     b    t    e   m
b  530   16   16   3
t   26  109   13   1
e    9    3  510   3
m   14    8   15  58


### 56. 適合率，再現率，F1スコアの計測

In [52]:
ps = precision_score(y_test, y_test_pred, average=None, labels=labels)
rs = recall_score(y_test, y_test_pred, average=None, labels=labels)
f1 = f1_score(y_test, y_test_pred, average=None, labels=labels)

df = pd.DataFrame(data={'precision score': ps, 'recall score': rs, 'f1 score': f1}, index = labels)
print(df)

   precision score  recall score  f1 score
b         0.915371      0.938053  0.926573
t         0.801471      0.731544  0.764912
e         0.920578      0.971429  0.945320
m         0.892308      0.610526  0.725000


In [53]:
micro_ps = precision_score(y_test, y_test_pred, average='micro', labels=labels)
micro_rs = recall_score(y_test, y_test_pred, average='micro', labels=labels)
micro_f1 = f1_score(y_test, y_test_pred, average='micro', labels=labels)

micro_df = pd.DataFrame(data={'precision score': micro_ps, 'recall score': micro_rs, 'f1 score': micro_f1}, index = labels)
print(micro_df)

   precision score  recall score  f1 score
b         0.904798      0.904798  0.904798
t         0.904798      0.904798  0.904798
e         0.904798      0.904798  0.904798
m         0.904798      0.904798  0.904798


In [54]:
macro_ps = precision_score(y_test, y_test_pred, average='macro', labels=labels)
macro_rs = recall_score(y_test, y_test_pred, average='macro', labels=labels)
macro_f1 = f1_score(y_test, y_test_pred, average='macro', labels=labels)

macro_df = pd.DataFrame(data={'precision score': macro_ps, 'recall score': macro_rs, 'f1 score': macro_f1}, index = labels)
print(macro_df)

   precision score  recall score  f1 score
b         0.882432      0.812888  0.840451
t         0.882432      0.812888  0.840451
e         0.882432      0.812888  0.840451
m         0.882432      0.812888  0.840451


### 57. 特徴量の重みの確認

In [61]:
names = np.array(vec_count.get_feature_names())
labels=['b','t','e','m']
for c, coef in zip(clf.classes_, clf.coef_): # カテゴリ毎に表示する
    idx = np.argsort(coef)[::-1] # 降順ソート
    print (dic[c])
    print (names[idx][:10]) # 重みの高い特徴量トップ10
    print (names[idx][-10:][::-1]) # 重みの低い特徴量トップ10

[ 0.17613076  0.0200178   0.00428534 ... -0.00044779 -0.05595252
 -0.00114271]
business
['bank' 'fed' 'ecb' 'china' 'yellen' 'obamacare' 'ukraine' 'dollar' 'euro'
 'oil']
['activision' 'aereo' 'ebola' 'twitch' 'cap' 'she' 'heartbleed'
 'subscription' 'nintendo' 'virus']
[-0.10577076 -0.01482196 -0.00225462 ...  0.00464226  0.11809141
  0.00391573]
entertainment
['chris' 'paul' 'kardashian' 'thrones' 'miley' 'transformers' 'film'
 'movie' 'cyrus' 'beyonce']
['google' 'facebook' 'gm' 'china' 'billion' 'risk' 'ebola' 'microsoft'
 'study' 'climate']
[-0.03916612 -0.00257808 -0.00096667 ... -0.00184498 -0.02072804
 -0.00081899]
health
['ebola' 'fda' 'cancer' 'drug' 'study' 'mers' 'cases' 'cdc' 'doctors'
 'alzheimer']
['gm' 'dimon' 'facebook' 'climate' 'apple' 'twitter' 'google' 'bank'
 'sales' 'play']
[-0.03119388 -0.00261776 -0.00106405 ... -0.00234949 -0.04141085
 -0.00195403]
science and technology
['google' 'facebook' 'microsoft' 'climate' 'apple' 'activision' 'tesla'
 'heartbleed' 'nas

### 58. 正則化パラメータの変更