In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Location of the raw corpus file; it is parsed as tab-separated data by the loading cell.
path = "data/chap6/newsCorpora.csv"

### 50. データの入手・整形

In [2]:
# Read the tab-separated corpus (the file carries no header row) and label its eight columns.
column_names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df = pd.read_csv(path, sep='\t', header=None)
df.columns = column_names

# Keep only the articles whose publisher is one of the five listed here.
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
from_selected_publisher = df['PUBLISHER'].isin(publishers)
selected_df = df.loc[from_selected_publisher]

In [3]:
# Shuffle all rows (frac=1 keeps every row). The fixed seed makes the resulting
# row order the same on every Restart & Run All instead of changing per run.
selected_df = selected_df.sample(frac=1, random_state=42)

In [4]:
# Split the articles into train (80%), validation (10%) and test (10%):
# first 80/20, then the remaining 20% in half.
# random_state pins both splits so that, for a given input row order, the same
# rows land in the same split on every run.
train_df, valid_test_df = train_test_split(selected_df, train_size=0.8, random_state=42)
valid_df, test_df = train_test_split(valid_test_df, train_size=0.5, random_state=42)

# Persist each split as tab-separated "CATEGORY<TAB>TITLE" lines, without header or index.
columns = ['CATEGORY', 'TITLE']
split_outputs = (
    (train_df, "data/chap6/train.txt"),
    (test_df, "data/chap6/test.txt"),
    (valid_df, "data/chap6/valid.txt"),
)
for split_df, split_path in split_outputs:
    split_df.to_csv(split_path, columns=columns, sep='\t', header=False, index=False)

In [5]:
# Number of selected articles per category code, most frequent first.
category_counts = selected_df['CATEGORY'].value_counts()
category_counts

b    5627
e    5279
t    1524
m     910
Name: CATEGORY, dtype: int64

### 51. 特徴量抽出

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Build the vocabulary from the training titles only, then encode the test and
# validation titles with that same fitted vocabulary.
vec_count = CountVectorizer()
x_train = vec_count.fit_transform(train_df['TITLE'])
x_test = vec_count.transform(test_df['TITLE'])
x_valid = vec_count.transform(valid_df['TITLE'])


def save_feature_matrix(out_path, matrix, rows_per_chunk=1000):
    """Write a sparse count matrix to ``out_path`` as dense integer text.

    The output has one line per row with space-separated integers, i.e. the
    same content as ``np.savetxt(out_path, matrix.toarray(), fmt='%d')``.
    Rows are converted from sparse to dense ``rows_per_chunk`` at a time so
    the full n_rows x n_features dense array is never held in memory.

    Args:
        out_path: Destination text file; overwritten if it exists.
        matrix: Sparse matrix supporting row slicing and ``toarray()``.
        rows_per_chunk: Number of rows densified per write.
    """
    with open(out_path, 'w') as out_file:
        for start in range(0, matrix.shape[0], rows_per_chunk):
            # Sparse -> dense conversion for this slice of rows only.
            dense_rows = matrix[start:start + rows_per_chunk].toarray()
            np.savetxt(out_file, dense_rows, fmt='%d')


feature_outputs = (
    ('data/chap6/train.feature.txt', x_train),
    ('data/chap6/valid.feature.txt', x_valid),
    ('data/chap6/test.feature.txt', x_test),
)
for feature_path, feature_matrix in feature_outputs:
    save_feature_matrix(feature_path, feature_matrix)

### 52. 学習

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the title count features, using the
# CATEGORY column as the label.
# max_iter is raised above scikit-learn's default of 100: on high-dimensional,
# unscaled count features the optimizer commonly stops before converging and
# emits a ConvergenceWarning.
# NOTE(review): confirm the warning is gone with your scikit-learn version;
# increase further if it is not.
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, train_df['CATEGORY'])

### 53. 予測

In [None]:
# Display names for the one-letter category codes.
dic = {
    'b': 'business',
    't': 'science and technology',
    'e': 'entertainment',
    'm': 'health',
}


def predict(text):
    """Print the predicted probability of every category for one headline.

    The headline is vectorized with the fitted ``vec_count`` and scored with
    ``clf``. One "<category name>: <probability>" line is printed per class,
    in the order of ``clf.classes_``. Nothing is returned.
    """
    features = vec_count.transform([text])  # the vectorizer expects an iterable of documents
    for class_probabilities in clf.predict_proba(features):
        for code, probability in zip(clf.classes_, class_probabilities):
            print(dic[code] + ':', probability)


# Demonstrate on the first headline of the training split.
s = train_df['TITLE'].iloc[0]
print(s)
predict(s)

### 54. 正解率の計測

In [None]:
from sklearn.metrics import accuracy_score

# Gold labels of the training and test splits.
y_train, y_test = train_df['CATEGORY'], test_df['CATEGORY']

# Labels predicted by the fitted classifier for the same two splits.
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)

# Print training accuracy first, then test accuracy.
for gold_labels, predicted_labels in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(accuracy_score(gold_labels, predicted_labels))