In [1]:
%%time
from tool import *
import pandas as pd

# Boards to pull posts from; the board name is used as the class label below.
boards = ['Gossiping', 'C_Chat', 'WomenTalk', 'Baseball', 'HatePolitics', 'NBA']

# Fetch title, body and board for every post on the selected boards.
df = pd.DataFrame(
    get_data(
        connect_db(),
        {'board': {'$in': boards}},
        projection={'raw_title': 1, 'raw_text': 1, '_id': 0, 'board': 1},
    )
)

# Labels: one board name per post.
answer = df['board']

# Title and body joined by a single space form the text of each post.
df['raw_corpus'] = df['raw_title'] + ' ' + df['raw_text']

# Run every post through the project's SentenceProcessor.
# NOTE(review): presumably cut_and_remove returns a list of tokens (later code
# joins each entry with ' ') - confirm against tool.py.
sp = SentenceProcessor()
corpus = [sp.cut_and_remove(post) for post in df['raw_corpus']]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.716 seconds.
Prefix dict has been built succesfully.


CPU times: user 2.15 s, sys: 232 ms, total: 2.39 s
Wall time: 22.7 s


## Bag of Words 詞袋模型

In [2]:
# BOW
from sklearn.feature_extraction.text import CountVectorizer

def get_bow(corpus, max_features=500):
    """Build a bag-of-words count matrix for a tokenized corpus.

    Args:
        corpus: iterable of documents, each an iterable of string tokens.
        max_features: vocabulary size cap passed to CountVectorizer
            (defaults to 500).

    Returns:
        A tuple ``(x_bow, count_vec)``: the dense document-term count array
        and the fitted CountVectorizer.
    """
    count_vec = CountVectorizer(stop_words='english', max_features=max_features)
    # The vectorizer expects one string per document, so tokens are re-joined
    # with spaces before fitting.
    # NOTE(review): CountVectorizer's default token_pattern only keeps tokens
    # of two or more word characters, so single-character tokens are dropped -
    # confirm this is intended for this corpus.
    x_bow = count_vec.fit_transform([' '.join(sentence) for sentence in corpus])
    # toarray() already yields a dense ndarray; no extra numpy wrapping needed,
    # which also removes the dependency on `np` being imported before this cell.
    return x_bow.toarray(), count_vec

# Bag-of-words features for the whole corpus, plus the fitted vectorizer.
x_bow, count_vec = get_bow(corpus)

## TF-IDF 

* sklearn TfidfVectorizer

In [3]:
# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf(corpus, max_features=500):
    """Build a TF-IDF matrix for a tokenized corpus.

    Args:
        corpus: iterable of documents, each an iterable of string tokens.
        max_features: vocabulary size cap passed to TfidfVectorizer
            (defaults to 500).

    Returns:
        A tuple ``(x_tfidf, tfidf)``: the dense TF-IDF array and the fitted
        TfidfVectorizer.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    # One space-joined string per document is what the vectorizer consumes.
    x_tfidf = tfidf.fit_transform([' '.join(sentence) for sentence in corpus])
    # toarray() already yields a dense ndarray, so `np` is not required here
    # (numpy is only imported in a later cell).
    return x_tfidf.toarray(), tfidf

# TF-IDF features for the whole corpus, plus the fitted vectorizer.
x_tfidf, tfidf = get_tfidf(corpus)

In [4]:
%%time
# word2vec
from gensim.models import Word2Vec
import numpy as np


def build_word2vec_model(size=100, window=5, min_count=5, sentences=None):
    """Train a Word2Vec model and save it under ``models/``.

    Args:
        size: dimensionality of the word vectors.
        window: context window size passed to Word2Vec.
        min_count: minimum word frequency passed to Word2Vec.
        sentences: tokenized documents to train on. When omitted, the
            notebook-level ``corpus`` variable is used.

    Returns:
        The trained Word2Vec model.
    """
    import os

    if sentences is None:
        # Fall back to the notebook-level corpus so existing calls keep working.
        sentences = corpus
    model = Word2Vec(sentences, size=size, window=window, workers=4, min_count=min_count)
    # Make sure the output directory exists before saving.
    os.makedirs('models', exist_ok=True)
    model.save('models/w2v_size_{}_window_{}_min_count_{}.model'.format(size, window, min_count))
    return model

def get_avg_vector(content, word2id, w2v_model, size=100):
    """Average the word vectors of the known tokens in one document.

    Args:
        content: iterable of tokens for a single document.
        word2id: mapping whose keys are the in-vocabulary words; tokens not
            present in it are skipped.
        w2v_model: either a trained Word2Vec model (vectors are read from its
            ``wv`` attribute) or any object indexable by word.
        size: dimensionality of the vectors.

    Returns:
        The mean vector of all in-vocabulary tokens, or an all-zero float32
        vector of length ``size`` when the document has none.
    """
    # Prefer the `.wv` lookup; indexing the model object directly is deprecated
    # in gensim. Objects without `.wv` are indexed as-is.
    vectors = getattr(w2v_model, 'wv', w2v_model)
    total = np.zeros(size, dtype="float32")
    cnt = 0
    for word in content:
        if word in word2id:
            cnt += 1
            total = np.add(total, vectors[word])
    if cnt == 0:
        # No known token: return zeros instead of dividing 0 by 0, which would
        # produce a NaN vector.
        return total
    return np.divide(total, cnt)

def get_w2v(corpus, word2id, model, size):
    """Stack the averaged word vector of every document into one array.

    Each document in ``corpus`` is passed to ``get_avg_vector`` together with
    ``word2id``, ``model`` and ``size``; the per-document results are combined
    with ``np.array``.
    """
    doc_vectors = []
    for sentence in corpus:
        doc_vectors.append(get_avg_vector(sentence, word2id, model, size))
    return np.array(doc_vectors)

# Train 300-dimensional embeddings.
w2v_model = build_word2vec_model(size=300, window=5, min_count=5)

# Assign a 1-based id to every word in the trained vocabulary.
word2id = {word: idx for idx, word in enumerate(w2v_model.wv.vocab, start=1)}
print(len(word2id))

# One averaged embedding per document.
x_w2v = get_w2v(corpus, word2id, w2v_model, size=300)

19751
CPU times: user 20.3 s, sys: 256 ms, total: 20.5 s
Wall time: 9.43 s




In [5]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split


def data_split(X, y, test_size=0.2, random_state=40):
    """Split features and labels into train and test parts.

    Thin wrapper around ``train_test_split`` with a fixed default test
    fraction (0.2) and seed (40). Returns whatever ``train_test_split``
    returns for ``(X, y)``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)
# Split each feature representation against the same labels. All three calls
# use data_split's default test_size and random_state.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = data_split(x_bow, answer)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = data_split(x_tfidf, answer)
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = data_split(x_w2v, answer)

In [6]:
# save model

from sklearn.externals import joblib

def save_model(model, file):
    """Serialize ``model`` with joblib to ``models/<file>``.

    Args:
        model: the object to persist.
        file: file name (relative to the ``models/`` directory).
    """
    import os

    # Create the target directory on first use so dumping does not fail on a
    # fresh checkout.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/{}'.format(file))
    
def load_model(file):
    """Load and return the object stored with joblib at ``models/<file>``.

    joblib files are pickle-based, so only load files from a trusted source.
    """
    model_path = 'models/{}'.format(file)
    return joblib.load(model_path)



In [7]:
# MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


def MNB(X_train, y_train, X_test, y_test):
    """Fit a multinomial naive Bayes classifier and report its test results.

    Prints the classifier's score on the test split, then the per-class
    classification report, and returns the fitted classifier.
    """
    classifier = MultinomialNB().fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)
    print(accuracy)
    report = classification_report(y_test, classifier.predict(X_test))
    print(report)
    return classifier

# Train naive Bayes on the bag-of-words split and persist the fitted model.
mnb = MNB(X_train_bow, y_train_bow, X_test_bow, y_test_bow)
save_model(mnb, 'MultinomialNB.model')

0.7951176983435048
              precision    recall  f1-score   support

    Baseball       0.92      0.80      0.85       177
      C_Chat       0.73      0.65      0.69       193
   Gossiping       0.85      0.71      0.78       182
HatePolitics       0.81      0.83      0.82       204
         NBA       0.92      0.93      0.93       192
   WomenTalk       0.63      0.84      0.72       199

    accuracy                           0.80      1147
   macro avg       0.81      0.79      0.80      1147
weighted avg       0.81      0.80      0.80      1147



In [8]:
# SVM

from sklearn.svm import LinearSVC

def get_svc_model(X_train, y_train, X_test, y_test):
    """Fit a LinearSVC and report its test results.

    Prints the score on the test split followed by the classification
    report, and returns the fitted model.
    """
    model = LinearSVC()
    model.fit(X_train, y_train)

    # Score first, then the per-class report, in that print order.
    test_score = model.score(X_test, y_test)
    print(test_score)

    predicted = model.predict(X_test)
    print(classification_report(y_test, predicted))
    return model

# Train the linear SVM on the TF-IDF split.
svc = get_svc_model(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)
# Persist the fitted SVM (not the naive Bayes model) under the SVM file name.
save_model(svc, 'linearSVC.model')

0.8605056669572798
              precision    recall  f1-score   support

    Baseball       0.96      0.90      0.93       177
      C_Chat       0.80      0.79      0.79       193
   Gossiping       0.90      0.90      0.90       182
HatePolitics       0.82      0.84      0.83       204
         NBA       0.95      0.96      0.96       192
   WomenTalk       0.76      0.79      0.77       199

    accuracy                           0.86      1147
   macro avg       0.86      0.86      0.86      1147
weighted avg       0.86      0.86      0.86      1147



In [9]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

def get_rf_model(X_train, y_train, X_test, y_test, random_state=None):
    """Fit a random forest classifier and report its test results.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for reporting.
        random_state: optional seed forwarded to RandomForestClassifier so a
            run can be reproduced. The default ``None`` keeps the library's
            unseeded behaviour.

    Returns:
        The fitted RandomForestClassifier. The test score and the
        classification report are printed as a side effect.
    """
    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X_train, y_train)
    print(rf.score(X_test, y_test))
    y_predict = rf.predict(X_test)
    print(classification_report(y_test, y_predict))
    return rf

# Train the random forest on the averaged word2vec split and persist it.
rf = get_rf_model(X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v)
save_model(rf, 'randomForest.model')



0.6521360069747166
              precision    recall  f1-score   support

    Baseball       0.75      0.79      0.77       177
      C_Chat       0.53      0.60      0.56       193
   Gossiping       0.44      0.45      0.45       182
HatePolitics       0.81      0.71      0.76       204
         NBA       0.86      0.84      0.85       192
   WomenTalk       0.55      0.53      0.54       199

    accuracy                           0.65      1147
   macro avg       0.66      0.65      0.65      1147
weighted avg       0.66      0.65      0.65      1147

