分类模型
NB/LR/SVM/LSTM(GRU)/CNN

In [12]:
# Import packages
import jieba
import random
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
datadir = './data/'
processed_data_file = './data/processed.csv'
stopwords_file = './data/stopwords.txt'

In [3]:
def get_stopwords():
    stopwords=pd.read_csv(stopwords_file,
                    index_col=False,quoting=3,sep="\t",
                    names=['stopword'], encoding='utf-8')
    stopwords=stopwords['stopword'].values
    return stopwords

In [4]:
def text_preprocess(lines):
    sentences = []
    stopwords = get_stopwords()
    for line in lines:
        try:
            line = line.strip()
            segs = jieba.lcut(line)
            segs = list(filter(lambda x: len(x) > 1, segs))
            segs = list(filter(lambda x: x not in stopwords, segs))
            sentences.append(" ".join(segs))
        except Exception as e:
            print(line)
            print(e)
            break
    return sentences

In [5]:
def get_datasets():
    df = None
    if os.path.exists(processed_data_file):
        if os.path.isfile(processed_data_file):
            df = pd.read_csv(processed_data_file)
        else:
            print("filename of processed data need to be changed.")
        return df
    datasets = []
    datafiles = [join(datadir, f) for f in listdir(datadir) if f.endswith('.csv')]
    for data_file in datafiles:
        category = data_file[7:-9]
        df = pd.read_csv(data_file, encoding='utf-8')
        df = df.dropna()
        lines = df.content.values.tolist()
        sentences = text_preprocess(lines)
        datasets.extend(list(zip(sentences, [category]*len(sentences))))
    random.shuffle(datasets)
    contents, labels = zip(*datasets)
    df = pd.DataFrame({'content':contents, 'labels':labels})
    df.dropna()
    df.to_csv('./data/processed.csv')
    return df   


In [6]:
df = get_datasets()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.966 seconds.
Prefix dict has been built successfully.


In [7]:
from sklearn.model_selection import train_test_split
x = df.content.values.tolist()
y = df.labels.values.tolist()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

In [8]:
# 从处理后(降噪)数据中抽取特征 (抽取词袋模型特征)
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    analyzer='word',          # tokenise by character ngrams
    max_features=4000         # keep the most common 4000 ngrams
)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=4000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [15]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))


vec = CountVectorizer(
    analyzer='word',          # tokenise by character ngrams
    ngram_range=(1,4),        # use ngrams of size 1,2,3
    max_features=4000         # keep the most common 4000 ngrams
)
vec.fit(x_train)
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))

0.6269911042899078
0.6268122960468477


从上面的结果可以看出, 虽然使用了ngram, 但是分类器的效果并没有得到提升, 下面我们使用K折交叉验证的方式 对数据进行拟合

In [16]:
# 使用交叉验证
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score

def stratifiedKFold_CV(x, y, clf_class, shuffle=True, n_folds=5, **kwargs):
    stratifiedk_fold = StratifiedKFold(n_splits=n_folds, shuffle=shuffle)
    y_pred = y[:]
    for train_index, test_index in stratifiedk_fold.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train = y[train_index]
        clf_class.fit(x_train, y_train)
        y_pred[test_index] = clf_class.predict(x_test)
    return y_pred

clf_class = MultinomialNB()
print(precision_score(y, stratifiedKFold_CV(vec.transform(x), np.array(y), NB), average='macro'))

0.5311764118935225


In [None]:
print(classifier.predict('这 是 有史以来 最 大 的 一 次 军舰 演习'))