In [None]:
import os
import pandas as pd
import numpy as np
from pickle import dump, load

In [None]:
# Read the data so its format and text content can be inspected before
# cleaning and feature extraction.
def read_data(filename, sep='\t', encoding='utf-8', nrows=None):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        filename: Path of the file to read.
        sep: Field separator. Defaults to a tab, which is what this
            notebook's data files are read with.
        encoding: Text encoding of the file. Defaults to UTF-8.
        nrows: Optional maximum number of data rows to read; ``None``
            (the default) reads the whole file. Useful for quick
            experiments on a sample.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(filename, encoding=encoding, sep=sep, nrows=nrows)

In [None]:
# Locations of the training set and the "test_a" set, relative to the notebook.
train_data, test_data = './data/train_set.csv', './data/test_a.csv'
# Read both files (training set first) with the shared loader.
df_train, df_test = (read_data(path) for path in (train_data, test_data))

文本数据已被脱敏处理, 因此无法进行去停用词处理。
其他可进行的处理还有高频词过滤、逆文档频率 (IDF) 加权等, 这里暂不进行此类处理。
首先构造一个简单的 baseline。

In [None]:
# Import the packages required by the machine-learning classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [None]:
# Build the inputs and labels for the machine-learning classifier.
# scikit-learn's CountVectorizer feature extractor can consume the raw
# `text` column directly.
x = df_train['text'].to_numpy().tolist()
y = df_train['label'].to_numpy().tolist()
# Split into a training part and a held-out part with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1234,
)

In [None]:
# A text classifier built on top of scikit-learn components.
class text_classifier():
    """Pair a text vectorizer with a classifier behind one small interface.

    Attributes are public so callers can inspect or swap the parts:
        classifier: Estimator exposing ``fit``, ``predict`` and ``score``.
        vectorizer: Feature extractor exposing ``fit_transform`` and
            ``transform``.
    """

    def __init__(self, classifier=None, vectorizer=None):
        """Create the classifier.

        Args:
            classifier: Estimator to use. When ``None`` (the default) a new
                ``MultinomialNB()`` is created.
            vectorizer: Feature extractor to use. When ``None`` (the
                default) a new word-level ``CountVectorizer`` with 1- to
                4-grams and at most 20000 features is created.
        """
        # Build the defaults per instance. Default argument values are
        # evaluated once, at definition time, so estimator objects placed in
        # the signature would be shared (and re-fitted) by every instance.
        if classifier is None:
            classifier = MultinomialNB()
        if vectorizer is None:
            vectorizer = CountVectorizer(analyzer='word',
                                         ngram_range=(1, 4),
                                         max_features=20000)
        self.classifier = classifier
        self.vectorizer = vectorizer

    def get_features(self, x):
        """Vectorize documents with the already-fitted vectorizer.

        Args:
            x: Iterable of documents, each a string of space-separated
                tokens.

        Returns:
            Whatever ``self.vectorizer.transform`` returns for ``x``.
        """
        return self.vectorizer.transform(x)

    def fit(self, x, y):
        """Fit the vectorizer on ``x``, then the classifier on its features.

        Args:
            x: Iterable of training documents.
            y: Labels aligned with ``x``.

        Returns:
            ``self``, so calls can be chained.
        """
        # fit_transform learns the vocabulary and produces the feature matrix
        # in one pass instead of tokenizing the whole corpus twice.
        features = self.vectorizer.fit_transform(x)
        self.classifier.fit(features, y)
        return self

    def predict(self, x):
        """Predict labels for one document or for several.

        Args:
            x: Either a single document (``str``) or an iterable of
                documents.

        Returns:
            The result of ``self.classifier.predict``: one prediction per
            document (a single-element result for a lone string).
        """
        # A bare string is a single document; wrap it so the vectorizer does
        # not see it as an iterable of characters.
        documents = [x] if isinstance(x, str) else x
        return self.classifier.predict(self.get_features(documents))

    def score(self, x, y):
        """Return ``self.classifier.score`` for documents ``x`` and labels ``y``."""
        return self.classifier.score(self.get_features(x), y)

    def save_model(self, model_file):
        """Pickle the ``(classifier, vectorizer)`` pair to ``model_file``."""
        with open(model_file, 'wb') as f:
            dump((self.classifier, self.vectorizer), f)

    def load_model(self, model_file):
        """Restore the ``(classifier, vectorizer)`` pair saved by ``save_model``.

        Returns:
            ``self``, so calls can be chained.
        """
        # SECURITY: unpickling can execute arbitrary code. Only load model
        # files that were produced by a trusted run of save_model.
        with open(model_file, 'rb') as f:
            self.classifier, self.vectorizer = load(f)
        return self

In [None]:
# Use the defaults: the MultinomialNB naive Bayes classifier together with the
# default feature extractor.
classifier = text_classifier()
# NOTE: despite the .h5 extension, save_model writes this file with pickle.
model_file = './data/multinomialNB.h5'
# Train and cache the model unless a saved one is already on disk.
# os.path.isfile is False for a missing path, so it covers the exists check.
if not os.path.isfile(model_file):
    classifier.fit(x_train, y_train)
    classifier.save_model(model_file)
else:
    classifier.load_model(model_file)

In [None]:
# Report accuracy on the training split and on the held-out split created by
# train_test_split. Training accuracy alone is measured on data the model was
# fitted on, so it says little about how the model generalizes.
train_accuracy = classifier.score(x_train, y_train)
holdout_accuracy = classifier.score(x_test, y_test)
train_accuracy, holdout_accuracy