In [18]:
import os
import pandas as pd
import numpy as np
from pickle import dump, load

import torch
import torch.nn as nn

In [19]:
# Read the data, inspect its format and text content, then clean it and extract features.
def read_data(filename):
    """Load a tab-separated, UTF-8 encoded file into a DataFrame.

    Args:
        filename: Path of the file to read.

    Returns:
        The ``pandas.DataFrame`` parsed from ``filename``.
    """
    return pd.read_csv(filename, sep='\t', encoding='utf-8')

In [20]:
def get_train():
    """Return the training DataFrame including a per-sample ``text_len`` column.

    If the cache file ``./data/train_processed.csv`` exists it is loaded with
    ``read_data``. Otherwise ``./data/train_set.csv`` is loaded, ``text_len``
    (the number of whitespace-separated tokens in ``text``) is computed, and
    the result is written to the cache file.

    Returns:
        The training ``pandas.DataFrame``; both code paths yield the same
        set of columns.
    """
    train_processed = './data/train_processed.csv'
    if os.path.isfile(train_processed):
        df_train = read_data(train_processed)
        # A cache written together with the row index reloads with a spurious
        # 'Unnamed: 0' column; drop it so the cached and fresh paths agree.
        df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
    else:
        train_data = './data/train_set.csv'
        df_train = read_data(train_data)
        # Length of each sample, measured in whitespace-separated tokens.
        df_train.loc[:, 'text_len'] = df_train.text.apply(lambda x: len(x.split()))
        # index=False: do not persist the row index, otherwise re-reading the
        # cache yields an extra column that the fresh path does not have.
        df_train.to_csv(train_processed, sep='\t', index=False)

    return df_train

In [21]:
# Load the test split from its raw file and the training frame via get_train().
test_data = './data/test_a.csv'
df_test = read_data(test_data)
df_train = get_train()

In [22]:
# Display the column index of the training frame.
df_train.columns

Index(['label', 'text', 'text_len'], dtype='object')

In [23]:
# Select the raw text column of the training frame.
x = df_train['text']


文本数据已被脱敏处理，无法进行去停用词处理。
可进行的处理还有高频词处理、逆文档频率（IDF）处理，这里先不进行此类处理。
首先构造一个简单的 baseline。

In [24]:
# Imports required by the classical machine-learning classifiers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Import the F1-score metric
from sklearn.metrics import f1_score

In [25]:
# Build the inputs and labels for the machine-learning classifiers.
# sklearn's CountVectorizer feature extractor can consume the text column directly.
x = df_train['text'].values.tolist()
y = df_train['label'].values.tolist()
# Fixed random_state keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1234,
)

In [26]:
# A text classifier built on sklearn: a vectorizer followed by an estimator.
class text_classifier():
    """Pair a text vectorizer with a classifier behind one fit/predict API.

    Args:
        classifier: Estimator exposing ``fit(X, y)``, ``predict(X)`` and
            ``score(X, y)``. When ``None`` (the default) a fresh
            ``MultinomialNB()`` is created for this instance.
        vectorizer: Feature extractor exposing ``fit_transform(docs)`` and
            ``transform(docs)``. When ``None`` (the default) a fresh
            ``CountVectorizer(analyzer='word', ngram_range=(1, 4),
            max_features=20000)`` is created for this instance.
    """

    def __init__(self, classifier=None, vectorizer=None):
        # Build the defaults per instance. Estimator objects placed directly
        # in the signature are created once at definition time and would be
        # shared (and re-fitted) by every default-constructed wrapper.
        if classifier is None:
            classifier = MultinomialNB()
        if vectorizer is None:
            vectorizer = CountVectorizer(analyzer='word',
                                         ngram_range=(1, 4),
                                         max_features=20000)
        self.classifier = classifier
        self.vectorizer = vectorizer

    def get_features(self, x):
        """Transform documents into features with the fitted vectorizer.

        Args:
            x: List of documents, each a string of space-separated words.
        """
        return self.vectorizer.transform(x)

    def fit(self, x, y):
        """Fit the vectorizer on ``x``, then the classifier on the features.

        Args:
            x: List of training documents.
            y: Labels aligned with ``x``.
        """
        # One fit_transform pass instead of fit() followed by a second
        # transform() over the same corpus.
        features = self.vectorizer.fit_transform(x)
        self.classifier.fit(features, y)

    def predict(self, x):
        """Return the classifier's predictions for the documents in ``x``."""
        return self.classifier.predict(self.get_features(x))

    def score(self, x, y):
        """Return the classifier's ``score`` for documents ``x`` and labels ``y``."""
        return self.classifier.score(self.get_features(x), y)

    def save_model(self, model_file):
        """Pickle ``(classifier, vectorizer)`` as one tuple into ``model_file``."""
        with open(model_file, 'wb') as f:
            dump((self.classifier, self.vectorizer), f)

    def load_model(self, model_file):
        """Replace classifier and vectorizer with the pair stored in ``model_file``.

        The file must have been written by ``save_model``.
        """
        # SECURITY: unpickling can execute arbitrary code; only load model
        # files that come from a trusted source.
        with open(model_file, 'rb') as f:
            self.classifier, self.vectorizer = load(f)

In [27]:
# Use the default setup: the MultinomialNB naive Bayes classifier with the
# default feature extractor.
model_file = './data/multinomialNB.h5'
classifier_NB = text_classifier()
if not os.path.isfile(model_file):
    # No saved model yet: train on the training split and save the result.
    classifier_NB.fit(x_train, y_train)
    classifier_NB.save_model(model_file)
else:
    # Reuse the previously saved model instead of retraining.
    classifier_NB.load_model(model_file)

In [28]:
# Evaluate the naive Bayes model on the held-out split, printing the
# micro-, macro- and weighted-averaged F1 scores in that order.
y_predict = classifier_NB.predict(x_test)
for average_mode in ('micro', 'macro', 'weighted'):
    print(f1_score(y_test, y_predict, average=average_mode))

0.8568
0.8076463748567075
0.8598682072412864


In [29]:
# Use an SVM classifier (linear kernel) with a TF-IDF feature extractor.
classifier = SVC(kernel='linear')
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,3),
    max_features=20000)
classifier_svc = text_classifier(classifier=classifier, vectorizer=vectorizer)
model_svc = './data/svc.h5'
# Check the SVC model file itself. Testing the naive Bayes path
# (`model_file`) here would force a needless SVC retrain whenever the NB
# cache is missing, even though a saved SVC model is available.
if os.path.isfile(model_svc):
    classifier_svc.load_model(model_svc)
else:
    classifier_svc.fit(x_train, y_train)
    classifier_svc.save_model(model_svc)

In [None]:
# Evaluate the SVM model on the held-out split, printing the micro-, macro-
# and weighted-averaged F1 scores in that order.
y_predict = classifier_svc.predict(x_test)
for average_mode in ('micro', 'macro', 'weighted'):
    print(f1_score(y_test, y_predict, average=average_mode))