# 提取文本特征
词向量 + 字向量

In [1]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
import jieba
import zhconv

## 保存分词结果

In [5]:
def save_cut_word_rst(file_path):
    """Segment the ``content`` column of a CSV into words and save the result.

    Reads ``file_path + ".csv"``, converts every entry to Simplified Chinese
    (``zh-cn``) with ``zhconv``, tokenises it with ``jieba`` and writes one
    line of space-separated tokens per row to
    ``file_path + "_words_list.txt"``.

    Args:
        file_path: Dataset path without the ``.csv`` extension.
    """
    frame = pd.read_csv(file_path + ".csv", usecols=['content'])
    with open(file_path + "_words_list.txt", 'w') as f_w:
        for raw_text in frame['content'].values:
            simplified = zhconv.convert(raw_text.strip(), 'zh-cn')
            # Drop tokens that consist only of whitespace.
            tokens = [tok for tok in jieba.cut(simplified) if tok.strip()]
            f_w.write(' '.join(tokens) + "\n")

In [6]:
# Tokenise every dataset split; each call writes <split>_words_list.txt next
# to the corresponding <split>.csv.
for file_path in (
        './datasets/trainingset',
        './datasets/validationset',
        './datasets/testa',
):
    save_cut_word_rst(file_path)

In [7]:
# Concatenate the per-split word lists into a single corpus file, in the
# order train, validation, test.
with open("./datasets/all_dataset_word_list.txt", 'w') as f_w:
    for file_path in (
            './datasets/trainingset',
            './datasets/validationset',
            './datasets/testa',
    ):
        with open(file_path + '_words_list.txt') as f_r:
            # Copy the split's lines through unchanged.
            f_w.writelines(f_r)

## 词向量

In [8]:
# Stop words

def get_stop_word_set(only_punctuation=False):
    """Load a set of stop tokens from one of the stop lists under ./datasets.

    Args:
        only_punctuation: When true, read the punctuation stop list and also
            treat a single space as a stop token. Otherwise read the extended
            stop-word list.

    Returns:
        A set of strings. Every non-empty line of the list is stored as one
        whole entry. In punctuation mode the individual characters of each
        entry are stored as well, so the set can be used for both word-level
        and character-level filtering.
    """
    fname = './datasets/哈工大停用标点表.txt' if only_punctuation else \
        './datasets/哈工大停用词表扩展.txt'
    words_set = set()
    with open(fname) as f_r:
        for line in f_r:
            entry = line.strip()
            if not entry:
                continue
            # Keep the entry intact: set(entry) would break a multi-character
            # stop word into single characters, so the word itself would
            # never match a token.
            words_set.add(entry)
            if only_punctuation:
                # Character-level callers test one character at a time, so a
                # multi-character mark must also match per character.
                words_set.update(entry)
    if only_punctuation:
        words_set.add(' ')
    return words_set

In [9]:
class MySentence:
    """Re-iterable stream of token lists read from a whitespace-tokenised file.

    The file is re-opened on every iteration, so the same object can be
    traversed several times.
    """

    def __init__(self, dirname, filter_ws):
        """Store the corpus location and the tokens to drop.

        Args:
            dirname: Path of the text file to read (a file, despite the name).
            filter_ws: Container of tokens to discard; membership is tested
                with ``in``.
        """
        self.dirname = dirname
        self.filter_ws = filter_ws

    def __iter__(self):
        """Yield, for each line of the file, the list of tokens that are kept."""
        # The context manager closes the handle when the generator is
        # exhausted or discarded; a bare open() leaked one handle per pass.
        with open(self.dirname) as f_r:
            for line in f_r:
                yield [
                    tok for tok in line.strip().split()
                    if tok not in self.filter_ws
                ]

In [10]:
# Train word vectors on the merged word corpus, dropping every token that is
# in the punctuation stop set.
# NOTE(review): `size`, `iter` and `wv.vocab` are the pre-4.0 gensim names;
# confirm the pinned gensim version before upgrading.
sentences = MySentence(
    "./datasets/all_dataset_word_list.txt",
    get_stop_word_set(only_punctuation=True),
)
model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=2,
    sg=1,
    iter=8,
    workers=8,
    compute_loss=True,  # needed so the loss can be queried below
)
# Report the latest training loss and the vocabulary size.
print(model.get_latest_training_loss(), len(model.wv.vocab))
model.save("./saved/word2vec.model")

69164128.0 127527


## 字向量

In [11]:
# Save all single characters into one shared file (append mode 'a').


def save_char_content(save_path, fpath, stop_word_set):
    """Append the characters of each ``content`` entry of a CSV to one file.

    Every entry is converted to Simplified Chinese (``zh-cn``) and split into
    characters. Characters found in ``stop_word_set`` and whitespace
    characters are dropped; the rest are written as one space-separated line
    per entry. The output is opened in append mode, so successive calls
    accumulate in ``save_path``.

    Args:
        save_path: File the character lines are appended to.
        fpath: CSV file with a ``content`` column.
        stop_word_set: Container of characters to drop.
    """
    frame = pd.read_csv(fpath, usecols=['content'])
    with open(save_path, 'a') as f_w:
        for text in frame['content'].values:
            kept_chars = [
                ch for ch in zhconv.convert(text, 'zh-cn')
                if ch not in stop_word_set and ch.strip()
            ]
            f_w.write(" ".join(kept_chars) + '\n')

In [12]:
# Save the single characters of one dataset into its own file.


def save_char_content_single(fpath, stop_word_set):
    """Write the characters of each ``content`` entry to ``<stem>_char_list.txt``.

    The output path is ``fpath`` with its extension replaced by
    ``_char_list.txt``; it is printed and then overwritten. Every entry is
    converted to Simplified Chinese (``zh-cn``) and split into characters.
    Characters found in ``stop_word_set`` and whitespace characters are
    dropped; the rest are written as one space-separated line per entry.

    Args:
        fpath: CSV file with a ``content`` column.
        stop_word_set: Container of characters to drop.
    """
    from os.path import splitext

    data = pd.read_csv(fpath, usecols=['content'])
    # splitext only removes a real extension of the final path component;
    # slicing at rfind('.') cut at dots in "./" or in directory names, and
    # dropped the last character when the path contained no dot at all.
    out_path = splitext(fpath)[0] + '_char_list.txt'
    print(out_path)
    with open(out_path, 'w') as f_w:
        for con in data['content'].values:
            kept_chars = [
                ch for ch in zhconv.convert(con, 'zh-cn')
                if ch not in stop_word_set and ch.strip()
            ]
            f_w.write(' '.join(kept_chars) + '\n')

In [15]:
# Build the character-level corpora: one combined file for all splits plus
# one file per split.
all_csv = [
    './datasets/trainingset.csv',
    './datasets/validationset.csv',
    './datasets/testa.csv',
]
stop_word_set = get_stop_word_set(only_punctuation=True)
save_char_path = './datasets/all_char_list.txt'
# save_char_content() opens the combined file in append mode, so start from an
# empty file; otherwise re-running this cell would add a second copy of every
# split to the corpus.
with open(save_char_path, 'w'):
    pass
for path in all_csv:
    save_char_content(save_char_path, path, stop_word_set)
    save_char_content_single(path, stop_word_set)

./datasets/trainingset_char_list.txt
./datasets/validationset_char_list.txt
./datasets/testa_char_list.txt


In [16]:
# Train character vectors on the combined character corpus. Nothing is
# filtered here (empty filter list); the corpus was already filtered when it
# was written.
# NOTE(review): `size`, `iter` and `wv.vocab` are the pre-4.0 gensim names;
# confirm the pinned gensim version before upgrading.
sentences = MySentence(save_char_path, [])
model = Word2Vec(
    sentences,
    size=100,
    window=10,
    min_count=2,
    sg=1,
    iter=15,
    workers=8,
    compute_loss=True,  # needed so the loss can be queried below
)
# Report the latest training loss, then the vocabulary size.
print(model.get_latest_training_loss())
print(len(model.wv.vocab))
model.save('./saved/char2vec.model')

73594880.0
6380


# 文本统计特征

In [17]:
import numpy as np
import pickle

In [18]:
def get_data(file_path):
    """Return the lines of a text file with surrounding whitespace stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line, in file order.
    """
    with open(file_path) as f:
        return [line.strip() for line in f]

In [19]:
# Load the segmented text of each split, one document per line.
train_content_ori = get_data('./datasets/trainingset_words_list.txt')
val_content_ori = get_data('./datasets/validationset_words_list.txt')
test_content_ori = get_data('./datasets/testa_words_list.txt')

In [20]:
# Number of documents in the train / validation / test splits.
print(*map(len, (train_content_ori, val_content_ori, test_content_ori)))

105000 15000 15000


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Fit the TF-IDF vocabulary and weights on the training split only.
# NOTE(review): the vectorizer uses default settings; the token pattern shown
# in this cell's output only matches runs of two or more word characters, so
# single-character words in the segmented text are dropped — confirm this is
# intended.
vectorizer = TfidfVectorizer()
vectorizer.fit(train_content_ori)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [23]:
# Map every split into the TF-IDF space fitted on the training data.
train_content, val_content, test_content = (
    vectorizer.transform(docs)
    for docs in (train_content_ori, val_content_ori, test_content_ori)
)

In [24]:
from sklearn.decomposition import TruncatedSVD


In [25]:
# Fit a truncated SVD with 20*4 = 80 components on the training TF-IDF
# matrix, using a fixed random seed.
svd = TruncatedSVD(n_components=20*4, n_iter=7, random_state=2018)
svd.fit(train_content)

TruncatedSVD(algorithm='randomized', n_components=80, n_iter=7,
             random_state=2018, tol=0.0)

In [26]:
# Reduce every split with the SVD fitted on the training data.
train_svd, val_svd, test_svd = (
    svd.transform(tfidf_matrix)
    for tfidf_matrix in (train_content, val_content, test_content)
)

In [27]:
# Persist the reduced features of each split under ./saved/<prefix>_<split>.
prefix = 'svd_tfidf_80'
for path_template, features in (
        ('./saved/%s_train', train_svd),
        ('./saved/%s_val', val_svd),
        ('./saved/%s_test', test_svd),
):
    np.save(path_template % prefix, features)