In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import jieba
import jieba.posseg as pseg
import os


In [24]:
# Read a CSV; if it has no `cut_words` column, add one and write it back to the same file.
def append_cut_words(path, keep_name=True, reset=False, saved=True):
    """Load the CSV at ``path`` and make sure it carries a ``cut_words`` column.

    ``cut_words`` holds the jieba segmentation of each row's ``txt`` value,
    with the tokens joined by single spaces.

    Args:
        path: Location of the CSV file; it must contain a ``txt`` column.
        keep_name: When True, segment with ``jieba.cut`` and keep every token.
            When False, segment with ``jieba.posseg`` and drop tokens tagged
            ``nr`` (person names).
        reset: When True, recompute ``cut_words`` even if the column already
            exists in the file.
        saved: When True, write the resulting frame back to ``path``
            (overwriting it, without the index).

    Returns:
        The DataFrame read from ``path``, including the ``cut_words`` column.
    """
    df = pd.read_csv(path)
    if 'cut_words' in df.columns and not reset:
        # Already segmented and no recomputation requested: reuse what is on disk.
        return df

    # pd.read_csv loads empty cells as NaN (a float), while jieba expects text.
    # Segment such rows as empty strings instead of aborting the whole run;
    # the `txt` column stored in the frame is left untouched.
    texts = df['txt'].fillna('').astype(str)

    if keep_name:
        # Keep person names: plain segmentation.
        doc_words = [list(jieba.cut(doc)) for doc in tqdm(texts)]
    else:
        # Drop person names: POS-tagged segmentation, filtering out the `nr` tag.
        doc_words = [
            [word for word, tag in pseg.cut(doc) if tag != 'nr']
            for doc in tqdm(texts)
        ]

    df['cut_words'] = [" ".join(cut_doc) for cut_doc in doc_words]

    if saved:
        df.to_csv(path, index=False)

    return df

In [4]:
# Segment every dataset split in place: append_cut_words is called with its
# defaults, so each CSV is rewritten with the added `cut_words` column unless
# the column is already there.
split_files = ['train.csv', 'test.csv', 'valid.csv']
for fname in split_files:
    path = os.path.join('../data/csv/', fname)
    message = 'start cut doc in {}...'.format(path)
    print(message)
    append_cut_words(path)

start cut doc in ../data/csv/train.csv...
100%|██████████| 44974/44974 [01:54<00:00, 359.11it/s]
  0%|          | 24/5621 [00:00<00:23, 233.70it/s]start cut doc in ../data/csv/test.csv...
100%|██████████| 5621/5621 [00:14<00:00, 368.65it/s]
  0%|          | 28/5624 [00:00<00:20, 274.56it/s]start cut doc in ../data/csv/valid.csv...
100%|██████████| 5624/5624 [00:14<00:00, 395.24it/s]


In [25]:
# Build a person-name-free variant of every split. The source CSVs are not
# modified (saved=False); the results go to a separate `without_name` directory.
out_dir = '../data/csv/without_name/'
# DataFrame.to_csv does not create missing directories, so make sure the target
# exists before starting the slow POS-tagged segmentation (train.csv alone took
# about 24 minutes in the recorded run) rather than failing at write time.
os.makedirs(out_dir, exist_ok=True)

for fname in ['train.csv', 'test.csv', 'valid.csv']:
    path = os.path.join('../data/csv/', fname)
    print('start cut doc in {}...'.format(path))
    # reset=True forces recomputation even though `cut_words` may already exist.
    df = append_cut_words(path, keep_name=False, reset=True, saved=False)
    to_path = os.path.join(out_dir, fname)
    df.to_csv(to_path, index=False)

start cut doc in ../data/csv/train.csv...
100%|██████████| 44974/44974 [24:26<00:00, 30.68it/s]
  0%|          | 0/5621 [00:00<?, ?it/s]start cut doc in ../data/csv/test.csv...
100%|██████████| 5621/5621 [03:13<00:00, 29.11it/s]
  0%|          | 0/5624 [00:00<?, ?it/s]start cut doc in ../data/csv/valid.csv...
100%|██████████| 5624/5624 [03:00<00:00, 31.22it/s]
