In [11]:
import os
import sys
import gensim
import jieba
import pandas as pd
import numpy as np
import joblib as jl
from scipy.sparse import csc_matrix
from collections import defaultdict, Counter

In [2]:
def read_file(path):
    """Return the complete contents of the file at ``path`` as raw bytes.

    The file is opened in binary mode, so no text decoding is performed.
    """
    handle = open(path, 'rb')
    try:
        return handle.read()
    finally:
        # Always release the descriptor, even if the read fails.
        handle.close()

def _load_split(split_path, topics, topic_id_map):
    """Read every document of one split and pair it with its topic id.

    Documents and labels are collected in the same pass so they cannot drift
    out of alignment; topics are visited in the given order and files in
    sorted name order.
    """
    data, labels = [], []
    for topic_name in topics:
        topic_dir = os.path.join(split_path, topic_name)
        for file_name in sorted(os.listdir(topic_dir)):
            data.append(read_file(os.path.join(topic_dir, file_name)))
            labels.append(topic_id_map[topic_name])
    return data, labels


def load_data(root_path):
    """Load the raw train/test documents and their integer topic labels.

    ``root_path`` must contain the sub-directories ``20news-bydate-train`` and
    ``20news-bydate-test``, each holding one directory per topic with one file
    per document.

    Args:
        root_path: directory that holds the two split directories.

    Returns:
        ``(train_data, train_label, test_data, test_label)`` where the data
        lists hold the raw bytes of each document (via ``read_file``) and the
        label lists hold the matching topic ids.

    Raises:
        AssertionError: if the train and test splits do not contain exactly
            the same topic directories.
    """
    train_path = os.path.join(root_path, '20news-bydate-train')
    test_path = os.path.join(root_path, '20news-bydate-test')

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # topic -> id mapping (and the document order) reproducible across runs.
    topic = sorted(os.listdir(train_path))
    test_topic = sorted(os.listdir(test_path))
    assert topic == test_topic, "train and test splits must share the same topics"

    topic_id_map = {name: idx for idx, name in enumerate(topic)}

    train_data, train_label = _load_split(train_path, topic, topic_id_map)
    test_data, test_label = _load_split(test_path, topic, topic_id_map)

    print("train data:%d" % len(train_data))
    print("train label:%d" % len(train_label))
    print("test data:%d" % len(test_data))
    # Report the label count here (the original printed len(test_data) twice).
    print("test label:%d" % len(test_label))
    return train_data, train_label, test_data, test_label

In [3]:
# Root folder of the corpus; load_data() joins the '20news-bydate-train' and
# '20news-bydate-test' sub-directories onto it.
root_path = "./data/20news-bydate"
# Raw documents plus integer topic labels for both splits.
train_data, train_label, test_data, test_label = load_data(root_path)

train data:11314
train label:11314
test data:7532
test label:7532


In [5]:
# Load the stop-word list (one entry per line, surrounding whitespace removed)
# into a set; doc_filter() tests tokens for membership in it.
with open('./data/stop_word.txt', 'r') as stop_word_file:
    stop_word = {line.strip() for line in stop_word_file}

In [6]:
def doc_cut(docs):
    """Lower-case every document and split it into a list of tokens.

    Args:
        docs: iterable of documents; each must provide ``lower()`` and be
            accepted by ``jieba.cut``.

    Returns:
        A list containing one token list per input document, in input order.
    """
    return [list(jieba.cut(raw_doc.lower())) for raw_doc in docs]

def doc_filter(docs):
    """Remove noise tokens from already-tokenised documents.

    A token survives only when it is alphanumeric, is not made up solely of
    digits, and is not contained in the module-level ``stop_word`` collection.

    Args:
        docs: iterable of token lists.

    Returns:
        A new list with one filtered token list per input document.
    """
    def keep(token):
        return token.isalnum() and not token.isdigit() and token not in stop_word

    return [[token for token in tokens if keep(token)] for tokens in docs]

In [7]:
# Tokenise the raw documents, then derive filtered copies (*_datac).
# NOTE: train_data/test_data are rebound from raw documents to token lists
# here, so this cell cannot be re-run on its own: doc_cut() would then call
# .lower() on lists. Re-run the load_data cell first.
train_data = doc_cut(train_data)
train_datac = doc_filter(train_data)

test_data = doc_cut(test_data)
test_datac = doc_filter(test_data)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.656 seconds.
Prefix dict has been built succesfully.


In [8]:
# Build the vocabulary from the tokenised training documents only.
# NOTE(review): this uses train_data (unfiltered tokens) rather than the
# filtered train_datac — confirm that is intended.
dictionary = gensim.corpora.Dictionary(train_data)
# Prune the vocabulary in place; per the gensim docs, no_below drops tokens
# seen in fewer than that many documents and keep_n caps the vocabulary size.
dictionary.filter_extremes(no_below=5, keep_n=100000)

## Build the train/test sets for Text-CNN

In [32]:
def build_train_test_set_id(docs, dictionary, length = 1024, pad_id = 0):
    """Encode tokenised documents as fixed-length sequences of dictionary ids.

    Each document is mapped through ``dictionary.doc2idx``; ids equal to -1
    are dropped. Sequences shorter than ``length`` are extended by repeating
    the document cyclically, longer ones are truncated to ``length``.

    Args:
        docs: iterable of token lists.
        dictionary: object exposing ``doc2idx(doc)`` (e.g. a gensim Dictionary).
        length: number of ids to emit per document.
        pad_id: id used to fill a document that has no known token at all.
            Such a document previously made the padding loop spin forever,
            because doubling an empty list never reaches ``length``. Note the
            default 0 is indistinguishable from the token whose id is 0.

    Returns:
        A list with one id list per document; every list has ``length``
        entries when ``length`` is positive.
    """
    res = []
    for doc in docs:
        ids = [idx for idx in dictionary.doc2idx(doc) if idx != -1]
        if not ids:
            # Nothing to tile: fall back to a single padding id.
            ids = [pad_id]
        if len(ids) < length:
            # Ceiling division: enough whole copies of the document to reach
            # `length`. Tiling produces the same prefix as repeated doubling.
            repeats = -(-length // len(ids))
            ids = ids * repeats
        res.append(ids[:length])
    return res
            

In [33]:
# Encode the filtered token lists as id sequences, using the function's
# default sequence length.
train_set = build_train_test_set_id(train_datac, dictionary)
test_set = build_train_test_set_id(test_datac, dictionary)

In [39]:
# Convert the id sequences and labels to NumPy arrays before saving them.
# NOTE: trainset/testset/trainlabel/testlabel are rebound later by the BOW
# section, so these values are only valid until those cells run.
trainset = np.array(train_set)
testset = np.array(test_set)

trainlabel = np.array(train_label)
testlabel = np.array(test_label)

In [42]:
# Persist the Text-CNN arrays as one joblib archive.
# NOTE(review): the recorded output below shows 'data.jl.z', which does not
# match this path — the cell was presumably edited after its last run.
jl.dump([trainset, trainlabel, testset, testlabel], './data/data-id.jl.z')

['data.jl.z']

## Build the train/test sets for BOW (bag-of-words)

In [54]:
def build_train_test_set_bow(docs, dictionary):
    """Convert tokenised documents to bag-of-words vectors.

    Args:
        docs: iterable of token lists.
        dictionary: object exposing ``doc2bow(doc)`` (e.g. a gensim Dictionary).

    Returns:
        A list holding the ``doc2bow`` result of every document, in input order.
    """
    return [dictionary.doc2bow(tokens) for tokens in docs]

def build_sparse_metrix(vecs, max_feature = -1):
    """Assemble bag-of-words vectors into a SciPy CSC document-term matrix.

    Args:
        vecs: sequence of documents, each an iterable of
            ``(feature_id, value)`` pairs. ``len(vecs)`` gives the row count.
        max_feature: ``-1`` keeps every feature and sizes the matrix from the
            largest feature id seen. Any other value keeps only features whose
            id is strictly smaller than ``max_feature``.

    Returns:
        A ``csc_matrix`` with one row per document. With ``max_feature == -1``
        it has ``largest_id + 1`` columns (one column if there are no entries).
        Otherwise it has ``max_feature + 1`` columns regardless of the data,
        so matrices built with the same cap always share a width. The
        ``+ 1`` is the historic width, kept for compatibility with previously
        saved data; that last column is always empty.
    """
    keep_all = max_feature == -1
    row, col, data = [], [], []
    for vec_index, vec in enumerate(vecs):
        for vec_pos, vec_val in vec:
            if keep_all or vec_pos < max_feature:
                row.append(vec_index)
                col.append(vec_pos)
                data.append(vec_val)

    if keep_all:
        n_cols = max(col, default=0) + 1
    else:
        # Derive the width from the cap, not from whether any entry passed
        # the filter; previously an all-empty input collapsed to one column.
        n_cols = max(max_feature, 0) + 1

    return csc_matrix((data, (row, col)), shape=(len(vecs), n_cols))

In [83]:
# Prune the vocabulary again, more aggressively, for the BOW features.
# NOTE: this modifies `dictionary` in place, so any cell run afterwards
# (including the Text-CNN cells above, if re-run) sees the reduced vocabulary.
dictionary.filter_extremes(no_below=10)
# Report how many tokens remain after pruning.
print ("keep word cnt:%d" %len(dictionary.keys()))

keep word cnt:15773


In [19]:
# Convert the tokenised documents to bag-of-words vectors.
# NOTE(review): this uses the unfiltered train_data/test_data, whereas the
# Text-CNN section uses the filtered *_datac lists — confirm that is intended.
# The names train_set/test_set are reused from the Text-CNN section.
train_set = build_train_test_set_bow(train_data, dictionary)
test_set = build_train_test_set_bow(test_data, dictionary)

In [84]:
# Use the current vocabulary size as the feature cap instead of a hard-coded
# number, so this cell stays correct if the dictionary filtering changes.
# Passing the same cap to both calls makes both matrices index features
# identically.
n_features = len(dictionary)
trainset = build_sparse_metrix(train_set, max_feature=n_features)
testset = build_sparse_metrix(test_set, max_feature=n_features)

trainlabel = np.array(train_label)
testlabel = np.array(test_label)

In [88]:
# Persist the BOW sparse matrices and label arrays as one joblib archive.
jl.dump([trainset, trainlabel, testset, testlabel], './data/data-bow.jl.z')

['./data/data-bow.jl.z']