In [1]:
import os
import time
from collections import Counter
import jieba
from jieba import posseg as pseg
import pickle

In [2]:
def load_file(file_name):
    """Read a UTF-8 text file into a flat character list and a list of lines.

    Args:
        file_name: path of the file to read.

    Returns:
        A ``(segs, sentences)`` tuple. ``sentences`` holds every line of the
        file with trailing whitespace (including the line terminator)
        removed. ``segs`` holds the individual characters of those stripped
        lines, concatenated in file order.

    Raises:
        IOError/OSError: if the file cannot be opened.
        UnicodeDecodeError: if a line is not valid UTF-8.
    """
    segs = []
    sentences = []
    with open(file_name, 'rb') as f:
        # Iterate over the file object rather than calling readlines(), so
        # the raw byte lines are not all kept in memory next to the decoded
        # data. Lines are still split on b'\n' only, exactly as before.
        for raw_line in f:
            line = raw_line.decode('utf-8').rstrip()
            segs.extend(line)  # a str iterates character by character
            sentences.append(line)
    return segs, sentences

In [3]:
# Path of the raw corpus file handed to load_file(); it is read as UTF-8 and
# each line is treated as one sentence. Relative to the working directory.
RAW_FILE = './corpus/ssa.out'

In [4]:
# Special marker strings. build_dataset() assigns PADDING_WORD the id 0 and
# appends UNK_WORD, START_WORD and END_WORD (in that order) after the most
# frequent characters. LINE_BREAK and WORD_DELIMITER are not referenced in
# the cells visible here.
LINE_BREAK = u'<Break>'
WORD_DELIMITER = u'/'
UNK_WORD = u'<UNK>'
PADDING_WORD = u'<PAD>'
START_WORD = u'<GO>'
END_WORD = u'<EOS>'

In [5]:
# raw_words: every character of the corpus as one flat list, in file order.
# raw_sentences: one right-stripped string per line of the file.
raw_words, raw_sentences = load_file(RAW_FILE)

In [6]:
# Sanity check: total number of characters, number of lines, and the first line.
print(len(raw_words))
print(len(raw_sentences))
print(raw_sentences[0])

9223496
941009
我  式森和树   17岁


In [7]:
def build_dataset(words, sentences, vocab_size):
    """Build a vocabulary from ``words`` and encode ``sentences`` as id lists.

    Id layout of the returned dictionary:
      * ``PADDING_WORD`` is always id 0,
      * the ``vocab_size - 4`` most frequent entries of ``words`` follow in
        descending frequency order,
      * ``UNK_WORD``, ``START_WORD`` and ``END_WORD`` take the next three ids.

    Args:
        words: iterable of tokens used for frequency counting.
        sentences: iterable of sentences; each sentence is iterated element
            by element (character by character for a ``str``).
        vocab_size: target dictionary size, including the four special
            tokens. The dictionary is smaller when ``words`` has fewer than
            ``vocab_size - 4`` distinct entries.

    Returns:
        ``(sentence_data, word_cnt, word_dict, reverse_dict)`` where
        ``sentence_data`` is a list of id lists (elements missing from the
        dictionary map to the ``UNK_WORD`` id), ``word_cnt`` is the list of
        ``(token, count)`` pairs from ``Counter.most_common``, ``word_dict``
        maps token -> id and ``reverse_dict`` maps id -> token.
    """
    # Four ids are reserved for the special tokens (PAD, UNK, GO, EOS).
    word_cnt = Counter(words).most_common(vocab_size - 4)

    word_dict = {PADDING_WORD: 0}
    for word, _ in word_cnt:
        word_dict[word] = len(word_dict)
    for special in (UNK_WORD, START_WORD, END_WORD):
        word_dict[special] = len(word_dict)

    # One dict.get() per element replaces the membership test on .keys()
    # followed by a second lookup.
    unk_id = word_dict[UNK_WORD]
    sentence_data = [
        [word_dict.get(w, unk_id) for w in sentence]
        for sentence in sentences
    ]

    reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))
    return sentence_data, word_cnt, word_dict, reverse_dict

In [8]:
# Target vocabulary size passed to build_dataset(); it includes the four
# special tokens, so at most VOCAB_SIZE - 4 corpus characters get their own id.
VOCAB_SIZE = 5000


In [None]:
# Build the vocabulary and encode the whole corpus, then print the elapsed
# wall-clock time in seconds.
t0 = time.time()
train_sentences, train_word_count, train_dict, train_reverse_dict = build_dataset(raw_words, raw_sentences, VOCAB_SIZE)
t1 = time.time()
print(t1-t0)

In [None]:
# Show the first three encoded sentences (each is a list of integer ids).
print(train_sentences[:3])

In [23]:
# Dictionary size (with the four special tokens) vs. the number of
# (character, count) pairs kept by most_common().
print(len(train_dict))
print(len(train_word_count))

5000
4996


In [25]:
# Pickle file that save_dict() writes and load_dict() reads back in the
# cells below. save_dict() does not create the parent directory.
DICT_FILE = './../results/corpus/dict_2.pkl'

In [26]:
def save_dict(wdict, cnt, file_name):
    """Pickle ``(wdict, cnt)`` as one tuple into ``file_name``.

    Args:
        wdict: the token -> id dictionary to store.
        cnt: the frequency data to store alongside it.
        file_name: destination path; an existing file is overwritten.
    """
    payload = (wdict, cnt)
    with open(file_name, 'wb') as out:
        # Protocol 2 is the newest pickle protocol that Python 2 can read.
        pickle.dump(payload, out, protocol=2)


In [27]:
# Write the dictionary and the frequency list to DICT_FILE and print the
# elapsed wall-clock time in seconds.
t0 = time.time()
save_dict(train_dict, train_word_count, DICT_FILE)
t1 = time.time()
print(t1-t0)

0.05213809013366699


In [28]:
def load_dict(file_name):
    """Load a ``(wdict, wcnt)`` tuple pickled by ``save_dict``.

    Args:
        file_name: path of the pickle file to read.

    Returns:
        ``(wcnt, wdict, rdict)``: the stored frequency data, the stored
        token -> id dictionary, and an id -> token dictionary rebuilt from
        it. Note that the return order differs from the order in the file.
    """
    with open(file_name, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code while
        # unpickling; only use this on files you produced or trust.
        wdict, wcnt = pickle.load(f)

    rdict = dict(zip(wdict.values(), wdict.keys()))
    return wcnt, wdict, rdict

In [29]:
# Read DICT_FILE back into separate names (wcnt1, wdict1, rdict1) and print
# the elapsed wall-clock time in seconds.
t0 = time.time()
wcnt1, wdict1, rdict1 = load_dict(DICT_FILE)
t1 = time.time()
print(t1-t0)

0.005014181137084961


In [30]:
# Sizes of the reloaded frequency list, dictionary and reverse dictionary.
print(len(wcnt1))
print(len(wdict1))
print(len(rdict1))

# wcnt1 is sliced like a list here; show its first ten (character, count) pairs.
for (k,v) in wcnt1[:10]:
    print(k,v)
    
# Map ids 0-9 back to their tokens (id 0 is the padding token).
for i in range(10):
    print(i, rdict1[i])

4996
5000
5000
的 315237
  245120
我 231601
. 200820
是 194274
了 158476
不 153465
你 152269
这 118252
一 108979
0 <PAD>
1 的
2  
3 我
4 .
5 是
6 了
7 不
8 你
9 这
