In [1]:
import pickle
from sklearn.model_selection import train_test_split
import random

In [2]:
class Word:
    def __init__(self, val, tf, df):
        self.val = val
        self.tf = tf
        self.df = df

In [3]:
def read_data(data_fname):
    abstracts, titles = [], []
    vocab = set()
    with open(data_fname, 'r', encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx % 50000 == 0: print("progress:{0}".format(idx))
            try:
                title, abstract = line.strip().split("\t")
            except:
                continue
            titles.append(title.split(" "))
            abstracts.append(abstract.split(" "))
            vocab.update(title.split(" "), abstract.split(" "))
    return titles, abstracts, vocab

In [None]:
data_fname = "../data/processed.s2s.v1.txt"
titles, abstracts, vocab = read_data(data_fname)
print("Got {0} doc in corpus".format(len(titles)))

progress:0
progress:50000
progress:100000
progress:150000
progress:200000
progress:250000
progress:350000
progress:400000
progress:450000
progress:500000
progress:550000
progress:650000
progress:700000
progress:750000
progress:800000
progress:850000
progress:900000
progress:950000
progress:1000000
progress:1050000
progress:1100000
progress:1150000
progress:1200000
progress:1250000
progress:1300000
progress:1350000
progress:1400000
progress:1450000
progress:1500000
Got 1513342 doc in corpus


In [None]:
def cal_word_tf_df(corpus):
    word_set = {}
    for doc in corpus:
        title, abstract = doc[0], doc[1]
        words = title + abstract
        for w in words:
            if w not in word_set:
                word_set[w] = Word(val=w, tf=1, df=0)
            else:
                word_set[w].tf += 1
        for w in set(words):
            word_set[w].df += 1

    return word_set

In [None]:

def build_idx_for_word_tf_df(word_set, tf_thres=12, df_thres=6):
        
    word_list = list(filter(lambda w: w.tf > tf_thres and w.df > df_thres, word_set.values()))
    
    return word_list

In [None]:
corpus = zip(titles, abstracts)
word_set = cal_word_tf_df(corpus)
print("Got {0} unique word".format(len(word_set)))

Got 2663659 unique word


In [None]:
word_list = build_idx_for_word_tf_df(word_set)
top_tf_words = sorted(word_list, key=lambda x: x.tf, reverse=True)
print("The Top 10 are: ")
print("\n".join(["{0}\t{1}\t{2}".format(word.val, word.tf, word.df) for word in top_tf_words[:10]]))
abstract_train, abstract_test, title_train, title_test = train_test_split(abstracts, titles, test_size=1000)
print("X_train length: {0}\nX_test length: {1}\nY_train length: {2}\nY_test length: {3}".format(len(abstract_train),
                                                                                                len(abstract_test),
                                                                                                len(title_train),
                                                                                                len(title_test)
                                                                                                   ))

The Top 10 are: 
，	87890784	1493097
的	49538168	1471616
。	33842224	1478923
、	15220155	1191988
在	10334975	1336449
是	10055198	1259035
了	8448966	1223957
：	6286885	1183965
和	5949918	1134375
年	4675176	811846
X_train length: 1512342
X_test length: 1000
Y_train length: 1512342
Y_test length: 1000


In [None]:
data_fn = "../data/finance150.batch.pkl"
with open(data_fn, 'wb') as f:
    pickle.dump((abstract_train, abstract_test, title_train, title_test), f, -1)


NameError: name 'top_of_words' is not defined

In [12]:
print("vocab size: {0}".format(len(top_tf_words)))
vocab_fn = "../data/finance150.vocab"
with open(vocab_fn, "w", encoding="utf-8") as f:
    for w in top_tf_words:
        f.write(w.val+"\n")
        f.flush()
idx = random.randint(0, len(abstract_train))
abstract_train[idx]
title_train[idx]

vocab size: 378482


['来自', '外汇交易', '高手', '的', '三点', '“', '忠告', '”']