# 1. 数据处理，根据当前场景语料生成腾讯词向量和vocab_words.pkl

In [53]:
import gensim
import numpy as np
import torch

In [2]:
from gensim.models import KeyedVectors
# Path to the pretrained Tencent AI Lab Chinese embedding, stored as word2vec text (hence binary=False below).
file = '~/pretrained/tencent_embedding/Tencent_AILab_ChineseEmbedding.txt'
wv_from_text = KeyedVectors.load_word2vec_format(file, binary=False) # loading takes a long time

## 1.1 根据场景语料build vocab

In [9]:
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta
import jieba
# Initialise jieba up front; the cell output shows it building its prefix dictionary at this point.
jieba.initialize()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/p5/knwww3f575n5ykrj4r0706d40000gn/T/jieba.cache
Loading model cost 0.738 seconds.
Prefix dict has been built succesfully.


In [None]:
# train.txt文件格式
'''
体验2D巅峰 倚天屠龙记十大创新概览	8
60年铁树开花形状似玉米芯(组图)	5
同步A股首秀：港股缩量回调	2
中青宝sg现场抓拍 兔子舞热辣表演	8
锌价难续去年辉煌	0
'''

In [10]:
# Word-level setup: jieba segmentation plus the corpus splits scanned for the vocabulary.
tokenizer = jieba.lcut
train_path = [
    './THUCNews/data/train.txt',
    './THUCNews/data/dev.txt',
    './THUCNews/data/test.txt',
]
# Despite the name, this is the path of the vocabulary pickle file itself, not a directory.
vocab_dir = "./THUCNews/data/vocab_words.pkl"
# Special tokens that build_vocab places at the front of the vocabulary.
UNK = '<UNK>'
PAD = '<PAD>'

In [72]:
def build_vocab(files_path, tokenizer, max_size, min_freq):
    """Build a token -> index vocabulary from tab-separated corpus files.

    Only the text before the first tab of each non-empty line is used, and
    spaces are removed from it before it is handed to ``tokenizer``.

    Args:
        files_path: list of corpus file paths (a bare string is rejected).
        tokenizer: callable turning a string into an iterable of tokens.
        max_size: maximum number of corpus tokens to keep; None means no limit.
        min_freq: minimum occurrence count a token needs to be kept.

    Returns:
        dict mapping each token to its index. PAD comes first, UNK second,
        then the kept corpus tokens ordered by descending frequency.

    Raises:
        ValueError: if ``files_path`` is a single string instead of a list.
    """
    if isinstance(files_path, str):
        raise ValueError('please input paths in list format!')

    counts = {}
    for file_path in files_path:
        with open(file_path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                stripped = line.strip()
                if stripped:
                    text = stripped.split('\t')[0].replace(' ', '')
                    for token in tokenizer(text):
                        counts[token] = counts.get(token, 0) + 1

    # Filter by frequency first, then order by count (stable sort, so ties keep first-seen order).
    frequent = [(token, count) for token, count in counts.items() if count >= min_freq]
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    entries = [(PAD, -1), (UNK, -1)] + frequent[:max_size]
    return {token: idx for idx, (token, _) in enumerate(entries)}

In [84]:
# Build the word-level vocabulary over all corpus splits: no size cap, keep words seen at least 20 times.
word_to_id = build_vocab(
    files_path=train_path,
    tokenizer=tokenizer,
    max_size=None,
    min_freq=20,
)
len(word_to_id)

In [None]:
# Save the vocabulary. The with-block closes the file handle so the pickle is
# fully flushed to disk (the bare open() call previously leaked the handle).
with open(vocab_dir, 'wb') as vocab_file:
    pkl.dump(word_to_id, vocab_file)

In [13]:
### If the build-vocab step above has already been run once, just load the saved vocabulary.
if os.path.exists(vocab_dir):
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    # The with-block closes the file handle instead of leaking it.
    with open(vocab_dir, 'rb') as vocab_file:
        word_to_id = pkl.load(vocab_file)

## 1.2 根据vocab生成腾讯预训练向量并保存

In [44]:
# Row width of the embedding matrix; pretrained vectors are assigned into these rows later,
# so this must match their dimensionality.
emb_dim = 200
# Path stem for the trimmed embedding file; it is read back later as filename_trimmed_dir + '.npz'.
filename_trimmed_dir = "./THUCNews/data/embedding_tencent_words"

# One row per vocabulary entry, initialised with uniform random values in [0, 1).
# NOTE(review): no random seed is set in the visible cells, so rows that are not
# overwritten with pretrained vectors will differ between runs.
embeddings = np.random.rand(len(word_to_id), emb_dim)

In [46]:
# Overwrite the random row of every vocabulary word that has a pretrained Tencent vector;
# words missing from the pretrained model keep their random initial row.
for k, v in tqdm(word_to_id.items()):
    if k not in wv_from_text:
        continue
    embeddings[v] = np.asarray(wv_from_text[k], dtype='float32')

100%|██████████| 12062/12062 [00:00<00:00, 193283.24it/s]


In [52]:
# Save the trimmed embedding matrix as a compressed archive under the key "embeddings".
np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)

In [54]:
# Load the saved embedding matrix as a float32 tensor. np.load returns an NpzFile
# that holds the archive open, so use it as a context manager to close it afterwards.
with np.load(filename_trimmed_dir + '.npz') as npz_file:
    emb_tc = torch.tensor(npz_file["embeddings"].astype('float32'))

# 2. 数据处理，根据当前场景语料生成字向量和vocab_chars.pkl

In [1]:
import gensim
# Path to the pretrained Sogou character-vector file (bz2-compressed, per its extension).
fd='~/pretrained/sogou_embedding/sgns.sogou.char.bz2'
# Load the vectors in word2vec format, decoding the file as UTF-8.
model = gensim.models.KeyedVectors.load_word2vec_format(fd, encoding = "utf-8")

## 2.1 根据场景语料build vocab

In [2]:
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta

In [None]:
# train.txt文件格式
'''
体验2D巅峰 倚天屠龙记十大创新概览	8
60年铁树开花形状似玉米芯(组图)	5
同步A股首秀：港股缩量回调	2
中青宝sg现场抓拍 兔子舞热辣表演	8
锌价难续去年辉煌	0
'''

In [6]:
def tokenizer_char(x):
    """Split a string into a list of its individual characters."""
    return list(x)


# Corpus splits scanned when building the character vocabulary.
train_path = [
    './THUCNews/data/train.txt',
    './THUCNews/data/dev.txt',
    './THUCNews/data/test.txt',
]
# Despite the name, this is the path of the vocabulary pickle file itself, not a directory.
vocab_dir = "./THUCNews/data/vocab_chars.pkl"
# Special tokens that build_vocab_char places at the front of the vocabulary.
UNK = '<UNK>'
PAD = '<PAD>'

In [7]:
def build_vocab_char(files_path, tokenizer, max_size, min_freq):
    """Build a token -> index vocabulary from tab-separated corpus files.

    Only the text before the first tab of each non-empty line is used, and
    spaces are removed from it before it is handed to ``tokenizer``.

    Args:
        files_path: list of corpus file paths (a bare string is rejected).
        tokenizer: callable turning a string into an iterable of tokens,
            e.g. a character splitter.
        max_size: maximum number of corpus tokens to keep; None means no limit.
        min_freq: minimum occurrence count a token needs to be kept.

    Returns:
        dict mapping each token to its index. PAD comes first, UNK second,
        then the kept corpus tokens ordered by descending frequency.

    Raises:
        ValueError: if ``files_path`` is a single string instead of a list.
    """
    vocab_dic = {}
    if isinstance(files_path, str):
        raise ValueError('please input paths in list format!')
    for file_path in files_path:
        with open(file_path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content = lin.split('\t')[0]
                # Use the tokenizer that was passed in; the previous code ignored the
                # parameter and silently depended on the global `tokenizer_char`.
                for word in tokenizer(content.replace(' ', '')):
                    vocab_dic[word] = vocab_dic.get(word, 0) + 1
    # Keep tokens meeting min_freq, most frequent first; slicing with max_size=None keeps all.
    vocab_list = sorted(
        [item for item in vocab_dic.items() if item[1] >= min_freq],
        key=lambda x: x[1],
        reverse=True,
    )[:max_size]
    # Reserve the first two indices for the padding and unknown tokens.
    vocab_list = [(PAD, -1), (UNK, -1)] + vocab_list
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    return vocab_dic

In [11]:
# Build the character-level vocabulary over all corpus splits: no size cap, keep every character seen at least once.
char_to_id = build_vocab_char(
    files_path=train_path,
    tokenizer=tokenizer_char,
    max_size=None,
    min_freq=1,
)
len(char_to_id)

180000it [00:01, 179433.89it/s]
10000it [00:00, 146745.83it/s]
10000it [00:00, 121749.42it/s]


4802

In [13]:
# Save the vocabulary. The with-block closes the file handle so the pickle is
# fully flushed to disk (the bare open() call previously leaked the handle).
with open(vocab_dir, 'wb') as vocab_file:
    pkl.dump(char_to_id, vocab_file)

In [None]:
### If the build-vocab step above has already been run once, just load the saved vocabulary.
if os.path.exists(vocab_dir):
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    # The with-block closes the file handle instead of leaking it.
    with open(vocab_dir, 'rb') as vocab_file:
        char_to_id = pkl.load(vocab_file)

## 2.2 根据vocab生成搜狗预训练字向量并保存

In [26]:
# Row width of the embedding matrix; pretrained vectors are assigned into these rows later,
# so this must match their dimensionality.
emb_dim = 300
# Path stem for the trimmed embedding file; it is read back later as filename_trimmed_dir + '.npz'.
filename_trimmed_dir = "./THUCNews/data/embedding_sogou_chars"
# One row per vocabulary entry, initialised with uniform random values in [0, 1).
# NOTE(review): no random seed is set in the visible cells, so rows that are not
# overwritten with pretrained vectors will differ between runs.
embeddings = np.random.rand(len(char_to_id), emb_dim)

In [28]:
# Overwrite the random row of every vocabulary character that has a pretrained Sogou vector;
# characters missing from the pretrained model keep their random initial row.
for k, v in tqdm(char_to_id.items()):
    if k not in model:
        continue
    embeddings[v] = np.asarray(model[k], dtype='float32')

100%|██████████| 4802/4802 [00:00<00:00, 115349.43it/s]


In [31]:
# Save the trimmed embedding matrix as a compressed archive under the key "embeddings".
np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)

In [None]:
# Load the saved embedding matrix as a float32 tensor. np.load returns an NpzFile
# that holds the archive open, so use it as a context manager to close it afterwards.
with np.load(filename_trimmed_dir + '.npz') as npz_file:
    emb_sg = torch.tensor(npz_file["embeddings"].astype('float32'))