In [1]:
import os
import sys
import pickle
from collections import namedtuple
from datetime import datetime
import numpy as np
import apache_beam as beam
from apache_beam.transforms import util
import tensorflow as tf
import tensorflow_hub as tf_hub
import annoy
from sklearn.random_projection import _gaussian_random_matrix
import paddlehub as hub

In [2]:
embed_fn = None

def generate_embeddings(text, model_url, random_projection_matrix=None):
  # Beam will run this function in different processes that need to
  # import hub and load embed_fn (if not previously loaded)
  global embed_fn
  if embed_fn is None:
    embed_fn = tf_hub.load(model_url)
  embedding = embed_fn(text).numpy()
  if random_projection_matrix is not None:
    embedding = embedding.dot(random_projection_matrix)
  return text, embedding

In [3]:
def to_tf_example(entries):
  examples = []

  text_list, embedding_list = entries
  for i in range(len(text_list)):
    text = text_list[i]
    embedding = embedding_list[i]

    features = {
        'text': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[text.encode('utf-8')])),
        'embedding': tf.train.Feature(
            float_list=tf.train.FloatList(value=embedding.tolist()))
    }

    example = tf.train.Example(
        features=tf.train.Features(
            feature=features)).SerializeToString(deterministic=True)

    examples.append(example)

  return examples

In [4]:
# Beam 流水线
def run_hub2emb(args):
  '''Runs the embedding generation pipeline'''

  options = beam.options.pipeline_options.PipelineOptions(**args)
  args = namedtuple("options", args.keys())(*args.values())

  with beam.Pipeline(args.runner, options=options) as pipeline:
    (
        pipeline
#         | 'Read sentences from files' >> beam.io.ReadFromText(
        | '从文件读入句子' >> beam.io.ReadFromText(
            file_pattern=args.data_dir)
        | 'Batch elements' >> util.BatchElements(
            min_batch_size=args.batch_size, max_batch_size=args.batch_size)
        | 'Generate embeddings' >> beam.Map(
            generate_embeddings, args.model_url, args.random_projection_matrix)
        | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
        | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
            file_path_prefix='{}/emb'.format(args.output_dir),
            file_name_suffix='.tfrecords')
    )

In [5]:
def generate_random_projection_weights(original_dim, projected_dim):
  random_projection_matrix = None
  random_projection_matrix = _gaussian_random_matrix(
      n_components=projected_dim, n_features=original_dim).T
  print("A Gaussian random weight matrix was creates with shape of {}".format(random_projection_matrix.shape))
  print('Storing random projection matrix to disk...')
  with open('random_projection_matrix', 'wb') as handle:
    pickle.dump(random_projection_matrix, 
                handle, protocol=pickle.HIGHEST_PROTOCOL)

  return random_projection_matrix

In [6]:
# model_url = 'https://tfhub.dev/google/nnlm-en-dim128/2'
# model_url = "https://tfhub.dev/google/nnlm-zh-dim128/2"
model_url = r'C:\Users\helix\Downloads\nnlm-zh-dim128_2'
projected_dim = 64

In [7]:
lac = hub.Module(name="lac")    # 加载lac模型，LAC是Lexical Analysis of Chinese的首字母缩写



In [8]:
# 分词
def lexical(texts):
    results = lac.cut(text=texts, use_gpu=False, batch_size=1, return_tag=True)
    return results

In [9]:
# 分词后 文本写入
base_path = r'.\hot_news'
time_paths = os.listdir(base_path)
time_paths
titles = []
for path in time_paths:
    file_paths = os.path.join(base_path, path)
    file_names = os.listdir(file_paths)
    for name in file_names:
        titles += [name.split('.csv')[0]]
titles = lexical(titles)
with open('text.txt', 'w', encoding='utf-8') as out_file:
    for title in titles:
        out_file.write(' '.join(title['word']) + '\n')
titles

[{'word': ['16岁', '男孩', '带', '妹妹', '野泳', '溺亡'],
  'tag': ['m', 'n', 'v', 'n', 'PER', 'v']},
 {'word': ['18日', '夜间', '起', '9', '省市', '部分', '地区', '有', '大', '暴雨'],
  'tag': ['TIME', 't', 'v', 'm', 'n', 'n', 'n', 'v', 'a', 'n']},
 {'word': ['23名', '教师', '被', '送', '东南亚', '读', '博', '？', '官方', '调查'],
  'tag': ['m', 'n', 'p', 'v', 'LOC', 'v', 'v', 'w', 'n', 'v']},
 {'word': ['25岁', '男子', '因', '长相', '成熟', '常', '被', '误', '认为', '42岁'],
  'tag': ['m', 'n', 'p', 'n', 'a', 'd', 'p', 'ad', 'v', 'm']},
 {'word': ['31', '省份', '昨', '增', '本土', '117', '+', '393例'],
  'tag': ['m', 'n', 't', 'v', 'n', 'm', 'w', 'm']},
 {'word': ['35岁', '女', '保安', '边', '带', '娃', '边', '考研', '成功', '上岸'],
  'tag': ['m', 'a', 'n', 'd', 'v', 'n', 'd', 'v', 'ad', 'v']},
 {'word': ['4S店', '回应', '销售', '与', '客户', '女儿', '发生', '关系'],
  'tag': ['n', 'v', 'v', 'p', 'n', 'n', 'v', 'n']},
 {'word': ['9种', '变异', '株', '登陆', '各省', ' ', '值得', '担心', '吗'],
  'tag': ['m', 'vn', 'n', 'v', 'n', 'w', 'v', 'vn', 'xc']},
 {'word': ['A股', '三大', '指数', '

In [58]:
len(titles), len(set(titles))

TypeError: unhashable type: 'dict'

In [10]:
import tempfile

output_dir = tempfile.mkdtemp()
original_dim = tf_hub.load(model_url)(['']).shape[1]
random_projection_matrix = None

if projected_dim:
  random_projection_matrix = generate_random_projection_weights(
      original_dim, projected_dim)

args = {
    'job_name': 'hub2emb-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S')),
    'runner': 'DirectRunner',
    'batch_size': 1024,
    'data_dir': 'text.txt',
    'output_dir': output_dir,
    'model_url': model_url,
    'random_projection_matrix': random_projection_matrix,
}

print("Pipeline args are set.")
args

A Gaussian random weight matrix was creates with shape of (128, 64)
Storing random projection matrix to disk...
Pipeline args are set.


{'job_name': 'hub2emb-220726-054615',
 'runner': 'DirectRunner',
 'batch_size': 1024,
 'data_dir': 'text.txt',
 'output_dir': 'C:\\Users\\helix\\AppData\\Local\\Temp\\tmpd9yajptw',
 'model_url': 'C:\\Users\\helix\\Downloads\\nnlm-zh-dim128_2',
 'random_projection_matrix': array([[-0.04742824,  0.06190167, -0.23076671, ...,  0.02917528,
         -0.25928597, -0.08259265],
        [ 0.04793924,  0.11276546, -0.22449292, ..., -0.06111532,
          0.02767367,  0.17275463],
        [-0.05708836,  0.23229133,  0.07621995, ..., -0.35139933,
          0.10296169,  0.19293363],
        ...,
        [-0.1453888 ,  0.04595867,  0.04800978, ..., -0.11849089,
         -0.06036516, -0.16451824],
        [ 0.03487392, -0.03669805,  0.06545562, ..., -0.05750089,
          0.05918353, -0.08098674],
        [-0.09450219,  0.06656731,  0.13598591, ..., -0.04341026,
         -0.03367222,  0.26572768]])}

In [11]:
print("Running pipeline...")
%time run_hub2emb(args)
print("Pipeline is done.")



Running pipeline...


        -0.25928597, -0.08259265],
       [ 0.04793924,  0.11276546, -0.22449292, ..., -0.06111532,
         0.02767367,  0.17275463],
       [-0.05708836,  0.23229133,  0.07621995, ..., -0.35139933,
         0.10296169,  0.19293363],
       ...,
       [-0.1453888 ,  0.04595867,  0.04800978, ..., -0.11849089,
        -0.06036516, -0.16451824],
       [ 0.03487392, -0.03669805,  0.06545562, ..., -0.05750089,
         0.05918353, -0.08098674],
       [-0.09450219,  0.06656731,  0.13598591, ..., -0.04341026,
        -0.03367222,  0.26572768]])}


Wall time: 1.47 s
Pipeline is done.


In [12]:
embed_file = os.path.join(output_dir, 'emb-00000-of-00001.tfrecords')
sample = 5

# Create a description of the features.
feature_description = {
    'text': tf.io.FixedLenFeature([], tf.string),
    'embedding': tf.io.FixedLenFeature([projected_dim], tf.float32)
}

def _parse_example(example):
  # Parse the input `tf.Example` proto using the dictionary above.
  return tf.io.parse_single_example(example, feature_description)

dataset = tf.data.TFRecordDataset(embed_file)
for record in dataset.take(sample).map(_parse_example):
  print("{}: {}".format(record['text'].numpy().decode('utf-8'), record['embedding'].numpy()[:10]))

16岁 男孩 带 妹妹 野泳 溺亡: [-0.44405052  0.3531129   0.26800418 -0.27429968  0.01456182 -0.04958998
 -0.04472568  0.00554457 -0.03389261  0.04246975]
18日 夜间 起 9 省市 部分 地区 有 大 暴雨: [ 0.02551387  0.11336418  0.2438921  -0.22233595  0.12911782 -0.0055292
  0.14693233  0.25961086 -0.20714208  0.05031455]
23名 教师 被 送 东南亚 读 博 ？ 官方 调查: [ 0.08106706  0.19661695  0.17876275  0.1535983   0.1054814   0.0735248
  0.0520003   0.11827813  0.0022237  -0.06956445]
25岁 男子 因 长相 成熟 常 被 误 认为 42岁: [-0.10604525  0.26778027  0.1457244  -0.1274168  -0.12084347 -0.00279967
  0.02119731  0.27225342  0.30516258 -0.23033373]
31 省份 昨 增 本土 117 + 393例: [-0.14986767  0.22266015 -0.06321964  0.09006175  0.08358847 -0.15165737
 -0.0774148   0.06468168 -0.07511383  0.00076797]


In [13]:
def build_index(embedding_files_pattern, index_filename, vector_length, 
    metric='angular', num_trees=100):
  '''Builds an ANNOY index'''

  annoy_index = annoy.AnnoyIndex(vector_length, metric=metric)
  # Mapping between the item and its identifier in the index
  mapping = {}

  embed_files = tf.io.gfile.glob(embedding_files_pattern)
  num_files = len(embed_files)
  print('Found {} embedding file(s).'.format(num_files))

  item_counter = 0
  for i, embed_file in enumerate(embed_files):
    print('Loading embeddings in file {} of {}...'.format(i+1, num_files))
    dataset = tf.data.TFRecordDataset(embed_file)
    for record in dataset.map(_parse_example):
      text = record['text'].numpy().decode("utf-8")
      embedding = record['embedding'].numpy()
      mapping[item_counter] = text
      annoy_index.add_item(item_counter, embedding)
      item_counter += 1
      if item_counter % 100000 == 0:
        print('{} items loaded to the index'.format(item_counter))

  print('A total of {} items added to the index'.format(item_counter))

  print('Building the index with {} trees...'.format(num_trees))
  annoy_index.build(n_trees=num_trees)
  print('Index is successfully built.')

  print('Saving index to disk...')
  annoy_index.save(index_filename)
  print('Index is saved to disk.')
  print("Index file size: {} GB".format(
    round(os.path.getsize(index_filename) / float(1024 ** 3), 2)))
  annoy_index.unload()

  print('Saving mapping to disk...')
  with open(index_filename + '.mapping', 'wb') as handle:
    pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
  print('Mapping is saved to disk.')
  print("Mapping file size: {} MB".format(
    round(os.path.getsize(index_filename + '.mapping') / float(1024 ** 2), 2)))

In [14]:
embedding_files = "{}/emb-*.tfrecords".format(output_dir)
embedding_dimension = projected_dim
index_filename = "index"

# !rm {index_filename}
# !rm {index_filename}.mapping

%time build_index(embedding_files, index_filename, embedding_dimension)

Found 1 embedding file(s).
Loading embeddings in file 1 of 1...
A total of 2429 items added to the index
Building the index with 100 trees...
Index is successfully built.
Saving index to disk...
Index is saved to disk.
Index file size: 0.0 GB
Saving mapping to disk...
Mapping is saved to disk.
Mapping file size: 0.13 MB
Wall time: 277 ms


In [15]:
index = annoy.AnnoyIndex(embedding_dimension)
index.load(index_filename, prefault=True)
print('Annoy index is loaded.')
with open(index_filename + '.mapping', 'rb') as handle:
  mapping = pickle.load(handle)
print('Mapping file is loaded.')

Annoy index is loaded.
Mapping file is loaded.


In [16]:
def find_similar_items(embedding, num_matches=5):
  '''Finds similar items to a given embedding in the ANN index'''
  ids = index.get_nns_by_vector(
  embedding, num_matches, search_k=-1, include_distances=False)
  items = [mapping[i] for i in ids]
  return items

In [17]:
# Load the TF-Hub model
print("Loading the TF-Hub model...")
%time embed_fn = tf_hub.load(model_url)
print("TF-Hub model is loaded.")

random_projection_matrix = None
if os.path.exists('random_projection_matrix'):
  print("Loading random projection matrix...")
  with open('random_projection_matrix', 'rb') as handle:
    random_projection_matrix = pickle.load(handle)
  print('random projection matrix is loaded.')

def extract_embeddings(query):
  '''Generates the embedding for the query'''
  query_embedding =  embed_fn([query])[0].numpy()
  if random_projection_matrix is not None:
    query_embedding = query_embedding.dot(random_projection_matrix)
  return query_embedding

Loading the TF-Hub model...
Wall time: 754 ms
TF-Hub model is loaded.
Loading random projection matrix...
random projection matrix is loaded.


In [35]:
extract_embeddings("手机")

array([-0.11337818,  0.04575574,  0.04463463, -0.10445188, -0.18069048,
        0.12225582,  0.26330909,  0.01543695,  0.19818887, -0.072975  ,
       -0.02582843, -0.06593285,  0.00227983, -0.27449116, -0.04524873,
       -0.03312959, -0.08307707, -0.01929841,  0.05727324, -0.15619984,
        0.11091063, -0.1101066 , -0.16211552, -0.12401878, -0.07660548,
        0.02159346, -0.09039252,  0.02480844,  0.0110216 , -0.03137947,
       -0.0252301 , -0.03765644,  0.08342628,  0.08584248, -0.15852213,
       -0.18235968,  0.03543031, -0.13331294,  0.19678961,  0.25472635,
        0.19395137, -0.04133281, -0.17732195, -0.14478255, -0.28942486,
       -0.26368379, -0.00774473,  0.19081017, -0.12986338, -0.09469636,
        0.00750706,  0.13470999, -0.10692975,  0.08550254, -0.02688783,
        0.00403132, -0.08329207, -0.16139398, -0.05221603, -0.12629057,
        0.03035858, -0.06683127,  0.15506885, -0.1314981 ])

In [71]:
query = "台湾"

print("Generating embedding for the query...")
%time query_embedding = extract_embeddings(query)



print("")
print("Finding relevant items in the index...")
%time items = find_similar_items(query_embedding, 20)

print("")
print("Results:")
print("=========")
for item in items:
  print(item)

Generating embedding for the query...
Wall time: 999 µs

Finding relevant items in the index...
Wall time: 1 ms

Results:
中国 大陆 首次 记录 到 笑鸥
中国 大陆 首次 记录 到 笑鸥
台 专家 批 民进党 挖深 两岸 鸿沟
美国 前 国防部长 窜访 台湾
中国 内地 两周 现 10种 奥密克戎变异株
墨西哥 替代 中国 占 美国 彩电 六成 市场
安徽 蚌埠 ： 将 对 5 - 10万 客户 本金 垫付
广东 昨 增 本土 23 + 9例   在 深圳 、 中山
台湾 台东县 海域 发生 4.7级 地震
湖北 通缉 12名 涉 重大 刑案 在逃 人员
湖北 通缉 12名 涉 重大 刑案 在逃 人员
台湾 渔民 钓 起 玳瑁 石斑   称 50年 首见
河北 武安 发现 明朝 万历 年间 圣旨
美英 情报 官员 为何 频炒 所谓 大陆 攻 台
陆生 恐 成 绝响   台湾 高教 遭 双重 夹击
LADY M 将 终止 内地 所有 实体 门店
LADY M 将 终止 内地 所有 实体 门店
第五届 数字 中国 建设 成果 展 福州 开展
国台办 批 欧洲 议会 副议长 窜访 台湾
国台办 批 欧洲 议会 副议长 窜访 台湾
