# Prepare

In [1]:
import pandas as pd
import numpy as np
import os
import re
import sys
import json
import pathlib
import importlib
import pickle
import datetime
import logging

import tensorflow as tf
import transformers
from transformers import (
    BertTokenizer, TFBertModel, BertConfig, BertModel, BertForPreTraining, TFBertForPreTraining, 
    TFAlbertModel, TFAlbertForPreTraining, AlbertConfig
)
from tokenizers import BertWordPieceTokenizer

In [2]:
# Put the local checkout of the helper package on the import path so that
# `tj_hub` can be imported further down. The append is skipped when the
# directory is already listed, which keeps re-running this cell harmless.
CUSTOM_MODULE_PATH = os.path.realpath("/Users/ccuulinay/github_proj/py_public_modules")
if sys.path.count(CUSTOM_MODULE_PATH) == 0:
    sys.path.append(CUSTOM_MODULE_PATH)

In [3]:
# Project-local helper module; importable only because CUSTOM_MODULE_PATH was
# appended to sys.path in the previous cell.
from tj_hub.tf_helper import input_builder

# Re-import the module so edits made to input_builder.py on disk are picked up
# without restarting the kernel. As the last expression of the cell, the
# reloaded module object is also what the cell displays.
importlib.reload(input_builder)

<module 'tj_hub.tf_helper.input_builder' from '/Users/ccuulinay/github_proj/py_public_modules/tj_hub/tf_helper/input_builder.py'>

# Params

In [4]:
# Capture the run date once; the compact YYYYMMDD form is used to name the
# exported model directory later in the notebook.
today_dt = datetime.datetime.today()
today_yyyymmdd = f"{today_dt:%Y%m%d}"

In [5]:
# Fixed sequence length: used as the tokenizer truncation limit and as the
# length of every Keras Input in the model-construction cell.
max_seq_len = 128
# Selects how `inputs_list` is built in the model-construction cell.
# NOTE(review): `inputs_list` does not appear to be read anywhere afterwards,
# so this flag currently looks like it has no effect on the model — confirm.
segment_input_flag = True

# Load ALBERT config, checkpoint weights, and tokenizer

In [6]:
# local model
model_dir = "/Users/ccuulinay/PRETRAIN_MODELS/albert_base_zh_additional_36k_steps/"
model_ckpt = pathlib.Path(model_dir) / "albert_model.ckpt.index"
model_config = pathlib.Path(model_dir) / "albert_config_base.json"
vocab_file = pathlib.Path(model_dir) / "vocab.txt"

In [7]:
# Build a TF ALBERT encoder from the local JSON config, then ask transformers
# to load the checkpoint weights into it.
albert_config = AlbertConfig.from_pretrained(model_config)
encoder = TFAlbertModel(albert_config)
# NOTE(review): `encoder` is a TFAlbertModel and `model_ckpt` points at the
# `.index` file. This helper may be intended for the PyTorch AlbertModel and
# for a checkpoint prefix (without `.index`) — TODO confirm that the weights
# are really populated, e.g. by comparing an embedding weight before and after
# the call. The call's return value is what the cell displays.
transformers.load_tf_weights_in_albert(encoder, albert_config, model_ckpt)

2022-05-25 22:06:58.437843: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<transformers.models.albert.modeling_tf_albert.TFAlbertModel at 0x7fd15318a6d0>

In [8]:
# WordPiece tokenizer built from the checkpoint's vocabulary file. Truncation
# is capped at `max_seq_len` so encoded sequences never exceed the length the
# Keras inputs are declared with.
tokenizer = BertWordPieceTokenizer(os.fspath(vocab_file))
tokenizer.enable_truncation(max_length=max_seq_len)

In [9]:
# Short sample sentence. The literal has no placeholders, so it is a plain
# string rather than an f-string. Note that `doc` is reassigned to a longer
# text in a later cell before it is fed to the model.
doc = "近日，建设银行与中国REITs论坛在京联合举办保障性租赁住房"

In [24]:
# Symbolic inputs: three int32 sequences of fixed length `max_seq_len`.
# The `name` values are the layer names shown by `cls.summary()`.
input_word_ids = tf.keras.layers.Input(
    shape=(max_seq_len,), dtype=tf.int32, name="input_word_ids"
)
input_mask = tf.keras.layers.Input(
    shape=(max_seq_len,), dtype=tf.int32, name="attention_mask"
)
segment_ids = tf.keras.layers.Input(
    shape=(max_seq_len,), dtype=tf.int32, name="token_type_ids"
)

# NOTE(review): `inputs_list` is not consumed below — the encoder is always
# called with all three inputs regardless of `segment_input_flag`. It is kept
# only so the name stays defined for any code that may still refer to it.
if segment_input_flag:
    inputs_list = [input_word_ids, segment_ids]
else:
    inputs_list = input_word_ids

# Run ALBERT on the symbolic inputs; the attention mask tells the encoder
# which positions hold real tokens.
encoder_output = encoder(
    input_word_ids,
    token_type_ids=segment_ids,
    attention_mask=input_mask,
)

# Per-token hidden states.
seq_out = encoder_output.last_hidden_state

# Mean-pool over real tokens only. Without a mask the average would also
# include the hidden states at padded positions, which dominate whenever a
# chunk is much shorter than `max_seq_len`. A row whose mask is all zeros would
# have nothing to average (division by zero), so inputs are expected to
# contain at least one unmasked token.
pooled = tf.keras.layers.GlobalAveragePooling1D()(
    seq_out, mask=tf.cast(input_mask, tf.bool)
)

In [25]:
# Wrap the graph into a Keras model. Inputs are (word ids, attention mask,
# segment ids); outputs are the pooled vector followed by the per-token states.
cls = tf.keras.Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled, seq_out],
)

In [35]:
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() only added a stray "None" line after the table.
cls.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_albert_model (TFAlbertModel  TFBaseModelOutputWi  10547968   ['input_word_ids[0][0]',         
 )                              thPooling(last_hidd               'attention_mask[0][0]',   

In [27]:
# Export the encoder under ./models, tagging the directory with the run date.
cls.save(filepath=f"./models/albert_encoder_{today_yyyymmdd}")





INFO:tensorflow:Assets written to: ./models/albert_encoder_20220525/assets


INFO:tensorflow:Assets written to: ./models/albert_encoder_20220525/assets


In [28]:
# Longer multi-sentence sample text used for the embedding demo below. This
# replaces the short `doc` defined earlier; the leading and trailing newlines
# of the triple-quoted literal are part of the string.
doc = """
近日，建设银行与中国REITs论坛在京联合举办保障性租赁住房REITs研讨会，深入探讨保障性租赁住房REITs发展经验与挑战，共同探寻保障性租赁住房REITs落地的可行路径。  与会嘉宾建议，发展保障性租赁住房REITs应立足市场化定位，从盘活存量出发，多渠道拓展资产来源；根据建设主体及资产运营特点，因地制宜建立认定标准，优选资产进行REITs试点；不断完善资产估值方法和租金调整机制，探索保障性租赁住房REITs市场化定价机制；逐步推进税收政策、土地政策等配套制度落地，推动保障性租赁住房REITs健康持续发展。  据介绍，建行高度重视保障性租赁住房市场建设，持续推动保障性租赁住房REITs试点，以住房租赁领域实践经验为监管部门出台政策提供参考，充分发挥母子公司协同效应，会同建信基金、建信信托、建信住房等子公司以专业的基金管理人、重要的市场投资人、优质的资产持有人、高效的资产托管人等多重角色，积极参与保障性租赁住房公募REITs试点各业务环节。
"""

In [29]:
# Turn the raw text into model-ready inputs with the project helper.
# NOTE(review): the helper is defined outside this notebook. Presumably it
# splits `doc` into pieces of roughly `chunk_size` characters and encodes each
# with `tokenizer` — the progress bar reports 9 items for this text. Confirm
# the exact chunking and the returned structure against input_builder.
a = input_builder.preproc_text_to_bert_input(
    doc,
    tokenizer,
    chunk_size=50
)

100%|██████████| 9/9 [00:00<00:00, 384.27it/s]


In [30]:
# Run the encoder on the prepared inputs. `cls` was built with
# outputs=[pooled, seq_out], so embed_outs[0] holds one pooled vector per
# input row and embed_outs[1] the per-token hidden states.
embed_outs = cls.predict(a)
# Display the shape of the pooled output as a quick sanity check.
embed_outs[0].shape

(9, 768)

In [34]:
# Average the pooled vectors across rows to get a single vector for the whole
# text, and display its shape.
embed_outs[0].mean(axis=0).shape

(768,)

In [31]:
# Reload the exported encoder to check that the SavedModel round-trips.
# The directory is derived from `today_yyyymmdd`, exactly like the save call,
# instead of a hard-coded date that only matched the day of the original run.
cv_cls = tf.keras.models.load_model(f"./models/albert_encoder_{today_yyyymmdd}")



