# NLP算法工具集

In [1]:
from IPython.display import Image

# tfidf
# LDA
# textrank
# word2vec
# doc2vec
# bert
# s-bert


# 文本预处理层

## 去除特殊符号

In [2]:
import re
import emoji

#根据文本的特征，确定需要处理的一些特殊符号
def preprocess4wb(text_content):

    # 过滤文本中的html链接等
    re_tag = re.compile('</?\w+[^>]*>')  # HTML标签
    new_text = re.sub(re_tag, '', text_content)
    new_text = re.sub(u"\\[.*?]", "", new_text) #去除[]及其内部内容（表情等会是这样）
    new_text = emoji.replace_emoji(new_text,'')  # 去除 emoji
    new_text = re.sub(",+", ",", new_text)  # 合并逗号
    new_text = re.sub(" +", " ", new_text)  # 合并空格
    new_text = re.sub("[...|…|。。。]+", "...", new_text)  # 合并句号
    new_text = re.sub("-+", "--", new_text)  # 合并-
    text_content = re.sub("———+", "———", new_text)  # 合并-
    topic = re.findall('#[^#]+#', str(text_content))#去除话题标识
    name = re.findall('@[\\u4e00-\\u9fa5\\w\\-]+', str(text_content)) #去除艾特
    if topic!=None:
        for tp in topic:
            text_content=text_content.replace(tp,"")
    if name!=None:
        for nm in name:
            text_content=text_content.replace(nm,"")
    text_content=text_content.replace(" ","")
    punctuation = '~`!#$%^&*()_+-=|\';":/.,?><~·！@#￥%……&*（）——+-=“：’；、。，？》《{}「」【】'
    text_content = re.sub(r"[%s]+" %punctuation, "",text_content)
    return text_content

In [3]:
text='？「」[asda]123这是一个测...--..。。。。试文本<fffff> #话题# @23海贼王🙃😍😅😘😒 '
rs=preprocess4wb(text)
rs

'这是一个测试文本'

## 分词算法

In [4]:
import jieba

load_dict_flag = False
def load_dict():
    # 加载词典
    if not load_dict_flag:
        jieba.load_userdict("../data/dict/SogouLabDic.txt")
        jieba.load_userdict("../data/dict/dict_car.txt")
        jieba.load_userdict("../data/dict/dict_baidu_utf8.txt")
        jieba.load_userdict("../data/dict/dict_pangu.txt")
        jieba.load_userdict("../data/dict/dict_sougou_utf8.txt")
        jieba.load_userdict("../data/dict/dict_tencent_utf8.txt")


def cut_wd(content):
    load_dict()
    stopwords = {}.fromkeys([line.rstrip() for line in open('../data/dict/Stopword.txt',encoding='utf-8')])
    seg = jieba.cut(content)
    wordslist=[]
    for i in seg:
        if i not in stopwords:
            wordslist.append(i)
#     wordstext=" ".join(wordslist)
    return wordslist
text='本文将详细介绍生成对抗网络 – GAN 的设计初衷、基本原理、10种典型算法和13种实际应用'
wdlist=cut_wd(text)
print(wdlist)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/cv/39t5r19s6s94k9_p328_hnv00000gp/T/jieba.cache
Loading model cost 0.324 seconds.
Prefix dict has been built successfully.


['本文', '详细介绍', '生成', '对抗', '网络', ' ', '–', ' ', 'GAN', ' ', '设计', '初衷', '基本原理', '10', '典型', '算法', '13', '实际应用']


## 分句算法

In [5]:
def cut_sentences(content):
    # 结束符号，包含中文和英文的,依据具体的需求设定划分依据
    end_flag = ['?', '!', '.', '？', '！', '。',';', '…', '、']

    content_len = len(content)
    sentences = []
    tmp_char = ''
    for idx, char in enumerate(content):
        # 拼接字符
        tmp_char += char

        # 判断是否已经到了最后一位
        if (idx + 1) == content_len:
            sentences.append(tmp_char)
            break

        # 判断此字符是否为结束符号
        if char in end_flag:
            # 再判断下一个字符是否为结束符号，如果不是结束符号，则切分句子
            next_idx = idx + 1
            if not content[next_idx] in end_flag:
                sentences.append(tmp_char)
                tmp_char = ''

    return sentences

text='本文将详细介绍生成对抗网络 – GAN 的设计初衷、基本原理、10种典型算法和13种实际应用'
rs=cut_sentences(text)
print(rs)

['本文将详细介绍生成对抗网络 – GAN 的设计初衷、', '基本原理、', '10种典型算法和13种实际应用']


## 实体提取-词性分析

In [6]:
# 引入词性标注接口
import jieba.posseg as psg
 
text = '''网友们都清楚，中国互联网行业之所以能媲美美国互联网行业，一方面是得益于中国互联网在商业模式上的创新，另一方面则是受益于资本的驱动。
阿里巴巴这家公司早年也差钱，马云的阿里获得了孙正义软银的2000万美元投资。自此阿里巴巴的B2B业务开始进军国际市场，并在国际市场上打出了名气。
同样的，腾讯公司在早期也是获得了资本的支持，才取得了快速发展。腾讯的大股东是南非报业集团。
大股东Naspers是一家1915年成立，总部位于南非的传媒集团。2001年购买了腾讯的股份，随着腾讯市值飙涨，Naspers成为整个非洲市值最大的上市公司，其腾讯的股权市值甚至超过了自身业务的市值。
从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。'''
#词性标注
seg = psg.cut(text)
 
#将词性标注结果打印出来
for word,flag in seg:
    if flag in ('ns', 'nr', 'nt'):
        print (flag, '->', word)

ns -> 美国
nr -> 马云
nr -> 孙正义
ns -> 南非
ns -> 南非
ns -> 非洲
nt -> 上市公司
nr -> 马化腾
nr -> 马化腾


# 编码

## TFIDF

In [7]:
# TF-IDF（Term Frequency–Inverse Document Frequency）是一种用于资讯检索与文本挖掘的常用加权技术。
# TF-IDF是一种统计方法，用以评估一个字词对于一个文件集或一个语料库中的其中一份文件的重要程度。
# 字词的重要性随着它在文件中出现的次数成正比增加，但同时会随着它在语料库中出现的频率成反比下降。
# TF-IDF加权的各种形式常被搜索引擎应用，作为文件与用户查询之间相关程度的度量或评级。

# TF-IDF的主要思想是：如果某个词或短语在一篇文章中出现的频率TF高，并且在其他文章中很少出现，则认为此词或者短语具有很好的类别区分能力，适合用来分类。
# TF-IDF实际上是：TF * IDF。

from sklearn.feature_extraction.text import TfidfVectorizer
import timeit
vectorizer = TfidfVectorizer()   # 定义一个tf-idf的vectorizer

# 构建TFIDF矩阵，方便后续对比使用
def TfidfVector(wdlist):
    start = timeit.default_timer()
    X_tfidf = vectorizer.fit_transform(wdlist)   # 结果存放在X矩阵
    stop = timeit.default_timer()
    print('TfidfVector_Sklean Time: ', stop - start)
    return X_tfidf

def seg_depart(sentence):
    # 对文档中的每一行进行中文分词

    sentence=preprocess4wb(sentence)
    sentence_depart = cut_wd(sentence.strip())
    return sentence_depart

#测试使用，文本首先进行上面第一步的预处理
quelist=['网友们都清楚，中国互联网行业之所以能媲美美国互联网行业，一方面是得益于中国互联网在商业模式上的创新，另一方面则是受益于资本的驱动。',
'阿里巴巴这家公司早年也差钱，马云的阿里获得了孙正义软银的2000万美元投资。自此阿里巴巴的B2B业务开始进军国际市场，并在国际市场上打出了名气。',
'同样的，腾讯公司在早期也是获得了资本的支持，才取得了快速发展。腾讯的大股东是南非报业集团。',
'大股东Naspers是一家1915年成立，总部位于南非的传媒集团。2001年购买了腾讯的股份，随着腾讯市值飙涨，Naspers成为整个非洲市值最大的上市公司，其腾讯的股权市值甚至超过了自身业务的市值。',
'从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。',]
wdlist=[]
for ques in quelist:
    wd=seg_depart(ques)
    wdstr=",".join(wd)
    wdlist.append(wdstr)
print(wdlist)

X_tfidf = TfidfVector(wdlist)

['网友,中国互联网,行业,媲美,美国,互联网,行业,得益于,中国互联网,商业模式,创新,则是,受益,资本,驱动', '阿里巴巴,这家,公司,早年,差钱,马云,阿里,获得了,孙正义,软银,万美元,投资,自此,阿里巴巴,BB,业务,进军,国际市场,并在,国际市场,打出了,名气', '腾讯,公司,早期,获得了,资本,支持,取得了,快速发展,腾讯,大股东,南非,报业集团', '大股东,Naspers,是一家,年,成立,总部,位于,南非,传媒,集团,年,购买了,腾讯,股份,腾讯,市值,飙涨,Naspers,非洲,市值,上市公司,腾讯,股权,市值,超过了,业务,市值', '数据,可以看出,马化腾,马化腾,基金,股份,持股,不到,腾讯,早期,五虎,套现,出现在,大股东,榜单,之中']
TfidfVector_Sklean Time:  0.0015467079999993416


In [8]:
X_tfidf.shape

(5, 67)

In [9]:
X_tfidf.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46310547, 0.        , 0.23155274, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.23155274,
        0.23155274, 0.        , 0.        , 0.23155274, 0.        ,
        0.        , 0.23155274, 0.        , 0.        , 0.        ,
        0.        , 0.23155274, 0.        , 0.        , 0.        ,
        0.        , 0.23155274, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.23155274, 0.23155274, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46310547, 0.        , 0.18681529,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.23155274],
       [0.20018928, 0.        , 0.20018928, 0.        , 0.        ,
        0.16151

## word2vec

In [10]:
import gensim
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


# inp为输入语料, outp1为输出模型, outp2为vector格式的模型
input_file = '../data/corpus.txt'
out_model = '../data/corpus.model'
out_vector = '../data/corpus.vector'

# 训练skip-gram模型，16维，滑动窗口3，出现过1词就保存该词
model = Word2Vec(LineSentence(input_file), vector_size=16, window=3, min_count=1, workers=multiprocessing.cpu_count())

# 保存模型
model.save(out_model)
# 保存词向量
model.wv.save_word2vec_format(out_vector, binary=False)


#调用训练好的word2vec模型进行向量化

import gensim
word2vec = Word2Vec.load("../data/corpus.model")

# query="中国互联网行业之所以能媲美美国互联网行业"
# querywd=seg_depart(query)
print(word2vec.wv['林肯'])

    

[-0.00545364 -0.00149582  0.03971366  0.06125599 -0.05921654 -0.04765382
  0.05539985  0.05652928 -0.03096416 -0.02589266  0.04575386 -0.00653471
 -0.02678722  0.03472119 -0.04110786 -0.01157157]


## Bert句向量

In [12]:
"""
Sentences are mapped to sentence embeddings
"""
from sentence_transformers import SentenceTransformer

# 这里可以选用不同的开源出来的预训练模型，无非就是模型大小不同，最后的向量表征不同
# embedder = SentenceTransformer('bert-base-nli-mean-tokens')
embedder = SentenceTransformer('hfl/chinese-electra-180g-small-generator')
corpus=['奔驰','宝马']
corpus_embeddings = embedder.encode(corpus)
print("len of emb is =====", len(corpus_embeddings))
print(corpus_embeddings[0])

Downloading: 100%|██████████| 345/345 [00:00<00:00, 351kB/s]
Downloading: 100%|██████████| 1.99k/1.99k [00:00<00:00, 399kB/s]
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 405B/s]
Downloading: 100%|██████████| 556/556 [00:00<00:00, 292kB/s]
Downloading: 100%|██████████| 13.7M/13.7M [00:09<00:00, 1.47MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 21.8kB/s]
Downloading: 100%|██████████| 269k/269k [00:00<00:00, 281kB/s]  
Downloading: 100%|██████████| 19.0/19.0 [00:00<00:00, 6.98kB/s]
Downloading: 100%|██████████| 110k/110k [00:00<00:00, 152kB/s]  
No sentence-transformers model found with name /Users/zhangguoqiang/.cache/torch/sentence_transformers/hfl_chinese-electra-180g-small-generator. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/zhangguoqiang/.cache/torch/sentence_transformers/hfl_chinese-electra-180g-small-generator were not used when initializing ElectraModel: ['generator_predictions.LayerNorm.bias', 'generator_lm_head.wei

len of emb is ===== 2
[ 0.15300061 -0.09790983  0.86315787  0.04022079  0.34487978  0.04289062
 -0.15589474 -0.39481714  0.47034568 -0.38537207 -0.07860257  0.09301241
  0.20630446 -0.34668624 -0.09810054  0.41245574 -0.15701818 -0.3200327
 -1.0217427  -0.05748152  0.52095217 -0.27196616  0.14191824 -0.18743883
  0.08644073 -0.08078568 -0.01523015 -0.38654917 -0.37808013  0.0541053
  0.40847737  0.05551349  0.06039735 -0.0669658   0.25195187 -0.16943024
  0.17551786  0.00963114 -0.31012657  0.19530302 -0.1198843   0.09573595
 -0.05866819 -0.2962663   0.3415701  -0.3293378  -0.09927104  0.36055034
  0.11020167  0.24922141 -0.07156698  0.25878298  0.2994059   0.07062481
 -0.2730424   0.12223209 -0.21613425 -0.26697043 -0.03332004 -0.10973013
  0.61800194  0.26704264 -0.14203128 -0.3525691 ]


# 离线训练层

## DFA算法（文本匹配关键词）

In [13]:
# DFA算法
class DFAFilter():
    def __init__(self):
        self.keyword_chains = {}
        self.delimit = '\x00'

    def add(self, keyword):
        keyword = keyword.lower()
        chars = keyword.strip()
        if not chars:
            return
        level = self.keyword_chains
        for i in range(len(chars)):
            if chars[i] in level:
                level = level[chars[i]]
            else:
                if not isinstance(level, dict):
                    break
                for j in range(i, len(chars)):
                    level[chars[j]] = {}
                    last_level, last_char = level, chars[j]
                    level = level[chars[j]]
                last_level[last_char] = {self.delimit: 0}
                break
        if i == len(chars) - 1:
            level[self.delimit] = 0

    def parse(self, path):
        with open(path,encoding='utf-8') as f:
            for keyword in f:
                self.add(str(keyword).strip())

    def filter(self, message,matchType):
        message = message.lower()
        ret = []
        rskey=[]
        start = 0
        tag=0
        while start <len(message)-1:
            level = self.keyword_chains
            step_ins = 0
            for char in message[start:]:
                if char in level:
                    step_ins += 1
                    if self.delimit not in level[char]:
                        level = level[char]
                    else:
                        tag=1
                        tem=step_ins
                        #start += step_ins - 1
                        if "minMatch"==matchType:
                            rskey.append(message[start:start + step_ins])
                            start += step_ins
                            tag=0
                            break
                        level = level[char]
                else:
                    if tag!=0:
                        rskey.append(message[start:start + tem])
                        start += step_ins-1
                    tag = 0
                    start+=1
                    break

            else:
                print("ok")
                if tag != 0:
                    rskey.append(message[start:start + tem])
                start += step_ins - 1

                #ret.append(message[start])
                # start += 1
        return rskey

In [41]:
gfw = DFAFilter()
path = "../data/model/xxx.txt"
gfw.parse(path)
text="xxx"
rs=gfw.filter(text,'maxMatch')
rs

FileNotFoundError: [Errno 2] No such file or directory: '../data/model/xxx.txt'

## 提取关键词算法

### TextRank算法

In [14]:
from textrank4zh import TextRank4Keyword, TextRank4Sentence
# 关键词抽取
def keywords_extraction(text):
    tr4w = TextRank4Keyword(stop_words_file='../data/dict/Stopword.txt',allow_speech_tags=['n', 'nr', 'nrfg', 'ns', 'nt', 'nz'])

    #allow_speech_tags=['n', 'nr', 'nrfg', 'ns', 'nt', 'nz']
    # allow_speech_tags   --词性列表，用于过滤某些词性的词
    tr4w.analyze(text=text, window=3, lower=True, vertex_source='all_filters', edge_source='no_stop_words',
                 pagerank_config={'alpha': 0.85, })
    # text    --  文本内容，字符串
    # window  --  窗口大小，int，用来构造单词之间的边。默认值为2
    # lower   --  是否将英文文本转换为小写，默认值为False
    # vertex_source  -- 选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来构造pagerank对应的图中的节点
    #                -- 默认值为`'all_filters'`，可选值为`'no_filter', 'no_stop_words', 'all_filters'
    # edge_source  -- 选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来构造pagerank对应的图中的节点之间的边
    #              -- 默认值为`'no_stop_words'`，可选值为`'no_filter', 'no_stop_words', 'all_filters'`。边的构造要结合`window`参数

    # pagerank_config  -- pagerank算法参数配置，阻尼系数为0.85
    keywords = tr4w.get_keywords(num=8, word_min_len=2)
    # num           --  返回关键词数量
    # word_min_len  --  词的最小长度，默认值为1
    return keywords


# 关键短语抽取
def keyphrases_extraction(text):
    tr4w = TextRank4Keyword(stop_words_file='../data/dict/Stopword.txt',allow_speech_tags=['n', 'nr', 'nrfg', 'ns', 'nt', 'nz'])
    tr4w.analyze(text=text, window=3, lower=True, vertex_source='all_filters', edge_source='no_stop_words',
                 pagerank_config={'alpha': 0.35, })
    keyphrases = tr4w.get_keyphrases(keywords_num=20, min_occur_num=1)
    # keywords_num    --  抽取的关键词数量
    # min_occur_num   --  关键短语在文中的最少出现次数
    return keyphrases


# 关键句抽取
def keysentences_extraction(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, window=2, lower=True, vertex_source='all_filters', edge_source='no_stop_words',
                 pagerank_config={'alpha': 0.85, })
    keysentences = tr4s.get_key_sentences(num=3, sentence_min_len=6)
    return keysentences

text='从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。'
rs=keywords_extraction(text)
print(rs)

[{'word': '马化腾', 'weight': 0.2772961200978784}, {'word': '腾讯', 'weight': 0.1626016260162572}, {'word': '五虎', 'weight': 0.1626016260162572}, {'word': '基金', 'weight': 0.14489715979786527}, {'word': '股份', 'weight': 0.14489715979786527}, {'word': '数据', 'weight': 0.0833160643714195}, {'word': '榜单', 'weight': 0.02439024390245698}]


### TFIDF抽取

In [15]:
from jieba import analyse
def getkwTfidf(text):
    wordlist=cut_wd(text)
    wordstext=",".join(wordlist)
    tfidf = analyse.extract_tags
    keywords = tfidf(wordstext,topK=20,withWeight=True,
                     allowPOS=('ns', 'nr', 'nt', 'nz', 'nl', 'n', 'vn', 'vd', 'vg', 'v', 'vf', 'a', 'an', 'i'))
    return keywords
text='从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。'
keywords=getkwTfidf(text)
keywords

[('马化腾', 2.1735940914363634),
 ('五虎', 1.0377973638363636),
 ('腾讯', 0.8269687824763636),
 ('榜单', 0.8075816195600001),
 ('套现', 0.7224940288809091),
 ('持股', 0.5777534456690909),
 ('不到', 0.48511084255363635),
 ('股份', 0.4397362419445454),
 ('数据', 0.43466878894454547),
 ('基金', 0.3660047858290909)]

## 聚类算法

In [16]:
from sklearn.cluster import KMeans
#文本聚类句向量，然后再通过kmeans等方法进行聚类
def getkmeans(corpus):
    corpus_embeddings = embedder.encode(corpus)
    print("len of emb is =====", len(corpus_embeddings))
    num_clusters = 3
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    clustered_sentences = [[] for i in range(num_clusters)]

    for sentence_id, cluster_id in enumerate(cluster_assignment):
        clustered_sentences[cluster_id].append(corpus[sentence_id])

    return clustered_sentences
corpus = ['网友们都清楚，中国互联网行业之所以能媲美美国互联网行业，一方面是得益于中国互联网在商业模式上的创新，另一方面则是受益于资本的驱动。',
'阿里巴巴这家公司早年也差钱，马云的阿里获得了孙正义软银的2000万美元投资。自此阿里巴巴的B2B业务开始进军国际市场，并在国际市场上打出了名气。',
'同样的，腾讯公司在早期也是获得了资本的支持，才取得了快速发展。腾讯的大股东是南非报业集团。',
'大股东Naspers是一家1915年成立，总部位于南非的传媒集团。2001年购买了腾讯的股份，随着腾讯市值飙涨，Naspers成为整个非洲市值最大的上市公司，其腾讯的股权市值甚至超过了自身业务的市值。',
'从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。',]
cluster=getkmeans(corpus)
cluster


len of emb is ===== 5


[['同样的，腾讯公司在早期也是获得了资本的支持，才取得了快速发展。腾讯的大股东是南非报业集团。',
  '大股东Naspers是一家1915年成立，总部位于南非的传媒集团。2001年购买了腾讯的股份，随着腾讯市值飙涨，Naspers成为整个非洲市值最大的上市公司，其腾讯的股权市值甚至超过了自身业务的市值。',
  '从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。'],
 ['阿里巴巴这家公司早年也差钱，马云的阿里获得了孙正义软银的2000万美元投资。自此阿里巴巴的B2B业务开始进军国际市场，并在国际市场上打出了名气。'],
 ['网友们都清楚，中国互联网行业之所以能媲美美国互联网行业，一方面是得益于中国互联网在商业模式上的创新，另一方面则是受益于资本的驱动。']]

In [20]:
# singlePass算法，适合较大语料场景
import numpy as np
import math
import jieba
import json
from gensim import corpora, models, similarities, matutils
from smart_open import  smart_open
import pandas as pd
from  textrank4zh import TextRank4Keyword,TextRank4Sentence #关键词和关键句提取
from tkinter import _flatten   #用于将嵌套列表压成一层
from sentence_transformers import SentenceTransformer
import scipy

class Single_Pass_Cluster(object):
    def __init__(self,
                 filename,
                 corpus,
                 stop_words_file= '../data/dict/Stopword.txt',
                 theta = 0.5):

        self.filename = filename
        self.stop_words_file = stop_words_file
        self.theta = theta
        self.corpus = corpus

    #'''以列表的形式读取文档'''
    def loadData(self,filename):

        Data = []
        i = 0
        if filename==None:
            Data= None
        else:
            
            with smart_open(self.filename,encoding='utf-8') as f:
                tick=f.readlines()[1:10001]
                #鉴于有些文档较长，包含多个语义中心，因此按语句结束标点进行切割获取表意单一的句子产生的聚类效果会更好
                texts = [cut_sentences(i) for i in tick]
                print('未切割前的语句总数有{}条...'.format(len(texts)))
                print ("............................................................................................")
                texts = [i.strip() for i in list(_flatten(texts)) if len(i)>5]
                print('切割后的语句总数有{}条...'.format(len(texts)))
                for line in texts:
                    i  += 1
                    Data.append(line )
        return Data

    def word_segment(self,texts):
    #'''对语句进行分词，并去掉常见无意义的高频词（停用词）'''
        stopwords = [line.strip() for line in open( self.stop_words_file,encoding='utf-8').readlines()]
        word_segmentation = []
        words = jieba.cut(texts)
        for word in words:
            if word == ' ':
                continue
            if word not in stopwords and word.isdigit()==False:
                word_segmentation.append(word)
        return word_segmentation

    def get_Tfidf_vector_representation(self,word_segmentation,pivot= 10, slope = 0.1):
        #'''采用VSM(vector space model)得到文档的空间向量表示，也可以doc2vec等算法直接获取句向量'''

        dictionary = corpora.Dictionary(word_segmentation)  #获取分词后词汇和词汇id的映射关系，形成字典
        corpus = [dictionary.doc2bow(text) for text in word_segmentation]   #得到语句的向量表示
        tfidf = models.TfidfModel(corpus,pivot=pivot, slope =slope)      #进一步获取语句的TF-IDF向量表示
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf

    def get_bert_vector(self,word_segmentation):
        embedder = SentenceTransformer('hfl/chinese-electra-180g-small-generator')
        corpus_embeddings=embedder.encode(word_segmentation)
        return corpus_embeddings

    def getMaxSimilarity(self,dictTopic, vector):
     #  '''计算新进入文档和已有文档的文本相似度，这里的相似度采用的是cosine余弦相似度，大家还可以试试
    #kullback_leibler, jaccard, hellinger等相似度计算方法'''

        maxValue = 0
        maxIndex = -1
        for k,cluster in dictTopic.items():
            oneSimilarity = np.mean([matutils.cossim(vector, v) for v in cluster])
            if oneSimilarity > maxValue:
                maxValue = oneSimilarity
                maxIndex = k
        return maxIndex, maxValue

    def gettopKsim(query_embedding, sentence_embeddings):
        number_top_matches = 3
        senindexrs = []
        topkrs = {}
        distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]
        results = zip(range(1, len(distances) + 1), distances)
        results = sorted(results, key=lambda x: x[1])
        # print("Query:", query)
        # print("\nTop {} most similar sentences in corpus:".format(number_top_matches))

        for idx, distance in results[0:number_top_matches]:
            # print("index",idx, "(Cosine Score: %.4f)" % (1 - distance))
            topkrs[idx] = 1 - distance
            senindexrs.append(idx)
        print("候选答案：", topkrs)
        return topkrs



    def getMaxSimbert(self,dictTopic, vector):
     #  '''计算新进入文档和已有文档的文本相似度，这里的相似度采用的是cosine余弦相似度，根据上一篇文章的提示，大家还可以试试
    #kullback_leibler, jaccard, hellinger等相似度计算方法'''

        maxValue = 0
        maxIndex = -1
        for k,cluster in dictTopic.items():
            Similaritys =scipy.spatial.distance.cdist([vector], cluster, "cosine")[0]

            results = sorted(Similaritys)
            oneSimilarity=1-results[0]
            if oneSimilarity > maxValue:
                maxValue = oneSimilarity
                maxIndex = k
        return maxIndex, maxValue

    def single_pass(self,corpus,texts,theta):
        dictTopic = {}
        clusterTopic = {}
        numTopic = 0
        cnt = 0
        for vector,text in zip(corpus,texts):
            if numTopic == 0:
                dictTopic[numTopic] = []
                dictTopic[numTopic].append(vector)
                clusterTopic[numTopic] = []
                clusterTopic[numTopic].append(text)
                numTopic += 1
            else:
                #maxIndex, maxValue = self.getMaxSimilarity(dictTopic, vector)
                maxIndex, maxValue = self.getMaxSimbert(dictTopic, vector)
                # 以第一篇文档为种子，建立一个主题，将给定语句分配到现有的、最相似的主题中
                if maxValue > theta:
                    dictTopic[maxIndex].append(vector)
                    clusterTopic[maxIndex].append(text)

                # 或者创建一个新的主题
                else:
                    dictTopic[numTopic] = []
                    dictTopic[numTopic].append(vector)
                    clusterTopic[numTopic] = []
                    clusterTopic[numTopic].append(text)
                    numTopic += 1
            cnt += 1
            if cnt % 1000 == 0:
                print ("processing {}...".format(cnt))
        return dictTopic, clusterTopic

    def fit_transform(self,theta=0.5):

     # '''综合上述的函数，得出最终的聚类结果：包括聚类的标号、每个聚类的数量、关键主题词和关键语句'''
        datMat = self.loadData(self.filename)
        if datMat==None:
            datMat=self.corpus
        word_segmentation = []
        for i in range(len(datMat)):
            wdlist=self.word_segment(datMat[i])
            word_segmentation.append("".join(wdlist))

            #word_segmentation.append(wdlist)
        print ("............................................................................................")
        print('文本已经分词完毕 !')

        #得到文本数据的空间向量表示
        #corpus_tfidf = self.get_Tfidf_vector_representation(word_segmentation)
        corpus_bert = self.get_bert_vector(word_segmentation)
        dictTopic, clusterTopic = self.single_pass(corpus_bert, datMat, theta)
        print ("............................................................................................")
        print( "得到的主题数量有: {} 个 ...".format(len(dictTopic)))
        print ("............................................................................................\n")
        #按聚类语句数量对聚类结果进行降序排列，找到重要的聚类群
        clusterTopic_list = sorted(clusterTopic.items(),key=lambda x: len(x[1]),reverse=True)
        # f=open("../data/cluster_topic.txt","w",encoding="utf-8")
        for k in clusterTopic_list:
            cluster_title = '\n'.join(k[1])
            # 得到每个聚类中的主题关键词
            word = TextRank4Keyword()
            word.analyze(''.join(self.word_segment(''.join(cluster_title))),window = 5,lower = True)
            w_list = word.get_keywords(num = 20,word_min_len = 2)
           # 得到每个聚类中的关键主题句TOP3
            sentence = TextRank4Sentence()
            sentence.analyze(' '.join(k[1]) ,lower = True)
            s_list = sentence.get_key_sentences(num = 3,sentence_min_len = 3)
            clustcenter=','.join([i.word for i in w_list])+'||'.join([i.sentence for i in s_list])
            wrcontent="【主题索引】:{} \n【主题语量】：{} \n【主题关键词】：{} \n【主题中心句】 ：\n{}".format(k[0], len(k[1]),
                                                                              ','.join([i.word for i in w_list]),
                                                                              '\n'.join([i.sentence for i in s_list]))
            # f.write(wrcontent+"\n")
            print("【主题索引】:{} \n【主题语量】：{} \n【主题关键词】：{} \n【主题中心句】 ：\n{}".format(k[0], len(k[1]),
                                                                              ','.join([i.word for i in w_list]),
                                                                              '\n'.join([i.sentence for i in s_list])))
            print("-------------------------------------------------------------------------")


if __name__ == '__main__':
    corpus = ['网友们都清楚，中国互联网行业之所以能媲美美国互联网行业，一方面是得益于中国互联网在商业模式上的创新，另一方面则是受益于资本的驱动。',
'阿里巴巴这家公司早年也差钱，马云的阿里获得了孙正义软银的2000万美元投资。自此阿里巴巴的B2B业务开始进军国际市场，并在国际市场上打出了名气。',
'同样的，腾讯公司在早期也是获得了资本的支持，才取得了快速发展。腾讯的大股东是南非报业集团。',
'大股东Naspers是一家1915年成立，总部位于南非的传媒集团。2001年购买了腾讯的股份，随着腾讯市值飙涨，Naspers成为整个非洲市值最大的上市公司，其腾讯的股权市值甚至超过了自身业务的市值。',
'从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中。',]
    single_pass_cluster = Single_Pass_Cluster(filename=None,corpus=corpus, stop_words_file='../data/dict/Stopword.txt')
    single_pass_cluster.fit_transform(theta=0.85)

............................................................................................
文本已经分词完毕 !


No sentence-transformers model found with name /Users/zhangguoqiang/.cache/torch/sentence_transformers/hfl_chinese-electra-180g-small-generator. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/zhangguoqiang/.cache/torch/sentence_transformers/hfl_chinese-electra-180g-small-generator were not used when initializing ElectraModel: ['generator_predictions.LayerNorm.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

............................................................................................
得到的主题数量有: 1 个 ...
............................................................................................

【主题索引】:0 
【主题语量】：5 
【主题关键词】：腾讯,股份,早期,资本,业务,行业,市值,南非,naspers,商业模式,得益于,不到,公司,互联网,创新,上市公司,传媒,持股,孙正义,差钱 
【主题中心句】 ：
2001年购买了腾讯的股份，随着腾讯市值飙涨，Naspers成为整个非洲市值最大的上市公司，其腾讯的股权市值甚至超过了自身业务的市值
从以上数据可以看出，马化腾加上马化腾基金的股份，也持股不到9%，像腾讯早期的五虎，大多也套现不少，并没有出现在10大股东的榜单之中
同样的，腾讯公司在早期也是获得了资本的支持，才取得了快速发展
-------------------------------------------------------------------------


## 分类算法（bert+finetune）

In [21]:
# 调用的bert模型名称；
# 构造训练数据集
# 构造input token，mask等处理
# 超参数调整优化max_length，batch_size，epochs，lr
import os
import random
import numpy as np
from transformers import WEIGHTS_NAME, CONFIG_NAME
import torch
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
import time,datetime
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    # 返回 hh:mm:ss 形式的时间
    return str(datetime.timedelta(seconds=elapsed_rounded))


print('Download BERT tokenizer...')

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()

    print('There are %d GPU(s) available.' % n_gpu)

    print('We will use the GPU:', [torch.cuda.get_device_name(i) for i in range(n_gpu)])

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

train_data = pd.read_csv("../data/bert_train_data/train.csv", sep='\t')
desclist=train_data['desc'].unique().tolist()
train_data['label']=train_data['desc'].apply(lambda x : desclist.index(x))
test_data = pd.read_csv("../data/bert_train_data/test.csv", sep='\t')

# 循环每一个句子...
sentences = train_data['content'].tolist()
input_ids = []
attention_masks = []

for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
        sent,  # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=256,  # Pad & truncate all sentences.
        pad_to_max_length=True,
        return_attention_mask=True,  # Construct attn. masks.
        return_tensors='pt',  # Return pytorch tensors.
    )

    # 把编码的句子加入list.
    input_ids.append(encoded_dict['input_ids'])

    # 加上 attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# 把lists 转为 tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = train_data['label'].values
labels = torch.tensor(labels)


output_dir = "../data/bert_train_data/models/"
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# 代码参考 https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# 设置随机种子.

# 把input 放入 TensorDataset。
dataset = TensorDataset(input_ids, attention_masks, labels)

# 计算 train_size 和 val_size 的长度.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# 90% 的dataset 为train_dataset, 10% 的的dataset 为val_dataset.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} 训练数据'.format(train_size))
print('{:>5,} 验证数据'.format(val_size))

# 推荐batch_size 为 16 或者 32
batch_size = 16

# 为训练数据集和验证数据集设计DataLoaders.
train_dataloader = DataLoader(
            train_dataset,  # 训练数据.
            sampler = RandomSampler(train_dataset), # 打乱顺序
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset, # 验证数据.
            sampler = RandomSampler(val_dataset), # 打乱顺序
            batch_size = batch_size
        )


model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese", # 使用 12-layer 的 BERT 模型.
    num_labels = 12, # 多分类任务的输出标签为 12个.
    output_attentions = False, # 不返回 attentions weights.
    output_hidden_states = False, # 不返回 all hidden-states.
)
model.cpu()

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# AdamW 是一个 huggingface library 的类，'W' 是'Weight Decay fix"的意思。
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - 默认是 5e-5
                  eps = 1e-8 # args.adam_epsilon  - 默认是 1e-8， 是为了防止衰减率分母除到0
                )

# bert 推荐 epochs 在2到4之间为好。
epochs = 2

# training steps 的数量: [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

# 设计 learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

seed_val = 2021
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
# torch.manual_seed_all(seed_val)

# 记录training ,validation loss ,validation accuracy and timings.
training_stats = []

# 设置总时间.
total_t0 = time.time()
best_val_accuracy = 0

for epoch_i in range(0, epochs):
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))

    # 记录每个 epoch 所用的时间
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # 每隔40个batch 输出一下所用时间.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` 包括3个 tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # 清空梯度
        model.zero_grad()

        # forward
        # 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        pred = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        logits=pred.logits
        loss=pred.loss
        #print(loss, logits)

        total_train_loss += loss.item()

        # backward 更新 gradients.
        loss.backward()

        # 减去大于1 的梯度，将其设为 1.0, 以防梯度爆炸.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 更新模型参数
        optimizer.step()

        # 更新 learning rate.
        scheduler.step()

        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        # 计算training 句子的准确度.
        total_train_accuracy += flat_accuracy(logit, label_id)

        # 计算batches的平均损失.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # 计算训练时间.
    training_time = format_time(time.time() - t0)

    # 训练集的准确率.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print("训练准确率: {0:.2f}".format(avg_train_accuracy))
    print("平均训练损失 loss: {0:.2f}".format(avg_train_loss))
    print("训练时间: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    t0 = time.time()

    # 设置 model 为valuation 状态，在valuation状态 dropout layers 的dropout rate会不同
    model.eval()

    # 设置参数
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        # `batch` 包括3个 tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # 在valuation 状态，不更新权值，不改变计算图
        with torch.no_grad():
            # 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            pred = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            logits = pred.logits
            loss = pred.loss

        # 计算 validation loss.
        total_eval_loss += loss.item()
        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()

        # 计算 validation 句子的准确度.
        total_eval_accuracy += flat_accuracy(logit, label_id)

    # 计算 validation 的准确率.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("")
    print("测试准确率: {0:.2f}".format(avg_val_accuracy))

    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        torch.save(model.state_dict(), output_model_file)
        model.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_dir)

    # 计算batches的平均损失.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # 计算validation 时间.
    validation_time = format_time(time.time() - t0)

    print("平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
    print("测试时间: {:}".format(validation_time))

    # 记录模型参数
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

Download BERT tokenizer...


Downloading: 100%|██████████| 110k/110k [00:00<00:00, 199kB/s] 
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 6.14kB/s]
Downloading: 100%|██████████| 624/624 [00:00<00:00, 159kB/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


No GPU available, using the CPU instead.
    9 训练数据
    1 验证数据


Downloading: 100%|██████████| 412M/412M [00:48<00:00, 8.40MB/s] 
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenc

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (21128, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [22]:
import torch
from transformers import BertModel, BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments
import transformers
import time,datetime

loadstart=time.time()
# 读取模型对应的tokenizer
tokenizer = BertTokenizer.from_pretrained('../data/bert_train_data/models')
# 载入模型
model = BertForSequenceClassification.from_pretrained('../data/bert_train_data/models')
model = model.cpu()
classifier = transformers.pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
loadend=time.time()
print("load time is ====",loadend-loadstart)
classstart=time.time()
texttest=''
result=classifier(texttest)
label2name={"0":"空间","1":"外观"}
labelval=result[0].get("label")
lab=labelval.split("_")[1]
rs=label2name.get(lab)
classend=time.time()
print(rs,"classify time is====",classend-classstart)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../data/bert_train_data/models'. Use `repo_type` argument if needed.