In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
import os

class ShortTopicDetect():
    """Sentence embedder: Hugging Face encoder followed by masked mean pooling.

    The encoder and tokenizer are both loaded from ``embed_model``. Calling
    :meth:`forward` returns one L2-normalised vector per input sentence.
    """

    def __init__(self, embed_model):
        """Load the pretrained encoder and its matching tokenizer.

        Args:
            embed_model: Identifier or local directory handed to
                ``AutoModel.from_pretrained`` and
                ``AutoTokenizer.from_pretrained``.
        """
        super().__init__()
        self.embedding = AutoModel.from_pretrained(embed_model)
        self.tokenizer = AutoTokenizer.from_pretrained(embed_model)

    def forward(self, sentences, max_length=256):
        """Embed ``sentences`` into unit-length vectors.

        Args:
            sentences: Text (or list of texts) passed straight to the tokenizer.
            max_length: Truncation length in tokens given to the tokenizer.
                Defaults to 256, the value that used to be hard-coded.

        Returns:
            A ``(Batch, H)`` tensor whose rows are the attention-masked mean of
            the encoder's last hidden state, L2-normalised along ``dim=1``.
        """
        inputs = self.tokenizer(
            sentences,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.embedding(**inputs)

        token_embed = outputs.last_hidden_state      # (Batch, T, H)
        attention_mask = inputs['attention_mask']    # (Batch, T)

        # Zero out padded positions before summing over the token axis.
        mask = attention_mask.unsqueeze(-1).expand(token_embed.size()).float()
        summed = (token_embed * mask).sum(dim=1)

        # Clamp the token count so a row whose mask is entirely zero yields a
        # zero vector instead of 0/0 = NaN (which F.normalize would keep as NaN).
        token_count = mask.sum(dim=1).clamp(min=1e-9)

        sentence_embed = summed / token_count
        return F.normalize(sentence_embed, p=2, dim=1)

# Directory holding the pretrained text2vec weights. The default is the original
# local checkout; set TEXT2VEC_MODEL_DIR to run the notebook on another machine
# without editing this cell.
model = ShortTopicDetect(
    embed_model=os.environ.get(
        'TEXT2VEC_MODEL_DIR',
        r'D:\Projects\nlp-public-opinion-analysis\src\models\text2vec-base-chinese',
    )
)

In [2]:
from sentence_transformers import SentenceTransformer

import torch
import torch.nn.functional as F
import os

class ShortTopicDetect():
    """Sentence embedder backed by a single, reused ``SentenceTransformer``.

    The encoder is loaded once at construction time. Previously a new
    ``SentenceTransformer`` was built from disk on every ``forward`` call, and
    ``__init__`` additionally loaded an ``AutoModel``/``AutoTokenizer`` pair
    that ``forward`` never used (and that this cell does not import).
    """

    def __init__(self, embed_model):
        """Load the sentence encoder.

        Args:
            embed_model: Model name or local directory passed to
                ``SentenceTransformer``.
        """
        super().__init__()
        self.embed_model = embed_model
        # One encoder instance for the lifetime of this object.
        self.embedding = SentenceTransformer(embed_model)
        # Kept so code reading ``.tokenizer`` still finds one; it is the
        # tokenizer owned by the encoder above.
        self.tokenizer = self.embedding.tokenizer

    def forward(self, sentences):
        """Encode ``sentences`` into normalised embeddings.

        Args:
            sentences: Text (or list of texts) forwarded to
                ``SentenceTransformer.encode``.

        Returns:
            Whatever ``encode(sentences, normalize_embeddings=True)`` returns.
        """
        return self.embedding.encode(sentences, normalize_embeddings=True)

# Directory holding the pretrained text2vec weights. The default is the original
# local checkout; set TEXT2VEC_MODEL_DIR to run the notebook on another machine
# without editing this cell.
model = ShortTopicDetect(
    embed_model=os.environ.get(
        'TEXT2VEC_MODEL_DIR',
        r'D:\Projects\nlp-public-opinion-analysis\src\models\text2vec-base-chinese',
    )
)

ImportError: cannot import name 'PreTrainedModel' from 'transformers' (d:\Programs\miniconda3\envs\py310\lib\site-packages\transformers\__init__.py)

In [None]:
# Smoke test for ShortTopicDetect: embed two similarly worded sentences,
# inspect the result, and compare the two vectors.

sentences = [
    '今天的天气真好',
    '今天天气不错',
]
outputs = model.forward(sentences)

print('Embedding outputs:\n', outputs)
print('\n', outputs.shape)

# The embeddings are requested in normalised form, so the plain dot product of
# two rows is their cosine similarity.
print(outputs[0] @ outputs[1])