In [1]:
import torch
from torchtext.vocab import Vectors

from torchtext.vocab import GloVe


In [2]:
import spacy
from spacy.tokens import Span

# Load the spaCy pipeline; its part-of-speech tags and dependency labels are
# what the span segmentation below reads from each token.
nlp = spacy.load("en_core_web_sm")

# Extracted example sentences.
sentences = [
    "In the shop, these MacBooks are encased in a soft rubber enclosure - so you will never know about the razor edge until you buy it, get it home, break the seal and use it (very clever con).",
    "This laptop meets every expectation and Windows 7 is great!"
]
# One integer label per sentence, in the same order as `sentences`.
labels = [1, 1]

# Segment a parsed sentence into short spans based on its dependency tree.
def syntactic_span_segmentation(doc, max_span_len=5):
    """Collect short subtree spans from a dependency-parsed document.

    A token is a span head when it is the sentence ROOT, a nominal argument
    (``nsubj``, ``dobj``, ``pobj`` or ``attr``), or an adjective. The span
    runs from the first to the last token of the head's subtree.

    Each head contributes at most one span. Previously a token matching
    several criteria at once (for example an adjectival ROOT) appended the
    very same span once per matching criterion.

    Args:
        doc: Iterable of tokens exposing ``dep_``, ``pos_``, ``subtree`` and
            ``i`` (e.g. a spaCy ``Doc``).
        max_span_len: Largest subtree size, in tokens, that is still kept.
            Defaults to 5, the limit that used to be hard-coded.

    Returns:
        list[tuple[int, int]]: ``(start, end)`` token indices, both inclusive,
        in the order the head tokens appear in ``doc``.
    """
    nominal_deps = ('nsubj', 'dobj', 'pobj', 'attr')
    spans = []
    for token in doc:
        is_span_head = (
            token.dep_ == 'ROOT'
            or token.dep_ in nominal_deps
            or token.pos_ == 'ADJ'
        )
        if not is_span_head:
            continue
        subtree = list(token.subtree)
        # Large subtrees (typically whole clauses) are too coarse to be useful.
        if len(subtree) <= max_span_len:
            spans.append((subtree[0].i, subtree[-1].i))
    return spans

# Build the dependency-based span list, one entry per sentence.
spans_list = [syntactic_span_segmentation(nlp(text)) for text in sentences]

# Print the spans found in each sentence.
for spans in spans_list:
    print(spans)
    
#spans_list

[(1, 2), (10, 10), (9, 12), (15, 15), (20, 22), (24, 24), (26, 26), (29, 29), (33, 34), (37, 37), (39, 40)]
[(0, 1), (3, 7), (9, 9)]


In [3]:

def spans_to_words(doc, spans):
    """Return the text of every span in ``spans``.

    Args:
        doc: Sliceable document; ``doc[a:b]`` must expose a ``.text`` attribute.
        spans: Iterable of ``(start, end)`` token indices, ``end`` inclusive.

    Returns:
        list: One text per span, in the order given.
    """
    # `end` is inclusive, hence the +1 on the slice bound.
    return [doc[first:last + 1].text for first, last in spans]


# Convert every sentence's spans into phrase texts.
# `spans_list` is rebuilt together with `words_list` so the two stay aligned
# index for index; previously it was reset to [] here and never refilled,
# which silently discarded the spans computed earlier.
words_list = []
spans_list = []
for sentence in sentences:
    doc = nlp(sentence)
    spans = syntactic_span_segmentation(doc)
    spans_list.append(spans)
    words_list.append(spans_to_words(doc, spans))

# Print the phrases extracted from each sentence.
for words in words_list:
    print(words)
    


['the shop', 'soft', 'a soft rubber enclosure', 'you', 'the razor edge', 'you', 'it', 'it', 'the seal', 'it', 'very clever']
['This laptop', 'every expectation and Windows 7', 'great']


In [4]:
import numpy as np
# Load GloVe word vectors from a local file.
# NOTE(review): the path below is a hard-coded absolute location; it has to be
# adjusted before this cell can run on another machine.
glove_path = "/Users/bootscoder/PycharmProjects/nlp-lesson-design/test3/.vector_cache/glove.6B.100d.txt"
glove = Vectors(name=glove_path)

# Convert a list of phrases into GloVe vectors, one vector per phrase.
def words_to_glove_vectors(words, glove):
    """Embed each phrase as the mean of its tokens' GloVe vectors.

    Args:
        words: Iterable of phrase strings; each phrase is split on whitespace.
        glove: Vectors-like object exposing ``stoi`` (token -> index mapping),
            ``dim`` and ``glove[token]`` returning an object with ``.numpy()``.

    Returns:
        list: One 1-D numpy array of length ``glove.dim`` per phrase. Tokens
        missing from the vocabulary count as zero vectors, and a phrase with
        no tokens maps to a zero vector.
    """
    def lookup(token):
        # Try the exact form first, then the lower-cased one: the glove.6B
        # vocabulary is lower-case, so "This" or "Windows" would otherwise be
        # treated as unknown and drag the phrase mean towards zero.
        for candidate in (token, token.lower()):
            if candidate in glove.stoi:
                return glove[candidate].numpy()
        # float32 zeros keep the averaged result from being promoted to
        # float64 when the looked-up vectors are float32.
        return np.zeros(glove.dim, dtype=np.float32)

    vectors = []
    for phrase in words:
        token_vectors = [lookup(token) for token in phrase.split()]
        if token_vectors:
            # The phrase vector is the mean of its token vectors.
            vectors.append(np.mean(token_vectors, axis=0))
        else:
            vectors.append(np.zeros(glove.dim, dtype=np.float32))
    return vectors

# Embed every sentence's phrase list with GloVe.
glove_vectors_list = [words_to_glove_vectors(phrases, glove) for phrases in words_list]

# Print each sentence's phrase vectors, followed by how many there are.
for vectors in glove_vectors_list:
    print(vectors)
    print(len(vectors))
    

[array([ 0.13303299, -0.19339001, -0.03462997, -0.377225  ,  0.193251  ,
        0.2404265 , -0.245058  ,  0.5126    ,  0.05908501,  0.1363695 ,
        0.197175  , -0.2597    ,  0.59237003, -0.07484999, -0.208725  ,
        0.19986999,  0.304955  , -0.05074999, -0.517175  , -0.15766   ,
        0.291545  ,  0.02313501,  0.280725  , -0.5897795 ,  0.41077   ,
       -0.280316  , -0.180695  , -0.62486   , -0.1988    , -0.106544  ,
       -0.214706  ,  0.403705  ,  0.17855   ,  0.1614835 , -0.1547749 ,
        0.3818615 , -0.065335  , -0.04322499,  0.497345  , -0.76368904,
        0.07089502, -0.43927002,  0.095089  , -0.47281998,  0.215482  ,
        0.36106   , -0.03019002,  0.075195  ,  0.125735  , -0.45094997,
       -0.12121099,  0.1353645 , -0.08802   ,  0.79042995, -0.61651   ,
       -2.37015   , -0.65323   , -0.1396265 ,  1.78895   ,  0.47207502,
       -0.018265  ,  0.53671503,  0.163517  ,  0.519305  ,  0.43568   ,
       -0.168345  ,  0.5909125 ,  0.03488   ,  0.0089165 , -0.2

In [5]:
# Drop repeated phrases while keeping first-seen order.
def remove_duplicates(phrases):
    """Return the distinct items of ``phrases`` in order of first appearance.

    Args:
        phrases: Iterable of hashable items (phrase strings here).

    Returns:
        list: A new list without repeats; the input is not modified.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(phrases))

# Embed the phrases again, this time dropping repeated phrases first.
glove_vectors_list = [
    words_to_glove_vectors(remove_duplicates(phrases), glove)
    for phrases in words_list
]

# Show the nested list of phrase vectors as the cell output.
glove_vectors_list

[[array([ 0.13303299, -0.19339001, -0.03462997, -0.377225  ,  0.193251  ,
          0.2404265 , -0.245058  ,  0.5126    ,  0.05908501,  0.1363695 ,
          0.197175  , -0.2597    ,  0.59237003, -0.07484999, -0.208725  ,
          0.19986999,  0.304955  , -0.05074999, -0.517175  , -0.15766   ,
          0.291545  ,  0.02313501,  0.280725  , -0.5897795 ,  0.41077   ,
         -0.280316  , -0.180695  , -0.62486   , -0.1988    , -0.106544  ,
         -0.214706  ,  0.403705  ,  0.17855   ,  0.1614835 , -0.1547749 ,
          0.3818615 , -0.065335  , -0.04322499,  0.497345  , -0.76368904,
          0.07089502, -0.43927002,  0.095089  , -0.47281998,  0.215482  ,
          0.36106   , -0.03019002,  0.075195  ,  0.125735  , -0.45094997,
         -0.12121099,  0.1353645 , -0.08802   ,  0.79042995, -0.61651   ,
         -2.37015   , -0.65323   , -0.1396265 ,  1.78895   ,  0.47207502,
         -0.018265  ,  0.53671503,  0.163517  ,  0.519305  ,  0.43568   ,
         -0.168345  ,  0.5909125 ,  0.

In [6]:
import torch
import torch.nn.functional as F

# Placeholder input standing in for prepared phrase vectors: two sequences of
# 5 and 6 random vectors, each of size 100.
# NOTE(review): this rebinds `glove_vectors_list`, so everything below runs on
# random data rather than on the GloVe phrase vectors built earlier.
# The generator is seeded so that re-running the notebook gives the same result.
torch.manual_seed(0)
glove_vectors_list = [torch.rand(5, 100), torch.rand(6, 100)]
labels = [1, 1]

# Zero-pad the sequences to a common length (batch dimension first).
padded_sequences = torch.nn.utils.rnn.pad_sequence(glove_vectors_list, batch_first=True)

# Load GloVe vectors from a text file.
def load_glove_vectors(file_path):
    """Parse a GloVe text file into a ``{word: tensor}`` dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace.

    Args:
        file_path: Path of the UTF-8 encoded vectors file.

    Returns:
        dict: Maps each word to a 1-D ``torch.float32`` tensor.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank line used to crash with IndexError on values[0].
                continue
            word = values[0]
            vector = torch.tensor([float(x) for x in values[1:]], dtype=torch.float32)
            glove_vectors[word] = vector
    return glove_vectors

# Parse the GloVe file (the path points at 'glove.6B.100d.txt').
glove_vectors = load_glove_vectors('/Users/bootscoder/PycharmProjects/nlp-lesson-design/test3/.vector_cache/glove.6B.100d.txt')

# Look up the GloVe vectors of the three sentiment words.
positive_vector, negative_vector, neutral_vector = (
    glove_vectors[sentiment_word]
    for sentiment_word in ('positive', 'negative', 'neutral')
)


def _sentiment_vector(label):
    """Map a label to a sentiment vector: 1 -> positive, 0 -> neutral, else negative."""
    if label == 1:
        return positive_vector
    if label == 0:
        return neutral_vector
    return negative_vector


# Replace every label with the GloVe vector of its sentiment word.
labels_glove = torch.stack([_sentiment_vector(label) for label in labels])

# Find, for each sequence, the vectors most related to its sentiment label.
def find_most_relevant_vectors(glove_vectors_list, labels_glove):
    """Rank each sequence's vectors by cosine similarity to its label vector.

    Args:
        glove_vectors_list: Sequence of 2-D tensors, one per sentence, with
            one row per phrase vector.
        labels_glove: Indexable collection of label vectors, aligned with
            ``glove_vectors_list``.

    Returns:
        list: One ``(best, second, label_str)`` tuple per sequence. ``best``
        and ``second`` are lists holding the row index of the highest and the
        second-highest scoring vector; ``second`` is empty when the sequence
        has a single vector, and both are empty when it has none.
        ``label_str`` is ``'POS'``, ``'NEU'`` or ``'NEG'``, chosen by comparing
        the label vector with the module-level ``positive_vector`` and
        ``neutral_vector``.
    """
    relevant_vectors = []
    for i, sequence in enumerate(glove_vectors_list):
        label_vector = labels_glove[i]
        # Cosine similarity between every row of the sequence and the label.
        attention_scores = F.cosine_similarity(sequence, label_vector.unsqueeze(0), dim=-1)
        # One topk call supplies both ranks, so they can never be the same
        # index when scores tie. Clamping k avoids the error topk raises for
        # sequences with fewer than two vectors.
        k = min(2, attention_scores.shape[-1])
        top_indices = torch.topk(attention_scores, k).indices.tolist()
        if torch.equal(label_vector, positive_vector):
            label_str = 'POS'
        elif torch.equal(label_vector, neutral_vector):
            label_str = 'NEU'
        else:
            label_str = 'NEG'
        relevant_vectors.append((top_indices[:1], top_indices[1:2], label_str))
    return relevant_vectors

# Find the most relevant vectors for every sequence.
relevant_vectors = find_most_relevant_vectors(glove_vectors_list, labels_glove)

print(relevant_vectors)

[([3], [4], 'POS'), ([1], [5], 'POS')]
