In [1]:
import os
import mmap
import collections
import numpy as np
import random


# 数据预处理
# 读取数据
data_dir = "../dataset"
text = []
with open(os.path.join(data_dir, "text8"), "r") as f:
    # 创建内存映射
    mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    text = mmapped_file.read().decode('utf-8')
    mmapped_file.close()


# 字符太多，只取前1000
print("text长度为: " + str(len(text)))
text = text[:5000]
print(text)
print(collections.Counter(text).most_common(20 - 1))
count = [['UNK', -1]]
print(text.split())
words = text.split()
print(collections.Counter(words))
count.extend(collections.Counter(words))
print(count)

text长度为: 713069767
 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive

In [2]:
# 构造数据集
def build_dataset(words, vocab_size):
    count = [['UNK', -1]]
    # 选出最常见的vocab_size - 1个单词
    # count.extend(collections.Counter(words).most_common(vocab_size - 1))
    count.extend(collections.Counter(words).most_common())
    print("统计单词数量：", count)
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    # 得到每个单词的索引
    data = [dictionary.get(word, 0) for word in words]
    count[0][1] = sum([1 for word in words if word not in dictionary])
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

words = text.split() # 得到text中的每个单词
vocab_size = len(words)
data, count, dictionary, reverse_dictionary = build_dataset(words, vocab_size)

del words  # 释放内存

统计单词数量： [['UNK', -1], ('the', 49), ('of', 29), ('and', 22), ('in', 21), ('as', 14), ('to', 14), ('is', 12), ('that', 12), ('anarchism', 9), ('a', 9), ('anarchist', 9), ('society', 7), ('what', 7), ('this', 7), ('an', 7), ('proudhon', 7), ('it', 6), ('anarchists', 6), ('are', 6), ('be', 6), ('property', 6), ('term', 5), ('revolution', 5), ('by', 5), ('use', 5), ('one', 5), ('he', 5), ('first', 4), ('french', 4), ('means', 4), ('also', 4), ('self', 4), ('from', 4), ('anarchy', 4), ('or', 4), ('they', 4), ('with', 4), ('kropotkin', 4), ('was', 4), ('were', 4), ('godwin', 4), ('modern', 4), ('his', 4), ('at', 4), ('used', 3), ('working', 3), ('word', 3), ('political', 3), ('authoritarian', 3), ('state', 3), ('most', 3), ('individuals', 3), ('free', 3), ('ideas', 3), ('about', 3), ('law', 3), ('movement', 3), ('where', 3), ('which', 3), ('no', 3), ('have', 3), ('labor', 3), ('would', 3), ('abuse', 2), ('against', 2), ('early', 2), ('class', 2), ('including', 2), ('diggers', 2), ('english', 

In [3]:
# 生成训练数据 采用skip-gram模型
def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer[:] = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

data_index = 0
batch, labels = generate_batch(data, batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

129 originated -> 5 as
129 originated -> 9 anarchism
5 as -> 10 a
5 as -> 129 originated
10 a -> 5 as
10 a -> 22 term
22 term -> 2 of
22 term -> 10 a


In [4]:
# 计算余弦相似度
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


# 最近邻查询
def get_nearest_neighbors(embedding_matrix, word_id, reverse_dictionary, top_k=10):
    word_vector = embedding_matrix[word_id]
    similarities = []
    for idx, vector in enumerate(embedding_matrix):
        if idx != word_id:
            similarity = cosine_similarity(word_vector, vector)
            similarities.append((idx, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    neighbors = [(reverse_dictionary[idx], similarity) for idx, similarity in similarities[:top_k]]
    return neighbors

def analogy_test(embedding_matrix, word1, word2, word3, dictionary, reverse_dictionary):
    vec1 = embedding_matrix[dictionary[word1]]
    vec2 = embedding_matrix[dictionary[word2]]
    vec3 = embedding_matrix[dictionary[word3]]
    target_vector = vec2 - vec1 + vec3

    similarities = []
    for idx, vector in enumerate(embedding_matrix):
        if idx not in [dictionary[word1], dictionary[word2], dictionary[word3]]:
            similarity = cosine_similarity(target_vector, vector)
            similarities.append((idx, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return reverse_dictionary[similarities[0][0]]


In [5]:
import numpy as np
from tqdm import tqdm

class GloVe:
    def __init__(self, vocab_size, vector_dim, learning_rate, x_max=100, alpha=0.75):
        self.vocab_size = vocab_size
        self.vector_dim = vector_dim
        self.learning_rate = learning_rate
        self.x_max = x_max
        self.alpha = alpha
        self.W = np.random.rand(vocab_size, vector_dim)
        self.bias = np.random.rand(vocab_size)
        self.grad_W = np.zeros_like(self.W)
        self.grad_bias = np.zeros_like(self.bias)

    def train(self, co_occurrence_matrix, epochs):
        for epoch in tqdm(range(epochs)):
            for i in range(self.vocab_size):
                for j in range(self.vocab_size):
                    if co_occurrence_matrix[i][j] > 0:
                        weight = self.weighting_function(co_occurrence_matrix[i][j])
                        diff = np.dot(self.W[i], self.W[j]) + self.bias[i] + self.bias[j] - np.log(co_occurrence_matrix[i][j])
                        self.grad_W[i] += weight * diff * self.W[j]
                        self.grad_W[j] += weight * diff * self.W[i]
                        self.grad_bias[i] += weight * diff
                        self.grad_bias[j] += weight * diff
            self.W -= self.learning_rate * self.grad_W
            self.bias -= self.learning_rate * self.grad_bias
            if (epoch + 1) % 10 == 0:
                print(f'Epoch {epoch + 1} completed')

    def weighting_function(self, x):
        return (x / self.x_max) ** self.alpha if x < self.x_max else 1

    def get_word_vector(self, word_id):
        return self.W[word_id]

vector_dim = 100
learning_rate = 0.00001
epochs = 100
batch_size = 128
num_skips = 2
skip_window = 1
co_occurrence_matrix = np.random.randint(0, 100, size=(vocab_size, vocab_size))

glove = GloVe(vocab_size, vector_dim, learning_rate)
glove.train(co_occurrence_matrix, epochs)

 10%|█         | 10/100 [00:24<03:41,  2.47s/it]

Epoch 10 completed


 20%|██        | 20/100 [00:48<03:15,  2.44s/it]

Epoch 20 completed


 30%|███       | 30/100 [01:13<02:53,  2.48s/it]

Epoch 30 completed


 40%|████      | 40/100 [01:38<02:27,  2.46s/it]

Epoch 40 completed


 50%|█████     | 50/100 [02:03<02:08,  2.58s/it]

Epoch 50 completed


 60%|██████    | 60/100 [02:30<01:48,  2.71s/it]

Epoch 60 completed


 70%|███████   | 70/100 [02:58<01:21,  2.70s/it]

Epoch 70 completed


 80%|████████  | 80/100 [03:25<00:54,  2.74s/it]

Epoch 80 completed


 90%|█████████ | 90/100 [03:53<00:27,  2.72s/it]

Epoch 90 completed


100%|██████████| 100/100 [04:20<00:00,  2.60s/it]

Epoch 100 completed





In [6]:
import numpy as np
import pickle

# 保存和加载预训练词嵌入向量，计算相似度
class WordEmbedding:
    def __init__(self, embedding_matrix):
        self.embedding_matrix = embedding_matrix

    def save_embeddings(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.embedding_matrix, f)

    def load_embeddings(self, filename):
        with open(filename, 'rb') as f:
            self.embedding_matrix = pickle.load(f)

    def get_similarity(self, word_id1, word_id2):
        vec1 = self.embedding_matrix[word_id1]
        vec2 = self.embedding_matrix[word_id2]
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [7]:
# 测试Word2Vec
embedding_matrix = glove.W
we = WordEmbedding(embedding_matrix)
we.save_embeddings("glove_wordembedding.pkl")
we.load_embeddings("glove_wordembedding.pkl")
similarity = we.get_similarity(dictionary["anarchism"], dictionary["originated"])
print(f"The Similarity of pejorative and describe: {similarity}")

The Similarity of pejorative and describe: 0.019961320341522336
