In [1]:
import os
import mmap
import collections
import numpy as np
import random


# 数据预处理
# 读取数据
data_dir = "../dataset"
text = []
with open(os.path.join(data_dir, "text8"), "r") as f:
    # 创建内存映射
    mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    text = mmapped_file.read().decode('utf-8')
    mmapped_file.close()


# 字符太多，只取前50000
print("text长度为: " + str(len(text)))
text = text[:50000]
print(text)
print(collections.Counter(text).most_common(20 - 1))
count = [['UNK', -1]]
print(text.split())
words = text.split()
print(collections.Counter(words))
count.extend(collections.Counter(words))
print(count)

text长度为: 713069767
 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive

In [2]:
# 构造数据集
def build_dataset(words, vocab_size):
    count = [['UNK', -1]]
    # 选出最常见的vocab_size - 1个单词
    # count.extend(collections.Counter(words).most_common(vocab_size - 1))
    count.extend(collections.Counter(words).most_common())
    print("统计单词数量：", count)
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    # 得到每个单词的索引
    data = [dictionary.get(word, 0) for word in words]
    count[0][1] = sum([1 for word in words if word not in dictionary])
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

words = text.split() # 得到text中的每个单词
vocab_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(words, vocab_size)

del words  # 释放内存

统计单词数量： [['UNK', -1], ('the', 460), ('of', 305), ('and', 252), ('in', 201), ('to', 148), ('a', 142), ('one', 117), ('as', 108), ('anarchism', 102), ('is', 89), ('that', 81), ('anarchists', 76), ('anarchist', 69), ('nine', 62), ('s', 59), ('zero', 55), ('by', 46), ('with', 45), ('for', 45), ('was', 42), ('it', 38), ('an', 38), ('eight', 38), ('or', 36), ('be', 34), ('are', 33), ('some', 33), ('anarcho', 33), ('on', 32), ('many', 30), ('such', 28), ('two', 28), ('from', 27), ('this', 27), ('see', 27), ('also', 26), ('most', 26), ('not', 26), ('movement', 26), ('have', 26), ('state', 25), ('which', 24), ('first', 22), ('social', 22), ('they', 22), ('its', 22), ('his', 22), ('autism', 22), ('society', 21), ('three', 21), ('other', 20), ('he', 20), ('six', 19), ('at', 19), ('property', 19), ('has', 18), ('ideas', 18), ('often', 18), ('post', 18), ('movements', 17), ('anti', 17), ('war', 17), ('seven', 17), ('groups', 17), ('term', 16), ('against', 16), ('authoritarian', 16), ('anarchy', 16)

In [3]:
# 生成训练数据 采用skip-gram模型
def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer[:] = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

data_index = 0
batch, labels = generate_batch(data, batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

508 originated -> 9 anarchism
508 originated -> 8 as
8 as -> 6 a
8 as -> 508 originated
6 a -> 8 as
6 a -> 65 term
65 term -> 2 of
65 term -> 6 a


In [4]:
# 计算余弦相似度
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


# 最近邻查询
def get_nearest_neighbors(embedding_matrix, word_id, reverse_dictionary, top_k=10):
    word_vector = embedding_matrix[word_id]
    similarities = []
    for idx, vector in enumerate(embedding_matrix):
        if idx != word_id and idx in reverse_dictionary:
            similarity = cosine_similarity(word_vector, vector)
            similarities.append((idx, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    neighbors = [(reverse_dictionary[idx], similarity) for idx, similarity in similarities[:top_k]]
    return neighbors

def analogy_test(embedding_matrix, word1, word2, word3, dictionary, reverse_dictionary):
    vec1 = embedding_matrix[dictionary[word1]]
    vec2 = embedding_matrix[dictionary[word2]]
    vec3 = embedding_matrix[dictionary[word3]]
    target_vector = vec2 - vec1 + vec3

    similarities = []
    for idx, vector in enumerate(embedding_matrix):
        if idx not in [dictionary[word1], dictionary[word2], dictionary[word3]]:
            similarity = cosine_similarity(target_vector, vector)
            similarities.append((idx, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return reverse_dictionary[similarities[0][0]]


In [5]:
from tqdm import tqdm

# 训练Word2Vec模型
class Word2Vec:
    def __init__(self, vocab_size, vector_dim, learning_rate):
        self.vocab_size = vocab_size
        self.vector_dim = vector_dim
        self.learning_rate = learning_rate
        self.W1 = np.random.rand(vocab_size, vector_dim)
        self.W2 = np.random.rand(vector_dim, vocab_size)

    def train(self, data, epochs, batch_size, num_skips, skip_window, reverse_dictionary, test_words):
        global data_index
        for epoch in tqdm(range(epochs)):
            data_index = 0
            batch_count = len(data) // batch_size
            for step in range(batch_count):
                batch_inputs, batch_labels = generate_batch(data, batch_size, num_skips, skip_window)
                for context_word, target_word in zip(batch_inputs, batch_labels):
                    y_true = np.zeros(self.vocab_size)
                    y_true[target_word] = 1
                    h = self.W1[context_word]
                    u = np.dot(self.W2.T, h)
                    y_pred = self.softmax(u)
                    e = y_pred - y_true
                    self.W1[context_word] -= self.learning_rate * np.dot(self.W2, e)
                    self.W2 -= self.learning_rate * np.outer(h, e)
            if (epoch + 1) % 5 == 0:
                print(f'Epoch {epoch + 1} completed')
                self.evaluate(reverse_dictionary, test_words)

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def get_word_vector(self, word_id):
        return self.W1[word_id]

    def evaluate(self, reverse_dictionary, test_words):
        print("Evaluating model...")
        for word_id in test_words:
            if word_id not in reverse_dictionary:
                continue  # 跳过不存在的单词ID
            neighbors = get_nearest_neighbors(self.W1, word_id, reverse_dictionary)
            print(f"Neighbors of {reverse_dictionary[word_id]}: Similarity")
            for neighbor, similarity in neighbors:
                print(f"  {neighbor}: {similarity:.4f}")

def generate_test_word_ids(dictionary, num_words=500):
    # 从词汇表的键中随机选择指定数量的单词
    word_ids = list(dictionary.values())
    random.shuffle(word_ids)
    return word_ids[:num_words]


# Example usage
vector_dim = 100
learning_rate = 0.01
epochs = 20
batch_size = 128
num_skips = 2
skip_window = 1

# 生成500个测试单词ID
test_word_ids = generate_test_word_ids(dictionary, num_words=500)

w2v = Word2Vec(vocab_size, vector_dim, learning_rate)
w2v.train(data, epochs, batch_size, num_skips, skip_window, reverse_dictionary, test_word_ids)

 20%|██        | 4/20 [06:11<24:46, 92.90s/it]

Epoch 5 completed
Evaluating model...
Neighbors of capital: Similarity
  oppressive: 0.8416
  legitimacy: 0.8410
  maintain: 0.8320
  advocated: 0.8319
  voting: 0.8314
  using: 0.8302
  archives: 0.8300
  applying: 0.8277
  relevant: 0.8250
  arguing: 0.8237
Neighbors of hop: Similarity
  exponent: 0.8545
  combination: 0.8543
  regarding: 0.8527
  models: 0.8491
  failed: 0.8477
  figureheads: 0.8468
  unity: 0.8464
  modern: 0.8454
  celebrated: 0.8444
  characterised: 0.8436
Neighbors of colonialism: Similarity
  followed: 0.8451
  vary: 0.8416
  unionist: 0.8363
  operation: 0.8353
  detailed: 0.8324
  originally: 0.8301
  feminism: 0.8280
  proceeded: 0.8263
  alternate: 0.8240
  societies: 0.8238
Neighbors of major: Similarity
  seeks: 0.8280
  infantile: 0.8230
  authors: 0.8216
  chomsky: 0.8205
  positive: 0.8204
  cases: 0.8204
  developing: 0.8202
  list: 0.8201
  accept: 0.8200
  conception: 0.8199
Neighbors of defined: Similarity
  he: 0.8593
  rise: 0.8399
  abacus: 0.83

 25%|██▌       | 5/20 [07:49<23:37, 94.52s/it]

Neighbors of exhibits: Similarity
  competition: 0.8503
  riot: 0.8427
  todd: 0.8406
  advocates: 0.8396
  crimethinc: 0.8367
  since: 0.8319
  uprising: 0.8301
  rebellion: 0.8296
  confronting: 0.8284
  with: 0.8275
Neighbors of swiss: Similarity
  owen: 0.8449
  since: 0.8425
  neither: 0.8425
  prior: 0.8368
  symbolism: 0.8366
  earned: 0.8365
  official: 0.8361
  narveson: 0.8347
  scale: 0.8347
  anarchist: 0.8346
Neighbors of conscience: Similarity
  never: 0.8246
  followed: 0.8227
  owned: 0.8224
  guattari: 0.8189
  moniker: 0.8177
  rothbard: 0.8150
  milieu: 0.8143
  oppression: 0.8140
  property: 0.8127
  originally: 0.8119
Neighbors of description: Similarity
  mysogyny: 0.8447
  starhawk: 0.8426
  primary: 0.8412
  syncretic: 0.8401
  community: 0.8382
  concluded: 0.8356
  make: 0.8339
  since: 0.8332
  extensive: 0.8330
  criticizes: 0.8316
Neighbors of johann: Similarity
  hospital: 0.8352
  guattari: 0.8278
  hop: 0.8273
  influential: 0.8254
  polarised: 0.8245
  

 45%|████▌     | 9/20 [14:00<17:05, 93.19s/it]

Epoch 10 completed
Evaluating model...
Neighbors of capital: Similarity
  legitimacy: 0.8433
  oppressive: 0.8409
  maintain: 0.8345
  advocated: 0.8329
  archives: 0.8327
  voting: 0.8312
  using: 0.8301
  applying: 0.8273
  relevant: 0.8256
  leading: 0.8240
Neighbors of hop: Similarity
  regarding: 0.8558
  exponent: 0.8550
  combination: 0.8548
  figureheads: 0.8499
  models: 0.8484
  failed: 0.8472
  celebrated: 0.8444
  modern: 0.8429
  characterised: 0.8429
  plain: 0.8424
Neighbors of colonialism: Similarity
  followed: 0.8440
  vary: 0.8417
  unionist: 0.8365
  operation: 0.8363
  detailed: 0.8316
  originally: 0.8301
  proceeded: 0.8274
  alternate: 0.8240
  societies: 0.8238
  color: 0.8237
Neighbors of major: Similarity
  seeks: 0.8287
  inspired: 0.8190
  conception: 0.8174
  authors: 0.8172
  city: 0.8153
  capital: 0.8147
  positive: 0.8146
  idea: 0.8145
  cases: 0.8144
  accept: 0.8140
Neighbors of defined: Similarity
  he: 0.8527
  union: 0.8406
  abacus: 0.8366
  e: 

 50%|█████     | 10/20 [15:38<15:45, 94.59s/it]

Neighbors of exhibits: Similarity
  competition: 0.8441
  riot: 0.8421
  todd: 0.8385
  crimethinc: 0.8363
  advocates: 0.8353
  since: 0.8331
  rebellion: 0.8310
  uprising: 0.8307
  confronting: 0.8274
  systems: 0.8265
Neighbors of swiss: Similarity
  owen: 0.8466
  neither: 0.8425
  since: 0.8392
  symbolism: 0.8366
  prior: 0.8365
  earned: 0.8361
  narveson: 0.8347
  scale: 0.8347
  disobedience: 0.8333
  hip: 0.8333
Neighbors of conscience: Similarity
  never: 0.8249
  followed: 0.8220
  owned: 0.8182
  moniker: 0.8176
  milieu: 0.8168
  rothbard: 0.8166
  guattari: 0.8162
  being: 0.8133
  originally: 0.8125
  oppression: 0.8120
Neighbors of description: Similarity
  primary: 0.8454
  starhawk: 0.8439
  syncretic: 0.8432
  mysogyny: 0.8411
  make: 0.8359
  concluded: 0.8340
  extensive: 0.8338
  since: 0.8321
  community: 0.8318
  consequently: 0.8315
Neighbors of johann: Similarity
  hospital: 0.8338
  guattari: 0.8280
  thinkers: 0.8261
  detailed: 0.8251
  methodological: 0.

 70%|███████   | 14/20 [21:48<09:18, 93.13s/it]

Epoch 15 completed
Evaluating model...
Neighbors of capital: Similarity
  legitimacy: 0.8416
  oppressive: 0.8374
  maintain: 0.8358
  archives: 0.8334
  voting: 0.8282
  advocated: 0.8278
  applying: 0.8247
  using: 0.8244
  leading: 0.8239
  relevant: 0.8238
Neighbors of hop: Similarity
  regarding: 0.8587
  combination: 0.8550
  exponent: 0.8543
  figureheads: 0.8505
  models: 0.8460
  failed: 0.8453
  celebrated: 0.8444
  since: 0.8441
  plain: 0.8424
  nor: 0.8413
Neighbors of colonialism: Similarity
  followed: 0.8409
  vary: 0.8407
  operation: 0.8360
  unionist: 0.8336
  originally: 0.8301
  detailed: 0.8297
  proceeded: 0.8268
  alternate: 0.8240
  societies: 0.8238
  color: 0.8237
Neighbors of major: Similarity
  seeks: 0.8225
  inspired: 0.8127
  city: 0.8082
  authors: 0.8065
  conception: 0.8064
  capital: 0.8041
  idea: 0.8033
  accept: 0.8027
  debated: 0.8017
  minor: 0.8017
Neighbors of defined: Similarity
  union: 0.8424
  he: 0.8349
  e: 0.8340
  abacus: 0.8333
  nuc

 75%|███████▌  | 15/20 [23:26<07:52, 94.49s/it]

Neighbors of exhibits: Similarity
  riot: 0.8402
  todd: 0.8357
  crimethinc: 0.8348
  competition: 0.8339
  since: 0.8305
  rebellion: 0.8293
  uprising: 0.8291
  advocates: 0.8273
  confronting: 0.8252
  systems: 0.8240
Neighbors of swiss: Similarity
  owen: 0.8471
  neither: 0.8425
  symbolism: 0.8366
  narveson: 0.8347
  scale: 0.8347
  earned: 0.8347
  since: 0.8341
  disobedience: 0.8333
  hip: 0.8333
  societies: 0.8332
Neighbors of conscience: Similarity
  never: 0.8226
  milieu: 0.8185
  followed: 0.8178
  rothbard: 0.8148
  moniker: 0.8146
  originally: 0.8121
  guattari: 0.8121
  owned: 0.8120
  being: 0.8112
  jan: 0.8105
Neighbors of description: Similarity
  primary: 0.8474
  syncretic: 0.8451
  starhawk: 0.8433
  make: 0.8370
  mysogyny: 0.8355
  extensive: 0.8335
  consequently: 0.8311
  list: 0.8305
  concluded: 0.8304
  worldview: 0.8294
Neighbors of johann: Similarity
  hospital: 0.8275
  thinkers: 0.8259
  detailed: 0.8237
  guattari: 0.8233
  methodological: 0.8215

 95%|█████████▌| 19/20 [29:38<01:33, 93.27s/it]

Epoch 20 completed
Evaluating model...
Neighbors of capital: Similarity
  legitimacy: 0.8360
  maintain: 0.8349
  archives: 0.8315
  oppressive: 0.8303
  voting: 0.8221
  applying: 0.8196
  leading: 0.8192
  meanings: 0.8191
  relevant: 0.8190
  advocated: 0.8165
Neighbors of hop: Similarity
  regarding: 0.8610
  combination: 0.8544
  exponent: 0.8525
  figureheads: 0.8497
  since: 0.8445
  celebrated: 0.8444
  models: 0.8426
  failed: 0.8425
  plain: 0.8424
  nor: 0.8413
Neighbors of colonialism: Similarity
  vary: 0.8384
  followed: 0.8364
  operation: 0.8344
  originally: 0.8301
  unionist: 0.8282
  detailed: 0.8265
  proceeded: 0.8247
  alternate: 0.8240
  societies: 0.8238
  color: 0.8237
Neighbors of major: Similarity
  seeks: 0.8135
  inspired: 0.8031
  city: 0.7969
  minor: 0.7966
  authors: 0.7935
  debated: 0.7925
  conception: 0.7914
  idea: 0.7904
  accept: 0.7895
  capital: 0.7883
Neighbors of defined: Similarity
  union: 0.8393
  e: 0.8311
  abacus: 0.8276
  nuclear: 0.82

100%|██████████| 20/20 [31:15<00:00, 93.78s/it]

Neighbors of exhibits: Similarity
  riot: 0.8366
  todd: 0.8318
  crimethinc: 0.8317
  uprising: 0.8248
  rebellion: 0.8243
  since: 0.8241
  confronting: 0.8216
  competition: 0.8201
  systems: 0.8197
  hip: 0.8193
Neighbors of swiss: Similarity
  owen: 0.8467
  neither: 0.8425
  symbolism: 0.8366
  narveson: 0.8347
  scale: 0.8347
  disobedience: 0.8333
  hip: 0.8333
  societies: 0.8332
  culture: 0.8330
  earned: 0.8321
Neighbors of conscience: Similarity
  milieu: 0.8191
  never: 0.8177
  originally: 0.8107
  followed: 0.8101
  rothbard: 0.8095
  moniker: 0.8091
  jan: 0.8081
  escape: 0.8065
  guattari: 0.8063
  generalizations: 0.8062
Neighbors of description: Similarity
  primary: 0.8475
  syncretic: 0.8461
  starhawk: 0.8415
  make: 0.8372
  extensive: 0.8324
  list: 0.8302
  consequently: 0.8295
  worldview: 0.8292
  mysogyny: 0.8277
  prisons: 0.8266
Neighbors of johann: Similarity
  thinkers: 0.8229
  detailed: 0.8203
  hospital: 0.8202
  guattari: 0.8169
  methodological: 0




In [6]:
import numpy as np
import pickle

# 保存和加载预训练词嵌入向量，计算相似度
class WordEmbedding:
    def __init__(self, embedding_matrix):
        self.embedding_matrix = embedding_matrix

    def save_embeddings(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.embedding_matrix, f)

    def load_embeddings(self, filename):
        with open(filename, 'rb') as f:
            self.embedding_matrix = pickle.load(f)

    def get_similarity(self, word_id1, word_id2):
        vec1 = self.embedding_matrix[word_id1]
        vec2 = self.embedding_matrix[word_id2]
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [7]:
# 测试Word2Vec
embedding_matrix = w2v.W1
we = WordEmbedding(embedding_matrix)
we.save_embeddings("wordembedding.pkl")
we.load_embeddings("wordembedding.pkl")
similarity = we.get_similarity(dictionary["anarchism"], dictionary["originated"])
print(f"The Similarity of pejorative and describe: {similarity}")

The Similarity of pejorative and describe: 0.6548557201440164
