In [2]:
from gensim.models.keyedvectors import KeyedVectors

In [5]:
# Load the pre-trained Stack Overflow word2vec vectors (binary word2vec format).
# Forward slashes resolve on Windows as well as on Linux/macOS, whereas the
# previous "..\\..\\" separators only worked on Windows.
model = KeyedVectors.load_word2vec_format("../../SO_vectors_200.bin", binary=True)

In [9]:

import numpy as np


# Assumes `model` holds the word2vec KeyedVectors loaded in an earlier cell

# Method 2: search by theme
def find_words_by_theme(positive_words, negative_words, model, topn=10):
    """
    Return words that are close to ``positive_words`` and far from ``negative_words``.

    Args:
        positive_words: Words the result should be similar to.
        negative_words: Words the result should be dissimilar to.
        model: Object exposing ``most_similar(positive=..., negative=..., topn=...)``
            that yields ``(word, score)`` pairs.
        topn: Number of neighbours requested from the model.

    Returns:
        The neighbour words in the order the model returned them, with the
        scores dropped. If the lookup raises ``KeyError`` a message is printed
        and an empty list is returned.
    """
    try:
        neighbours = model.most_similar(
            positive=positive_words,
            negative=negative_words,
            topn=topn,
        )
        # Keep only the word of every (word, score) pair.
        words = []
        for token, _similarity in neighbours:
            words.append(token)
        return words
    except KeyError as e:
        # Best-effort: report the failed lookup and fall back to "nothing found".
        print(f"Одно из слов не найдено в словаре: {e}")
        return []


# Describe the "first program" theme: words to move towards and words to move away from
theme_positive = [
    'beginner', 'program', 'example', 'code',
    'greeting', 'output', 'simple',
]
theme_negative = [
    'expert', 'hardware', 'complex', 'input', 'silence',
]

found_words = find_words_by_theme(
    positive_words=theme_positive,
    negative_words=theme_negative,
    model=model,
    topn=15,
)
print("Слова по теме 'первая программа':", found_words)

# Method 3: search by description
description_keywords = [
    'first', 'program', 'computer', 'greeting',
    'planet', 'output', 'traditional',
]

# Build the query vector as the mean of the vectors of the keywords the model knows
query_vector = np.zeros(model.vector_size)
count = 0
for word in description_keywords:
    if word not in model:
        # Keywords missing from the vocabulary do not contribute to the mean
        continue
    query_vector += model[word]
    count += 1

if count == 0:
    print("Ни одно из ключевых слов не найдено в модели.")
else:
    query_vector /= count

    # Look up the nearest neighbours of the averaged query vector
    similar_to_query = model.similar_by_vector(query_vector, topn=15)
    print("\nСлова, близкие к описанию 'traditional first program...':")
    for word, score in similar_to_query:
        print(f"{word}: {score:.4f}")

Слова по теме 'первая программа': ['helloworld', 'hello2', 'hello_world', 'hello1', 'helloworld2', 'hello', 'test0', 'helloword', 'testprogram', 'hola', 'hello3', 'helloworld1', 'howdy', 'hello4', 'programe']

Слова, близкие к описанию 'traditional first program...':
program: 0.7470
computer: 0.6416
programme: 0.5723
programs: 0.5481
first: 0.5480
output: 0.5392
actually: 0.5260
just: 0.5172
hercules: 0.5080
programm: 0.5056
simulation: 0.5054
machine: 0.5011
process: 0.5010
pc: 0.4880
second: 0.4874


In [12]:
%%time
"""
 A word2vec model trained on Stack Overflow posts https://github.com/vefstathiou/SO_word2vec
"""
from gensim.models.keyedvectors import KeyedVectors

# Forward slashes resolve on Windows as well as on Linux/macOS, whereas the
# previous "..\\..\\" separators only worked on Windows.
model = KeyedVectors.load_word2vec_format("../../SO_vectors_200.bin", binary=True)

# Words to move towards / away from when querying the embedding space
theme_positive = ['beginner', 'program', 'example', 'code', 'output', 'simple']
theme_negative = ['expert', 'hardware', 'complex', 'input', 'silence']

# Last expression of the cell: the single best match is shown as the cell output
model.most_similar(positive=theme_positive, negative=theme_negative, topn=1)


CPU times: total: 17.8 s
Wall time: 17.9 s


[('helloworld', 0.49995920062065125)]

In [30]:
# Reduced query: three positive and three negative words, best match only
model.most_similar(
    positive=['beginner', 'program', 'example'],
    negative=['expert', 'hardware', 'complex'],
    topn=1,
)


[('helloworld', 0.46487393975257874)]

In [33]:
# Variant query with 'first' among the positive words; show the ten best matches
model.most_similar(
    positive=['first', 'program', 'beginner'],
    negative=['expert', 'hardware', 'complex'],
    topn=10,
)


[('last', 0.5220977067947388),
 ('fist', 0.4842788577079773),
 ('next', 0.46736884117126465),
 ('frist', 0.45875605940818787),
 ('programe', 0.452897310256958),
 ('befor', 0.45036518573760986),
 ('prints', 0.4354223906993866),
 ('//now', 0.4354196786880493),
 ('immediatly', 0.4351065158843994),
 ('nr-1', 0.4339919984340668)]

In [35]:
%%time
"""
 A word2vec model trained on Stack Overflow posts https://github.com/vefstathiou/SO_word2vec
"""
from gensim.models.keyedvectors import KeyedVectors

# Forward slashes resolve on Windows as well as on Linux/macOS, whereas the
# previous "..\\..\\" separators only worked on Windows.
model = KeyedVectors.load_word2vec_format("../../SO_vectors_200.bin", binary=True)

# Last expression of the cell: the single best match is shown as the cell output
model.most_similar(
    positive=['beginner', 'program', 'example'],
    negative=['expert', 'hardware', 'complex'],
    topn=1,
)

CPU times: total: 24.9 s
Wall time: 26.3 s


[('helloworld', 0.46487393975257874)]