In [16]:
import json
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import multiprocessing
import re
from spacy.lang.en import English # updated

In [20]:
nlp = English()
nlp.add_pipe('sentencizer') # updated

['Hello, world.', 'Here are two sentences.']

In [31]:
def prepare_data(filename):
    with open(filename,"r",encoding="utf-8") as f:
        text = f.read().split("\n\n")
    word = []
    allsegment = []
    for segment in text:
        segment = segment.strip()
        segment = segment.replace("\n"," ")
        segment = re.sub(r'[^\w\s]', '', segment)
        
        doc = nlp(segment)
        sentences = [sent.text.split() for sent in doc.sents]
        allsegment = allsegment + sentences
    return(allsegment)
    

def training(model_name, filename):
    sentences = prepare_data(filename)

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(
        min_count=5,
        window=2,
        vector_size=200,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        seed = 11)
    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100)
    w2v_model.save(f"word_vectors/{model_name}.model")
    w2v_model.wv.save_word2vec_format(f"word_vectors/word2vec_{model_name}.model")
    
training("model_hp", "hpchapter1.txt")


In [32]:
def gen_sim(word):
    model = KeyedVectors.load_word2vec_format("word_vectors/word2vec_model_hp.model",
                                             binary=False)
    results = model.most_similar(positive=[word])
    print(results)

gen_sim("Harry")

[('Ron', 0.6954762935638428), ('He', 0.641409695148468), ('quickly', 0.6208124160766602), ('eagerly', 0.606089174747467), ('he', 0.5928812026977539), ('moaned', 0.5855399966239929), ('himself', 0.5854555368423462), ('Like', 0.5841359496116638), ('stared', 0.5773068070411682), ('struck', 0.5763301253318787)]


In [33]:
gen_sim("Gryffindor")

[('fifty', 0.8320811986923218), ('points', 0.8275266289710999), ('Slytherin', 0.8079466819763184), ('winning', 0.805392861366272), ('lose', 0.7723042964935303), ('taken', 0.756479024887085), ('win', 0.7530296444892883), ('Slytherins', 0.7512549757957458), ('won', 0.7457296848297119), ('house', 0.7346804141998291)]
