In [2]:
import os
from gensim.models import Word2Vec


# We will use LineSentence from Gensim. This utility streams the file line by line
# instead of loading the whole thing into RAM (useful if the corpus later grows to 100GB or more).
from gensim.models.word2vec import LineSentence

In [3]:
# Define Paths:
# Both are relative paths that go one directory up ("..") and then into "data".

# Folder that holds the processed corpus text file.
PROCESSED_PATH = os.path.join("..", "data", "processed")

# Folder where the trained model is stored.
MODEL_PATH = os.path.join("..", "data", "models")

# Make sure the models folder exists; create it if it does not.
# exist_ok=True means no error is raised when the folder is already there,
# so this cell is safe to re-run.

os.makedirs(MODEL_PATH, exist_ok=True)


In [7]:
# Stream the processed corpus from disk, one line at a time.
corpus_file = os.path.join(PROCESSED_PATH, "god_father_corpus.txt")
sentences = LineSentence(corpus_file)

# Training hyperparameters, gathered in one place so they are easy to find and tune.
w2v_params = {
    "vector_size": 100,  # dimensionality of each word vector
    "window": 5,         # max distance between the current and the predicted word
    "min_count": 2,      # drop words that occur fewer than 2 times in the corpus
    "workers": 4,        # number of worker threads used for training
}

print("Training Word2Vec model...")

# Passing `sentences` to the constructor builds the vocabulary and trains immediately.
model = Word2Vec(sentences=sentences, **w2v_params)

print("Training complete!")

Training Word2Vec model...
Training complete!


In [17]:
# Now that the model is trained, we can look up the words most similar to a given word.


def find_word(word, topn=10):
    """Print the words whose vectors are closest to ``word``.

    The lookup uses the notebook-level ``model`` (its ``wv`` keyed vectors).
    Each neighbour is printed on its own line as a centred, 20-character-wide
    word followed by its similarity score with four decimals. If ``word`` is
    not in the vocabulary, a short notice is printed instead. A separator line
    is printed in both cases.

    Args:
        word: Token to look up.
        topn: Number of neighbours to print. Defaults to 10, which is also
            gensim's own default for ``most_similar``, so ``find_word(word)``
            prints the same ten neighbours as before.

    Returns:
        None. Results are printed, not returned.
    """
    vectors = model.wv

    if word in vectors:
        print(f"Most similar to '{word}': ")

        for neighbour, score in vectors.most_similar(word, topn=topn):
            print(f"{neighbour:^20}: {score:.4f}")

    else:
        print(f"Word '{word}' not found in vocabulary.")

    # Visual separator so consecutive look-ups are easy to tell apart.
    print("\n", "="*100, "\n")


# We will find similarity between two words as well:


def find_similarity(w1, w2):
    """Print the similarity score between two words.

    The lookup uses the notebook-level ``model`` (its ``wv`` keyed vectors).
    When both words are in the vocabulary, the score is printed with four
    decimals. Otherwise the missing word(s) are named, so the caller can see
    which input to fix.

    Args:
        w1: First word.
        w2: Second word.

    Returns:
        None. The result is printed, not returned.
    """
    vectors = model.wv

    # dict.fromkeys keeps the order and avoids naming the same word twice
    # when w1 == w2.
    missing = [w for w in dict.fromkeys((w1, w2)) if w not in vectors]
    if missing:
        listed = ", ".join(f"'{w}'" for w in missing)
        print(f"Not found in vocabulary: {listed}")
        return

    similarity = vectors.similarity(w1, w2)
    print(f"Similarity between '{w1}' and '{w2}' is: {similarity:.4f}")

In [10]:
# Print the nearest neighbours of "godfather".
# NOTE(review): in the recorded output every neighbour scores above 0.97 and the
# words look unrelated; this may mean the vectors are not well separated yet
# (small corpus / few training passes) - worth confirming.
find_word("godfather")

Most similar to 'godfather': 
        mind        : 0.9883
        boy         : 0.9834
        name        : 0.9809
      question      : 0.9771
       doctor       : 0.9758
        plan        : 0.9741
        word        : 0.9738
       story        : 0.9734
      vincent       : 0.9733
        yes         : 0.9730




In [11]:
# Print the nearest neighbours of "don".
# NOTE(review): the recorded neighbours include "didn" and "doesn", so the "don"
# token presumably also counts the first half of "don't" - check the tokenisation
# used when the corpus was built.
find_word("don")

Most similar to 'don': 
        fred        : 0.8398
     fantastic      : 0.8109
        didn        : 0.7953
       favor        : 0.7804
       doesn        : 0.7766
       insist       : 0.7704
       fellas       : 0.7699
      sternly       : 0.7682
      forgive       : 0.7657
        why         : 0.7652




In [15]:
# Print the similarity score between the tokens "kay" and "don".
find_similarity("kay", "don")

Similarity between 'kay' and 'don' is: 0.6787


In [20]:
# Print the similarity score between the tokens "gun" and "italy".
# NOTE(review): the recorded score (0.9037) is higher than the one recorded for
# "kay" / "don"; treat absolute values from this model with caution.
find_similarity("gun", "italy")

Similarity between 'gun' and 'italy' is: 0.9037


<pre style="text-align: center; font-size: 20px">

🎌 Let's save our model now... 🎌
</pre>



In [21]:
# Build the output path inside MODEL_PATH and write the trained model there.
model_filename = os.path.join(MODEL_PATH, "godfather_w2v.model")

# Save the whole model object (not just the word vectors) to that path.
model.save(model_filename)

# Echo the path so the saved location is visible in the cell output.
print(f"Model saved to {model_filename}")

Model saved to ..\data\models\godfather_w2v.model
