###  Training a model on your own dataset using the Gensim library with the Word2Vec model.

In [None]:

import numpy as np 
import pandas as pd 


import os
# Walk the Kaggle input directory and print the full path of every file
# found, so the available datasets are visible in the cell output.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)



In [None]:
import gensim

In [None]:
from pathlib import Path
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

data_path = Path("/kaggle/input/game-of-thrones-books")

# One token list per sentence, accumulated across every book.
story = []

# Sorted so the books are processed in a deterministic (filename) order.
for txt_file in sorted(data_path.glob("*.txt")):
    # Bytes that are not valid UTF-8 are dropped rather than raising.
    corpus = txt_file.read_text(encoding="utf-8", errors="ignore")

    # Split the book into sentences, then turn each sentence into a list of
    # lowercased tokens (simple_preprocess also discards very short and very
    # long tokens).
    sentences = sent_tokenize(corpus)
    story.extend(simple_preprocess(sentence) for sentence in sentences)



In [None]:
# Number of tokenized sentences collected from all books.
len(story)

In [None]:
# Display the corpus: a list of sentences, each a list of lowercase tokens.
story

In [None]:
# Create an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary and the weights are built in separate steps.
model = gensim.models.Word2Vec(
    window = 10,  # context window: up to 10 words on each side of the target word
    min_count = 2  # ignore words that occur fewer than 2 times in the whole corpus
)

# In Jupyter, press Shift + Tab inside the call to see all the parameters

In [None]:
# Scan the corpus once to collect the vocabulary (words kept after the
# min_count filter) and record the number of sentences in model.corpus_count.
model.build_vocab(story) # extracting the vocabulary

In [None]:
# Train on the same corpus that was used to build the vocabulary, for the
# number of epochs configured on the model.
model.train(story , total_examples = model.corpus_count, epochs=model.epochs)

# story -> corpus (list of tokenized sentences)
# total_examples -> total no. of sentences in the corpus
# model.corpus_count is automatically computed from the corpus you passed to build_vocab()


In [None]:
# Words whose vectors are closest (by cosine similarity) to 'daenerys'.
model.wv.most_similar('daenerys')  # wv holds the trained word vectors (KeyedVectors)

In [None]:
# Words whose vectors are closest (by cosine similarity) to 'jon'.
model.wv.most_similar('jon')

In [None]:
# Odd-one-out query over the Stark children: doesnt_match returns the word
# that fits least with the others according to the trained embeddings.
# Names must be spelled exactly as they appear in the tokenized books
# ('rickon', not 'rikon'): gensim skips out-of-vocabulary words with only a
# warning, which would silently drop that name from the comparison.
model.wv.doesnt_match(['jon','rickon','robb','arya','sansa','bran'])

In [None]:
# Odd-one-out query: three Lannister names plus 'ned'.
model.wv.doesnt_match(['cersei','jaime','tyrion','ned'])

In [None]:
model.wv['dragon'] # vector representation of the word

In [None]:
# Cosine similarity between the vectors of the two words.
model.wv.similarity('jon','daenerys')

In [None]:
# Cosine similarity between the vectors of 'jon' and 'king'.
model.wv.similarity('jon','king')

In [None]:
# Cosine similarity between the vectors of 'bran' and 'king'.
model.wv.similarity('bran','king')

In [None]:
# Cosine similarity between the vectors of 'arya' and 'sansa'.
model.wv.similarity('arya','sansa')

In [None]:
# Vocabulary as a list of words; the word at position i owns row i of the
# vector matrix. With gensim's default settings the list is ordered from
# most to least frequent word.
y = model.wv.index_to_key
y

In [None]:
model.wv.get_normed_vectors() # matrix of all word vectors, each scaled to unit length; shape (vocab_size, vector_size)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)  # reduce the word vectors to 3 dimensions.

# Fit PCA on the unit-normalized word vectors and project them in one step;
# row i of X is the 3D point for the i-th vocabulary word.
X = pca.fit_transform(model.wv.get_normed_vectors())


In [None]:
# Expected to be (vocab_size, 3): one 3D point per vocabulary word.
X.shape


In [None]:
import plotly.express as px
# Plot only rows 200-299 so the 3D scatter stays readable; x=0, y=1, z=2
# select the three PCA columns of X, and each point is colored by its word.
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()

# X -> PCA-reduced 3D vectors of all words, shape (vocab_size, 3)
# y -> list of the words in the vocabulary (labels), length vocab_size
# y[i] corresponds to X[i].