# Word2Vec using gensim on the Game of Thrones novels

1. First we download all the files (one per novel) into a single folder.
2. We then open each file, lowercase its text, and concatenate everything into a single string called `story`.

In [1]:
import os
# Folder that holds the novels as text files.
path = r"C:\Users\rezaa\Desktop\archive"

# os.listdir gives no ordering guarantee; sort so the books are always
# concatenated in the same order from run to run.
file_list = sorted(os.listdir(path))

# Read every file from `path` itself. Opening a separate relative path
# ("./Desktop/archive/...") only worked when the kernel happened to be
# started from the user's home directory.
chunks = []
for each_file in file_list:
    full_path = os.path.join(path, each_file)
    if not os.path.isfile(full_path):
        # Skip sub-folders; open() would raise on them.
        continue
    with open(full_path) as file:
        chunks.append(file.read().lower())

# Join once at the end instead of growing the string with += per file.
story = ''.join(chunks)
story[:500]

'a game of thrones \nbook one of a song of ice and fire \nby george r. r. martin \nprologue \n"we should start back," gared urged as the woods began to grow dark around them. "the wildlings are \ndead." \n"do the dead frighten you?" ser waymar royce asked with just the hint of a smile. \ngared did not rise to the bait. he was an old man, past fifty, and he had seen the lordlings come and go. \n"dead is dead," he said. "we have no business with the dead." \n"are they dead?" royce asked softly. "what proof '

3. We use gensim's `remove_stopwords` function, which is an efficient way to strip stopwords from the text.

In [2]:
from gensim.parsing.preprocessing import remove_stopwords
# Strip stopwords from the whole corpus string in one pass.
# NOTE(review): words with punctuation attached (e.g. `"we`, `them.`) are
# still present in the preview below, so some stopwords survive this step.
story = remove_stopwords(story)
# Preview the first 500 characters of the filtered text.
story[:500]

'game thrones book song ice george r. r. martin prologue "we start back," gared urged woods began grow dark them. "the wildlings dead." "do dead frighten you?" ser waymar royce asked hint smile. gared rise bait. old man, past fifty, seen lordlings come go. "dead dead," said. "we business dead." "are dead?" royce asked softly. "what proof we?" "will saw them," gared said. "if says dead, that\'s proof me." known drag quarrel sooner later. wished later sooner. "my mother told dead men sing songs," in'

4. We then split the text into sentences so that we can apply `simple_preprocess` to each sentence.

In [3]:
from nltk import sent_tokenize
# Split the corpus string into sentences.
# Note: `story` is rebound from a single str to a list of sentence strings
# here, so this cell cannot be re-run without first re-running the cells above.
story = sent_tokenize(story)
# Preview the first ten sentences.
story[:10]

['game thrones book song ice george r. r. martin prologue "we start back," gared urged woods began grow dark them.',
 '"the wildlings dead."',
 '"do dead frighten you?"',
 'ser waymar royce asked hint smile.',
 'gared rise bait.',
 'old man, past fifty, seen lordlings come go.',
 '"dead dead," said.',
 '"we business dead."',
 '"are dead?"',
 'royce asked softly.']

5. `simple_preprocess` takes one sentence at a time and returns its list of word tokens (punctuation is dropped).

In [4]:
from gensim.utils import simple_preprocess
# Turn every sentence into its list of word tokens; the result is a list of
# token lists, one inner list per sentence.
story_update = [simple_preprocess(sentence) for sentence in story]
# Preview the first ten tokenized sentences.
story_update[:10]

[['game',
  'thrones',
  'book',
  'song',
  'ice',
  'george',
  'martin',
  'prologue',
  'we',
  'start',
  'back',
  'gared',
  'urged',
  'woods',
  'began',
  'grow',
  'dark',
  'them'],
 ['the', 'wildlings', 'dead'],
 ['do', 'dead', 'frighten', 'you'],
 ['ser', 'waymar', 'royce', 'asked', 'hint', 'smile'],
 ['gared', 'rise', 'bait'],
 ['old', 'man', 'past', 'fifty', 'seen', 'lordlings', 'come', 'go'],
 ['dead', 'dead', 'said'],
 ['we', 'business', 'dead'],
 ['are', 'dead'],
 ['royce', 'asked', 'softly']]

### Next we train the Word2Vec model

In [5]:
from gensim.models import Word2Vec
# Create an untrained model. The corpus is deliberately NOT passed to the
# constructor: doing so already builds the vocabulary and trains, and the
# explicit build_vocab()/train() calls below would then re-initialise the
# weights and train a second time, wasting the first pass.
model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
# Scan the tokenized sentences once to build the vocabulary
# (min_count=1 keeps every word that appears at least once).
model.build_vocab(story_update)
# Train for the model's configured number of epochs; the returned tuple is
# displayed as the cell output.
model.train(story_update, total_examples=model.corpus_count, epochs=model.epochs)

(4388610, 4537925)

#### Finding similar words 

In [6]:
# Ask the trained vectors for the ten words closest to 'dragon'; each entry
# is a (word, similarity score) pair.
similar_words = model.wv.most_similar(positive=['dragon'], topn=10)
similar_words

[('dragons', 0.6404169797897339),
 ('brains', 0.6011684536933899),
 ('dothrak', 0.5870971083641052),
 ('aegon', 0.5832668542861938),
 ('crown', 0.5812475681304932),
 ('representing', 0.5771254301071167),
 ('stag', 0.5762042999267578),
 ('pus', 0.5751556158065796),
 ('basilisk', 0.5686959624290466),
 ('targaryen', 0.5686072111129761)]

#### Getting the vector of a word

In [7]:
# Fetch the learned embedding for 'dragon' from the trained word vectors.
vector = model.wv.get_vector('dragon')
vector

array([ 0.8932098 ,  0.4198481 ,  0.27546474,  0.03776285,  0.02175228,
        0.20497489, -0.7773575 ,  1.1775765 , -0.578238  ,  0.09103923,
        0.08330283, -0.58179116, -0.5093285 ,  1.2699273 ,  1.30167   ,
       -0.53904563,  0.4913404 ,  0.2341057 , -0.02014115,  0.74679977,
        0.84635025,  0.8059916 ,  0.08734256, -0.60105217,  0.06445137,
        0.21283475,  0.5400728 , -0.26769736,  0.21232374, -0.5499789 ,
        0.07936369,  0.16855317, -0.58067536, -0.29445884,  0.35307676,
        0.17313826, -0.08015878, -0.30926928, -1.209362  , -1.5707222 ,
        0.7549244 , -0.30093107, -0.6556581 ,  1.2819319 ,  0.94630164,
       -0.7193925 , -1.2899014 , -0.1922533 , -0.21742578, -0.05111859,
       -1.630144  , -1.1053007 , -0.19955777, -0.18827048,  0.04603759,
        0.7518476 , -1.1574765 ,  0.23055592, -0.04636365,  1.1763182 ,
       -0.19387819,  0.742983  ,  0.9241543 , -0.33490095, -0.8885722 ,
       -0.02291221, -0.20728998,  0.8777483 ,  1.2759347 , -0.44