# Potter2Vec

In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


**Set up logging**

In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

**Download NLTK tokenizer models (only the first time)**

In [6]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmetihsan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmetihsan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Metnin Hazırlanması

**Kitaplar klasörden okunur**

In [7]:
book_filenames = sorted(glob.glob("data/*.txt"))

In [8]:
print("Found books:")
book_filenames

Found books:


['data/Atsiz.txt',
 'data/book1.txt',
 'data/book2.txt',
 'data/book3.txt',
 'data/book4.txt',
 'data/book5.txt',
 'data/book6.txt',
 'data/book7.txt',
 'data/sherlock.txt',
 'data/sherlockfull.txt']

**Bütün kitaplar tek bir yerde birleştirilir**

In [9]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'data/Atsiz.txt'...
Corpus is now 868589 characters long

Reading 'data/book1.txt'...
Corpus is now 1343008 characters long

Reading 'data/book2.txt'...
Corpus is now 1874663 characters long

Reading 'data/book3.txt'...
Corpus is now 2341624 characters long

Reading 'data/book4.txt'...
Corpus is now 3528866 characters long

Reading 'data/book5.txt'...
Corpus is now 3770350 characters long

Reading 'data/book6.txt'...
Corpus is now 4829355 characters long

Reading 'data/book7.txt'...
Corpus is now 5515892 characters long

Reading 'data/sherlock.txt'...
Corpus is now 6082099 characters long

Reading 'data/sherlockfull.txt'...
Corpus is now 9667369 characters long



**NLTK Kütüphanesi ile anlama etkisi olmayan kelimeler temizlenir**

In [10]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [11]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [12]:
#convert into a list of words
#rtemove unnnecessary,, split into words, no hyphens
#list of words
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [13]:
#sentence where each word is tokenized
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [14]:
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,767,533 tokens


## Word2Vec Oluşturulması ve Eğitilmesi

In [15]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 300
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
#more workers, faster we train
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

In [16]:
potter2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [17]:
potter2vec.build_vocab(sentences)

2020-01-02 18:56:10,952 : INFO : collecting all words and their counts
2020-01-02 18:56:10,955 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-02 18:56:11,003 : INFO : PROGRESS: at sentence #10000, processed 125048 words, keeping 11228 word types
2020-01-02 18:56:11,042 : INFO : PROGRESS: at sentence #20000, processed 280824 words, keeping 21123 word types
2020-01-02 18:56:11,086 : INFO : PROGRESS: at sentence #30000, processed 449107 words, keeping 26345 word types
2020-01-02 18:56:11,131 : INFO : PROGRESS: at sentence #40000, processed 622777 words, keeping 29935 word types
2020-01-02 18:56:11,180 : INFO : PROGRESS: at sentence #50000, processed 812856 words, keeping 33237 word types
2020-01-02 18:56:11,226 : INFO : PROGRESS: at sentence #60000, processed 993114 words, keeping 35881 word types
2020-01-02 18:56:11,266 : INFO : PROGRESS: at sentence #70000, processed 1140477 words, keeping 39358 word types
2020-01-02 18:56:11,320 : INFO : PROGRESS: at

In [18]:
print("Word2Vec vocabulary length:", len(potter2vec.wv.vocab))

Word2Vec vocabulary length: 21342


**Start training, this might take a minute or two...**

In [19]:
potter2vec.train(sentences,total_examples=potter2vec.corpus_count, epochs=potter2vec.iter)

  """Entry point for launching an IPython kernel.
2020-01-02 18:44:00,762 : INFO : training model with 4 workers on 21342 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2020-01-02 18:44:01,893 : INFO : EPOCH 1 - PROGRESS: at 15.25% examples, 164981 words/s, in_qsize 7, out_qsize 0
2020-01-02 18:44:02,995 : INFO : EPOCH 1 - PROGRESS: at 29.01% examples, 166897 words/s, in_qsize 7, out_qsize 0
2020-01-02 18:44:04,095 : INFO : EPOCH 1 - PROGRESS: at 40.92% examples, 160541 words/s, in_qsize 8, out_qsize 0
2020-01-02 18:44:05,102 : INFO : EPOCH 1 - PROGRESS: at 51.61% examples, 162384 words/s, in_qsize 8, out_qsize 0
2020-01-02 18:44:06,123 : INFO : EPOCH 1 - PROGRESS: at 61.18% examples, 158480 words/s, in_qsize 8, out_qsize 0
2020-01-02 18:44:07,191 : INFO : EPOCH 1 - PROGRESS: at 73.08% examples, 153169 words/s, in_qsize 8, out_qsize 0
2020-01-02 18:44:08,210 : INFO : EPOCH 1 - PROGRESS: at 81.54% examples, 151369 words/s, in_qsize 7, out_qsize 0
2020-01-0

(6872229, 8837665)

**Eğitilen modelin Kaydedilmesi**

In [20]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [21]:
potter2vec.save(os.path.join("trained", "potter2vec.w2v"))

2020-01-02 18:44:44,322 : INFO : saving Word2Vec object under trained/potter2vec.w2v, separately None
2020-01-02 18:44:44,325 : INFO : not storing attribute vectors_norm
2020-01-02 18:44:44,328 : INFO : not storing attribute cum_table
2020-01-02 18:44:44,941 : INFO : saved trained/potter2vec.w2v


## Kaydedilen Modelin Yüklenmesi.

In [22]:
potter2vec = w2v.Word2Vec.load(os.path.join("trained", "potter2vec.w2v"))

2020-01-02 18:44:44,951 : INFO : loading Word2Vec object from trained/potter2vec.w2v
2020-01-02 18:44:45,370 : INFO : loading wv recursively from trained/potter2vec.w2v.wv.* with mmap=None
2020-01-02 18:44:45,372 : INFO : setting ignored attribute vectors_norm to None
2020-01-02 18:44:45,378 : INFO : loading vocabulary recursively from trained/potter2vec.w2v.vocabulary.* with mmap=None
2020-01-02 18:44:45,380 : INFO : loading trainables recursively from trained/potter2vec.w2v.trainables.* with mmap=None
2020-01-02 18:44:45,380 : INFO : setting ignored attribute cum_table to None
2020-01-02 18:44:45,382 : INFO : loaded trained/potter2vec.w2v


### TSNE ile Eğitilen Modelin Görselleştirilmesi

In [23]:
#my video - how to visualize a dataset easily
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [24]:
all_word_vectors_matrix = potter2vec.wv.syn0

  """Entry point for launching an IPython kernel.


**Metinde geçen kelimelerin tsne ile koordinatlarının belirlenmesi**

In [None]:
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[potter2vec.wv.vocab[word].index])
            for word in potter2vec.wv.vocab
        ]
    ],
    columns=["word", "x", "y"]
)

In [None]:
points.head(10)

In [None]:
sns.set_context("poster")

In [None]:
points.plot.scatter("x", "y", s=10, figsize=(20, 12))

In [None]:
def plot_region(x_bounds, y_bounds):
    slice = points[
        (x_bounds[0] <= points.x) &
        (points.x <= x_bounds[1]) & 
        (y_bounds[0] <= points.y) &
        (points.y <= y_bounds[1])
    ]
    
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [None]:
plot_region(x_bounds=(0.0, 5.2), y_bounds=(-0.5, -0.1))

In [None]:
plot_region(x_bounds=(0, 1.25), y_bounds=(0, 1.25))

### Verilen kelimeler arasında anlamsal ilişkilerin keşfedilmesi

In [None]:
potter2vec.most_similar("Hogwarts")

In [None]:
potter2vec.most_similar("Severus")

In [None]:
potter2vec.most_similar("Lee")

In [None]:
potter2vec.most_similar("Quidditch")

In [None]:
potter2vec.most_similar("Potter")

**Kelime çiftleri arasındaki pozitif-negatif ilişkinin keşfedilmesi**

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = potter2vec.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [None]:
nearest_similarity_cosmul("Severus", "Minerva", "Ron")
nearest_similarity_cosmul("Ron", "Potter", "Hermione")
nearest_similarity_cosmul("Dumbledore", "McGonagall", "Sirius")