In [2]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

  from ._conv import register_converters as _register_converters


In [3]:
import os
cwd = os.getcwd()
sourcefile = cwd + "/sources/wiki.xml"
page_tag = "{http://www.mediawiki.org/xml/export-0.10/}page"
text_tag= "{http://www.mediawiki.org/xml/export-0.10/}text"

In [4]:
import xml.etree.ElementTree as ET
tree = ET.parse(sourcefile)
root = tree.getroot()

In [5]:
texts = []

counter = 0
for elem in root.getiterator():
    if(elem.tag == text_tag):
        counter += 1
        texts.append(elem.text)

print('texts:', counter)

texts: 115541


In [40]:
limit = 500
valid_articles = texts[:limit]

In [41]:
sentences = []
for article in tqdm(valid_articles):
    for sentence in article.split('.'):
        if all(x.isalpha() or x.isspace() for x in sentence):
            sentences.append([word.lower() for word in sentence.split() if len(word)>0])
        
print('sentences:', len(sentences))
print(sentences[:10])

100%|██████████| 500/500 [00:00<00:00, 2068.62it/s]

sentences: 9841
[['opengroup'], ['awk', 'czyta', 'wejście', 'linia', 'po', 'linii'], ['każda', 'linia', 'jest', 'przeszukiwana', 'pod', 'kątem', 'wzorców', 'występujących', 'w', 'programie', 'i', 'dla', 'każdego', 'pasującego', 'wzorca', 'wykonywana', 'jest', 'akcja', 'z', 'nim', 'skojarzona'], ['computerworld'], ['com'], ['wzorce', 'są', 'sprawdzane', 'w', 'kolejności', 'ich', 'pojawienia', 'się', 'w', 'programie'], ['domyślną', 'akcją', 'jest', 'wypisanie', 'rekordu'], ['awk', 'posiada', 'wbudowane', 'wsparcie', 'dla', 'wielu', 'funkcji'], ['np'], ['interpretacja', 'wartości', 'zmiennej', 'zależy', 'od', 'kontekstu']]





In [42]:
words = []
for sentence in sentences:
    for word in sentence:
        words.append(word)
        
print('words:', len(words))
print(words[:10])

words: 29484
['opengroup', 'awk', 'czyta', 'wejście', 'linia', 'po', 'linii', 'każda', 'linia', 'jest']


In [43]:
words = set(words) # remove duplicates
vocab_size = len(words) # number of unique words

print('unique words:', vocab_size)

unique words: 11522


In [44]:
word2int = {}
int2word = {}

for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

In [45]:
print(int2word[4])

świadek


In [46]:
print(word2int['język'])

3620


In [47]:
data = []
WINDOW_SIZE = 2

for sentence in tqdm(sentences):
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1] :
            if nb_word != word and nb_word:
                data.append([word, nb_word])

100%|██████████| 9841/9841 [00:00<00:00, 70366.74it/s]


In [48]:
print(data[:10])

[['awk', 'czyta'], ['awk', 'wejście'], ['czyta', 'awk'], ['czyta', 'wejście'], ['czyta', 'linia'], ['wejście', 'awk'], ['wejście', 'czyta'], ['wejście', 'linia'], ['wejście', 'po'], ['linia', 'czyta']]


In [49]:
word2int['awk']

5297

In [50]:
# function to convert numbers to one hot vectors
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = [] # input word
y_train = [] # output word

for data_word in tqdm(data):
    x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))
    y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))

100%|██████████| 73830/73830 [00:11<00:00, 6253.77it/s]


In [51]:
# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

print('done')

done


In [52]:
print(x_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [53]:
print(x_train.shape, y_train.shape)


(73830, 11522) (73830, 11522)


In [54]:
# making placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [55]:
EMBEDDING_DIM = 5 # you can choose your own number
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM])) # weights
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) # biases
hidden_representation = tf.add(tf.matmul(x,W1), b1)

In [56]:
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

In [57]:
# training
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) #make sure you do this!
# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 10
# train for n_iter iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

loss is :  16.865911
loss is :  16.2043
loss is :  15.628169
loss is :  15.129018
loss is :  14.699047
loss is :  14.330756
loss is :  14.017068
loss is :  13.751227
loss is :  13.526936
loss is :  13.338434


In [65]:
print(sess.run(W1))

[[ 1.1337816e+00  2.3279121e+00  8.9919491e-04 -2.0784948e+00
   5.1683038e-01]
 [ 8.4016436e-01  1.1743989e+00 -2.1808462e-02  1.2650520e-01
   1.0124886e+00]
 [ 9.9099040e-01  7.2792476e-01  5.6595099e-01  3.4744936e-01
   2.0102081e-01]
 ...
 [ 1.3402855e+00  1.1284367e+00  8.7614310e-01 -1.4204627e+00
  -1.1074501e+00]
 [-9.9164270e-02  8.5978395e-01  4.3816090e-01  1.8480009e+00
  -1.4754275e+00]
 [-4.1760433e-01 -1.4795698e-01 -1.1452730e+00 -3.9435998e-01
  -1.1138235e+00]]


In [66]:
vectors = sess.run(W1 + b1)

In [67]:
print(vectors[word2int['język'] ])

[-0.24391964 -1.3446827   0.08939427 -1.0694506   1.6355885 ]


In [68]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2)**2))

def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

In [69]:
print(int2word[find_closest(word2int['polska'], vectors)])


publikacja


In [70]:
from scipy import spatial
tree = spatial.KDTree(vectors)

In [93]:
v1 = vectors[word2int['mieć']]
v2 = vectors[word2int['miał']]
v3 = vectors[word2int['był']]
int2word[tree.query(v1-v2+v3)[1]]
# expected: być

'wersją'

In [94]:
v1 = vectors[word2int['ona']]
v2 = vectors[word2int['jej']]
v3 = vectors[word2int['jego']]
int2word[tree.query(v1-v2+v3)[1]]
# expected: on

'news'