In [1]:
# Enable IPython's autoreload extension so edited modules can be re-imported without restarting the kernel
%load_ext autoreload
# Import `main` and register it as a module that autoreload should track
%aimport main
# Mode 1: before executing each cell, reload only the modules registered with %aimport
%autoreload 1

import os

import tensorflow as tf
import keras

from main import *

Using TensorFlow backend.


In [2]:
# Build a standalone embedding layer and inspect a single lookup plus the
# variables the layer owns.
emb = Embedding(10,10)
emb.build(None)
# The embedding matrix has 10 rows, so the valid ids are 0..9. Looking up id 10
# only builds a graph node here, but it would index past the end of the matrix
# as soon as the tensor is evaluated, so use the last valid id instead.
print(emb.call(9))
print(emb.trainable_variables)

Tensor("embedding_lookup/Identity:0", shape=(10,), dtype=float32)
[<tf.Variable 'embedding/embedding:0' shape=(10, 10) dtype=float32_ref>]


In [3]:
# Root directory of the Universal Dependencies treebanks. It defaults to the
# original download location, and can be overridden with the UD_TREEBANKS_DIR
# environment variable so the notebook also runs on other machines.
treebanks_dir = os.environ.get(
    "UD_TREEBANKS_DIR", "/home/martoko/Downloads/ud/ud-treebanks-v2.3"
)
path = os.path.join(treebanks_dir, "UD_German-PUD", "de_pud-ud-test.conllu")
sentences = main.Sentences(path)

In [4]:
# Show the sentence length reported by the loader, followed by the first five
# words and the first five tags of sentence 25.
print("### SENTENCE LENGTH ###", sentences.sentence_length, sep="\n")
print("### SAMPLE WORDS ###", sentences.words()[25][:5], sep="\n")
print("### SAMPLE TAGS ###", sentences.tags()[25][:5], sep="\n")

### SENTENCE LENGTH ###
56
### SAMPLE WORDS ###
['Osborne', 'meldete', 'sich', 'bei', 'einer']
### SAMPLE TAGS ###
['PROPN', 'VERB', 'PRON', 'ADP', 'DET']


In [5]:
# For the first two groups returned by padded_words() / padded_tags(), print
# only the first two characters of every entry.
print(
    "### PADDED SAMPLE WORDS ###",
    [[token[:2] for token in row] for row in sentences.padded_words()[:2]],
    sep="\n",
)
print(
    "### PADDED SAMPLE TAGS ###",
    [[label[:2] for label in row] for row in sentences.padded_tags()[:2]],
    sep="\n",
)

### PADDED SAMPLE WORDS ###
[['„', 'Ei', 'Gr', 'de', 'di', 'Üb', 'is', 'fü', 'di', 'Ve', 'St', 'ne', ',', 'ei', 'fr', 'Ma', 'hi', 'ni', '“', ',', 'sc', 'Ob', 'So', 'Ko', 'Sc', 'am', 'an', 'de', 'Mo', 'in', 'ei', 'Bl', '.', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__'], ['Fü', 'al', ',', 'di', 'So', '-', 'Me', '-', 'Üb', 'au', 'de', 'Ca', 'Hi', 've', ',', 'wi', 'di', 'Üb', 'ei', 'we', 'an', 'se', '.', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__']]
### PADDED SAMPLE TAGS ###
[['PU', 'DE', 'NO', 'DE', 'AD', 'NO', 'AU', 'AD', 'DE', 'AD', 'NO', 'AD', 'PU', 'DE', 'AD', 'NO', 'AD', 'AD', 'PU', 'PU', 'VE', 'PR', 'NO', 'PR', 'PR', '_', 'AD', 'DE', 'NO', 'AD', 'DE', 'NO', 'PU', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '__', '

In [6]:
# For each data split, print a header, then the number of word-id rows and
# tag-id rows, then a blank separator line.
split_sizes = (
    ("### TRAINING DATA LENGTH ###", sentences.training_word_ids, sentences.training_tag_ids),
    ("### VALIDATION DATA LENGTH ###", sentences.validation_word_ids, sentences.validation_tag_ids),
    ("### TEST DATA LENGTH ###", sentences.test_word_ids, sentences.test_tag_ids),
)
for split_header, split_word_ids, split_tag_ids in split_sizes:
    print(split_header)
    print(len(split_word_ids), len(split_tag_ids))
    print()

### TRAINING DATA LENGTH ###
900 900

### VALIDATION DATA LENGTH ###
50 50

### TEST DATA LENGTH ###
50 50



In [7]:
# Put the readable form of test sentence 3 next to its integer encoding
# (first five entries each), for both words and tags.
print(
    "### SAMPLE WORDS ###",
    sentences.test_words()[3][:5],
    sentences.test_word_ids[3][:5],
    sep="\n",
    end="\n\n",
)
print(
    "### SAMPLE TAGS ###",
    sentences.test_tags()[3][:5],
    sentences.test_tag_ids[3][:5],
    sep="\n",
    end="\n\n",
)

# First five (key, id) pairs of each lookup table, plus the word count.
print("### WORD DICTIONARY SAMPLE ###")
print(list(sentences.id_by_word.items())[:5])
print(sentences.word_count, "words", end="\n\n")

print("### TAG DICTIONARY SAMPLE ###")
print(list(sentences.id_by_tag.items())[:5], end="\n\n")

### SAMPLE WORDS ###
['Ein', 'Element', ',', 'das', 'diese']
[2, 1216, 13, 83, 175]

### SAMPLE TAGS ###
['DET', 'NOUN', 'PUNCT', 'PRON', 'DET']
[2, 3, 1, 12, 2]

### WORD DICTIONARY SAMPLE ###
[('__unk__', 0), ('„', 1), ('Ein', 2), ('Großteil', 3), ('des', 4)]
6185 words

### TAG DICTIONARY SAMPLE ###
[('__unk__', 0), ('PUNCT', 1), ('DET', 2), ('NOUN', 3), ('ADJ', 4)]



In [8]:
from tensorflow import Session

# Embed a short list of ids that contains repeats, so the printed matrix shows
# that equal ids produce identical rows.
embedding_dimensions = 5
# Named `sample_ids` rather than `input` so the builtin `input()` is not shadowed
# for the rest of the notebook.
sample_ids = [1,1,1,2,3,4,5]
embedding = Embedding(len(sample_ids), embedding_dimensions)(sample_ids)

model = embedding
print("### SHAPE ###")
print("Shape: (ENTRIES, DIMENSIONS)")
print("Shape:", model.shape)
print()

print("### EMBEDDING EXAMPLE ###")
# The context manager closes the session even if initialization or evaluation
# raises, so no session is leaked when the cell fails.
with Session() as session:
    # Initialize uninitialized variables in layers
    # In this case just the embedding layer
    session.run(tf.global_variables_initializer())
    print(session.run(model))

### SHAPE ###
Shape: (ENTRIES, DIMENSIONS)
Shape: (7, 5)

### EMBEDDING EXAMPLE ###
[[-0.33783126 -0.3548144  -0.38183865 -0.53107405  0.41663224]
 [-0.33783126 -0.3548144  -0.38183865 -0.53107405  0.41663224]
 [-0.33783126 -0.3548144  -0.38183865 -0.53107405  0.41663224]
 [-0.01512766 -0.4023714  -0.66070426 -0.60552883 -0.19932711]
 [-0.5685188   0.42804068  0.6346535  -0.13262516 -0.5055536 ]
 [-0.02324939 -0.70452875 -0.16063023 -0.39289427 -0.2674307 ]
 [-0.25600165 -0.5676973  -0.3175895  -0.5494278   0.46528214]]


In [11]:
from keras.layers import Dense

# Our input to the model
word_ids = sentences.training_word_ids
sentence_length = sentences.sentence_length

# The first part of the model translates ids to vectors
embedding_dimensions = 16
embedding = Embedding(sentences.word_count, embedding_dimensions)(word_ids)
print("### EMBEDDING SHAPE ###")
print("Embedding shape: (SENTENCES, WORDS, DIMENSIONS)")
print("Embedding shape:", embedding.shape)
print()

# The second part is a single dense layer with one output unit per tag,
# applied to every word vector
dense = Dense(sentences.tag_count)(embedding)
print("### DENSE SHAPE ###")
print("Dense shape: (SENTENCES, WORDS, DIMENSIONS)")
print("Dense shape:", dense.shape)
print()

print("### RUN ###")
# `tf.Session` is used directly so this cell does not depend on a `Session`
# name imported in another cell; the context manager closes the session even
# if initialization or evaluation raises.
with tf.Session() as session:
    # Initialize uninitialized variables in layers
    # Here both the embedding layer and the dense layer own variables
    session.run(tf.global_variables_initializer())
    print(session.run(dense))

### EMBEDDING SHAPE ###
Embedding shape: (SENTENCES, WORDS, DIMENSIONS)
Embedding shape: (900, 56, 16)

### DENSE SHAPE ###
Dense shape: (SENTENCES, WORDS, DIMENSIONS)
Dense shape: (900, 56, 19)

### RUN ###
[[[-5.09209372e-03  1.50791300e-03 -4.93746251e-03 ... -1.89534873e-02
    1.36761302e-02 -4.72974638e-03]
  [ 9.46029276e-03  5.66457910e-03 -5.51265851e-03 ... -3.83946998e-03
   -1.07055688e-02 -2.00245306e-02]
  [ 8.24543461e-03 -2.45409319e-03 -2.57360619e-02 ...  1.64704807e-02
    3.19728479e-02  2.35756561e-02]
  ...
  [-6.99469727e-03 -1.94459073e-02 -1.61688332e-03 ... -3.60888150e-03
   -1.81634948e-02 -9.54899099e-03]
  [-6.99469727e-03 -1.94459073e-02 -1.61688332e-03 ... -3.60888150e-03
   -1.81634948e-02 -9.54899099e-03]
  [-6.99469727e-03 -1.94459073e-02 -1.61688332e-03 ... -3.60888150e-03
   -1.81634948e-02 -9.54899099e-03]]

 [[-2.66476944e-02  1.64068770e-02 -1.69292446e-02 ... -6.20297249e-03
   -7.58355483e-04  8.49461742e-03]
  [ 1.61188329e-03  5.35426149e-03 

In [10]:
# Point `model` at `emb`, the layer instance created earlier with Embedding(10,10).
model = emb
# Disabled LSTM experiment, kept for reference only (none of it is executed).
# NOTE(review): `Lstm` is presumably provided by `main` -- confirm before re-enabling.
# lstm = Lstm(num_units=10)
# lstm.call(id_by_word, lstm.zero_state(3, dtype=tf.float32))
# lstm.output_shape