# Keras Testing

To start out we will implement the text generation example from https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py . We can then begin to adapt this once we understand the dynamics of keras.

In [3]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os
import pickle

Using TensorFlow backend.


The keras example has ~ 600k characters. What number of patent documents do we need to recreate this number?  

10,000 patent documents give you ~600 million characters, i.e. ~60k characters per document. 600M / 600k = 1,000, so we would only need 10,000 / 1,000 = 10 patent documents to provide ~600k characters.

Also, the Keras example only has 60 distinct characters, all lower case.

In [4]:
# Pickle file used to cache the list of G06 records between runs
PIK = "G06records.data"

if not os.path.isfile(PIK):
    # No cache file yet: query the records and pickle them for next time.
    # NOTE(review): `ds` is not defined in any cell above this one; it is
    # created in a later cell (ds = USPublications(path)), which has to be
    # run first for this branch to work.
    records = ds.get_records(["G", "06"])
    with open(PIK, "wb") as f:
        pickle.dump(records, f)
else:
    # Cache file present: read the record list back from it
    with open(PIK, "rb") as f:
        print("Loading data")
        records = pickle.load(f)
        print("{0} records loaded".format(len(records)))

Loading data
554570 records loaded


In [5]:
# Draw 100 records at random (without replacement) and show the first five
records_random_sample = random.sample(records, k=100)
print("Random sample of {0} records".format(len(records_random_sample)))
print(records_random_sample[:5])

Random sample of 100 records
[(875490, '2005/I20050616.ZIP', './I20050616/UTIL0132/US20050132014A1-20050616.ZIP'), (3257714, '2013/I20130502.tar', 'I20130502/UTIL0107/US20130107322A1-20130502.ZIP'), (3520859, '2013/I20131010.tar', 'I20131010/UTIL0266/US20130266193A1-20131010.ZIP'), (3694334, '2014/I20140410.tar', 'I20140410/UTIL0098/US20140098480A1-20140410.ZIP'), (3944527, '2015/I20150219.tar', 'I20150219/UTIL0052/US20150052046A1-20150219.ZIP')]


In [8]:
from patentdata.corpus import USPublications
# Probably need to move the patentcorpus.py file into the main patentdata directory
from patentdata.models.patentcorpus import LazyPatentCorpus

# Directory handed to USPublications as the location of the patent downloads
path = '/media/SAMSUNG1/Patent_Downloads'
ds = USPublications(path)

# Build a corpus object from the data source and the random sample of records.
# NOTE(review): presumably this only registers the records and defers reading
# them until text is requested — confirm in patentcorpus.py.
lzy = LazyPatentCorpus()
lzy.init_by_filenames(ds, records_random_sample)

In [10]:
# Peek at the first 1000 characters of the description text
text[:1000]

'The invention relates to a method of controlling a touch-surface control device characterized in that it comprises a step of shape recognition of a control trajectory on a touch surface from among at least two predefined shapes of trajectory, in which in the course of a predetermined duration (dT): the control trajectory is sampled in order to determine a sampled angle (dθ) of the control trajectory for each sampling period (Te), a parameter representative of the evolution of at least two sampled angles (dθ) is compared with a predetermined threshold, and a predefined trajectory shape is assigned to the control trajectory as a function of the result of the comparison. The invention also relates to a control device comprising a tough surface characterized in that it comprises a processing unit for implementing a control method as described above.\nThe present invention relates to a method of controlling a touch-surface control device. The invention also relates to a corresponding touch

In [22]:
# Pickle file caching the description text gathered from 100 random G06 records
PIK = "G06desc_text_100.data"

if not os.path.isfile(PIK):
    # No cache file yet: convert the random sample of records into one long
    # text string (takes a while), then pickle it for next time
    text = lzy.get_description_text()
    with open(PIK, "wb") as f:
        pickle.dump(text, f)
else:
    # Cache file present: read the text back from it
    with open(PIK, "rb") as f:
        print("Loading data")
        text = pickle.load(f)
        print("Text data of length {0}".format(len(text)))

Loading data
Text data of length 5809183


In [24]:
# Display the list of distinct characters
chars

['\n',
 ' ',
 '!',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '§',
 '©',
 '®',
 '°',
 '±',
 '·',
 '½',
 '×',
 'é',
 'ö',
 '÷',
 '˜',
 'Δ',
 'Σ',
 'Ω',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'θ',
 'κ',
 'λ',
 'μ',
 'ν',
 'π',
 'σ',
 'τ',
 '\u2002',
 '\u2003',
 '\u2009',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '•',
 '…',
 '′',
 '″',
 '\u2061',
 '\u2062',
 '™',
 'ⅇ',
 '⅓',
 '⅔',
 '⅛',
 '←',
 '↑',
 '→',
 '↓',
 '⇀',
 '∀',
 '∂',
 '∈',
 '∑',
 '−',
 '∥',
 '∫',
 '≅',
 '≈',
 '≠',
 '≤',
 '≦',
 '≧',
 '⊂',
 '⊕',
 '⌊',
 

The original data set has 175 distinct characters, which may be too large for the present example. Instead, we convert the text to lower case and keep only a small subset of characters.

In [25]:
import string

# Whitelist: lower-case letters, digits, and a few punctuation/whitespace
# characters (space, full stop, comma, colon, semicolon, brackets, newline)
characters_to_keep = string.ascii_lowercase + string.digits + " .,:;()\n"
print(characters_to_keep)
print("Characters to keep are of length: {0}".format(len(characters_to_keep)))

abcdefghijklmnopqrstuvwxyz0123456789 .,:;()

Characters to keep are of length: 44


In [26]:
# Lower-case the text, then replace every character that is not in
# characters_to_keep with a single space
text = text.lower()
text = "".join(" " if c not in characters_to_keep else c for c in text)

In [27]:
# Peek at the first 1000 characters of the text
text[:1000]

'the invention relates to a method of controlling a touch surface control device characterized in that it comprises a step of shape recognition of a control trajectory on a touch surface from among at least two predefined shapes of trajectory, in which in the course of a predetermined duration (dt): the control trajectory is sampled in order to determine a sampled angle (d ) of the control trajectory for each sampling period (te), a parameter representative of the evolution of at least two sampled angles (d ) is compared with a predetermined threshold, and a predefined trajectory shape is assigned to the control trajectory as a function of the result of the comparison. the invention also relates to a control device comprising a tough surface characterized in that it comprises a processing unit for implementing a control method as described above.\nthe present invention relates to a method of controlling a touch surface control device. the invention also relates to a corresponding touch

In [30]:
# Sorted list of the distinct characters that occur in the text
chars = sorted(set(text))
print('total chars:', len(chars))
# Lookup tables in both directions: character -> index and index -> character
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 44


In [28]:
# Slide a window of maxlen characters over the text, advancing by `step`
# characters each time, so that consecutive windows overlap
maxlen = 40
step = 3
sentences = []   # text segments, each maxlen characters long
next_chars = []  # character occurring right after each segment (ground truth label)
for i in range(0, len(text) - maxlen, step):
    window_end = i + maxlen
    sentences.append(text[i:window_end])
    next_chars.append(text[window_end])
print('nb sequences:', len(sentences))

nb sequences: 1936381


In [29]:
# Show the first five segments alongside the character that follows each one
print(sentences[:5])
print(next_chars[:5])

['the invention relates to a method of con', ' invention relates to a method of contro', 'vention relates to a method of controlli', 'tion relates to a method of controlling ', 'n relates to a method of controlling a t']
['t', 'l', 'n', 'a', 'o']


In [31]:
print('Vectorization...')
# One-hot encode the inputs and the targets:
#   X[i, t, k] is True when character t of sentence i has vocabulary index k
#   y[i, k]    is True when the character following sentence i has index k
# The builtin `bool` is used as the dtype. The `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError;
# `bool` gives the same array dtype on every NumPy version.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # One hot encoding of character index
        X[i, t, char_indices[char]] = 1
    # comparison data - one hot encoding of next character
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [32]:
# Inspect the one-hot encoding of the first five sequences
X[:5]

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False,  True, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, 

In [33]:
# build the model: a single LSTM
print('Build model...')
# Layers, in order:
#   - LSTM with hidden dimension 128; each input has shape (maxlen, len(chars)),
#     i.e. one one-hot vector per character of the segment
#   - Dense (regular densely connected NN layer) with one output per
#     character in the vocabulary
#   - softmax over the output of the densely connected layer
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

Build model...


In [34]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by a temperature.

    Each probability p is rescaled to p ** (1 / temperature) and the result is
    renormalised, then one index is drawn from that distribution. Values
    below 1 sharpen the distribution towards its most likely entry; values
    above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Rescaling factor. A value of 0 is treated as the
            limiting case and returns the most likely index without sampling.

    Returns:
        The sampled index into ``preds`` (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the reweighting as temperature -> 0; avoids dividing by zero
        return np.argmax(preds)
    # log(0) is -inf, which exp() below maps back to a probability of 0, so
    # the divide-by-zero warning for zero probabilities is just noise
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. This leaves the normalised result
    # unchanged but stops every exp() underflowing to 0 (and the division
    # below becoming 0/0) when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Train the model one epoch at a time and print generated text after each
# epoch. range(1, 60) runs iterations 1 to 59.
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y,
              batch_size=128,
              epochs=1)

    # Pick a random start index in the text; the same seed segment is reused
    # for every diversity value below
    start_index = random.randint(0, len(text) - maxlen - 1)

    # diversity is passed to sample() as its temperature argument, which
    # divides the log-probabilities before they are renormalised and sampled
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        # Seed: maxlen characters of the text starting at the random index
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        # Generate 400 characters, one prediction per character
        for i in range(400):
            # x is a one hot encoding of the characters in the current
            # maxlen segment (a batch containing a single sequence)
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            # Perform prediction based on x; [0] takes the result for the
            # single sequence in the batch
            preds = model.predict(x, verbose=0)[0]
            # Get next index using sample function
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            # Slide the segment: drop its first character, append the new one
            sentence = sentence[1:] + next_char

            # Write each character as soon as it is generated
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: "real time data.
selecting a module assoc"
real time data.
selecting a module assoc            t              n   r    n r           o  o          t     t               r          tt   t      rt    r    tro    t   t   t            tr          r            tn   r    n    t   o  t      o         r   t   trn             r    r       tt  t                                t t  t                t t r           t              t t         t     t      r              r      n r           

----- diversity: 0.5
----- Generating with seed: "real time data.
selecting a module assoc"
real time data.
selecting a module assoc   o rt e c r oir rrionnr eynr trs a n coorstua ittee nt roreoa rto etrt nt r monootato or r npo nr    ate es nr    otrtatot e  ta  r  ort l rts   rt    t er e, no er tte  non epantnt  r t   e a   r   tr a   toer tne o  oorvtr otnrnrrortne t  rro  rd p o ee,n