This code is adapted from the Keras tutorial on using pre-trained word embeddings.

In [1]:
'''
GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

'''

import logging
import os
import pickle
import random
import sys
from collections import Counter

import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

from patentdata.corpus import USPublications
# Probably need to move the patentcorpus.py file into the main patentdata directory
from patentdata.models.patentcorpus import LazyPatentCorpus

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


Questions:
- What is our maximum word length? This becomes our maximum sequence length.  
*English words max out at around 30 characters with most words (95%+) being less than 15 in length.*
- How many words in a typical set of (patent) documents are out of dictionary?

Currently, most word-embedding approaches take a one-hot vector or dictionary index for a word and convert it into an n-dimensional vector (where n = 50-300).

One issue with this technique is out-of-dictionary or rare words. Sometimes these are typos; sometimes they contain information.

One question is whether we can output our word embeddings as the final hidden state of an RNN. This appears similar to the charLSTM described here: https://aclweb.org/anthology/D16-1157 .

In [2]:
# Root directory for downloaded data; an empty string keeps paths relative.
BASE_DIR = ''
# Folder holding the unpacked GloVe vectors. os.path.join supplies the path
# separator when BASE_DIR is non-empty and has no trailing slash, which plain
# string concatenation would silently omit.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B/')

We will start with the smallest GloVe vector (50d) and work up to larger dimensions as we test.

In [3]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.50d.txt')
# Read as bytes and decode explicitly (Python 3). The context manager closes
# the file even if a line fails to decode or parse.
with open(glove_path, 'rb') as f:
    for line in f:
        values = line.decode('utf-8').split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        # Each line is "<word> <coef> <coef> ...".
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [4]:
# Inspect the embedding vector stored for the token 'the'.
embeddings_index['the']

array([  4.18000013e-01,   2.49679998e-01,  -4.12420005e-01,
         1.21699996e-01,   3.45270008e-01,  -4.44569997e-02,
        -4.96879995e-01,  -1.78619996e-01,  -6.60229998e-04,
        -6.56599998e-01,   2.78430015e-01,  -1.47670001e-01,
        -5.56770027e-01,   1.46579996e-01,  -9.50950012e-03,
         1.16579998e-02,   1.02040000e-01,  -1.27920002e-01,
        -8.44299972e-01,  -1.21809997e-01,  -1.68009996e-02,
        -3.32789987e-01,  -1.55200005e-01,  -2.31309995e-01,
        -1.91809997e-01,  -1.88230002e+00,  -7.67459989e-01,
         9.90509987e-02,  -4.21249986e-01,  -1.95260003e-01,
         4.00710011e+00,  -1.85939997e-01,  -5.22870004e-01,
        -3.16810012e-01,   5.92130003e-04,   7.44489999e-03,
         1.77780002e-01,  -1.58969998e-01,   1.20409997e-02,
        -5.42230010e-02,  -2.98709989e-01,  -1.57490000e-01,
        -3.47579986e-01,  -4.56370004e-02,  -4.42510009e-01,
         1.87849998e-01,   2.78489990e-03,  -1.84110001e-01,
        -1.15139998e-01,

In [5]:
# Peek at the first 100 keys of the embeddings index.
list(embeddings_index.keys())[0:100]

['pierotti',
 'thie',
 'glenda',
 'cohocton',
 '1985-1993',
 'lavazza',
 'eliatamby',
 'sabarno',
 'mohammadpur',
 'zuria',
 'cted',
 'co-ruler',
 'rebelling',
 'duma',
 'hajia',
 'kryszalowicz',
 'camiones',
 'atlanta-based',
 'silvermane',
 'flügge',
 'basica',
 'remek',
 'physiatrists',
 'auvsi',
 'kulthum',
 '1879',
 'tawang',
 'darvill',
 'potsdamer',
 'sitte',
 'fonthill',
 'stf',
 'kawaihae',
 'nbc',
 'biocompatible',
 'interwar',
 'riversharks',
 'breds',
 '64.22',
 'rvs',
 'best-of-five',
 'djordje',
 'kagetora',
 'arisa',
 'musala',
 'kaklamanakis',
 'tartt',
 'tchicaya',
 'byte',
 'bayou',
 'intuitiveness',
 'edis',
 'female-only',
 'walternate',
 'pasteurized',
 'feare',
 'murambi',
 '1865',
 '27,250',
 'tracings',
 'ndt',
 'jetmaker',
 'eponyms',
 'bielawy',
 'aufdenblatten',
 'dumaine',
 'tsakonian',
 'doyle-murray',
 'cubbie',
 'resubmitting',
 'ivanovitch',
 'hieber',
 '38,900',
 'community-level',
 'okuyama',
 'kukla',
 'elabed',
 'non-unionized',
 '0-for-8',
 'ryszard

Embeddings are all lower case. In this list of 100, most are not relevant. It would be interesting to see how many patent words are in the dictionary and how many out-of-dictionary terms we have. We may want to generate a patent-specific set of embeddings.

In [6]:
# Initialise datasource that lazily loads the document data.
# This has to happen before the cache check below: the cache-miss branch
# queries `ds` for the records, so `ds` must already exist on a fresh kernel.
path = '/media/SAMSUNG1/Patent_Downloads'
ds = USPublications(path)

# Load our list of G06 records
PIK = "G06records.data"

if os.path.isfile(PIK):
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that were written by this notebook.
    with open(PIK, "rb") as f:
        print("Loading data")
        records = pickle.load(f)
        print("{0} records loaded".format(len(records)))
else:
    records = ds.get_records(["G", "06"])
    with open(PIK, "wb") as f:
        pickle.dump(records, f)

# Get data from 100 random records across the data.
# random.sample raises ValueError if asked for more items than exist, so cap
# the sample size at the number of available records.
sample_size = min(100, len(records))
records_random_sample = random.sample(records, sample_size)
print("Random sample of {0} records".format(len(records_random_sample)))
print(records_random_sample[0:5])

lzy = LazyPatentCorpus()
lzy.init_by_filenames(ds, records_random_sample)

Loading data
554570 records loaded
Random sample of 100 records
[(2530564, '2010/I20100923.ZIP', 'project/pdds/ICEApplication/I20100923/UTIL0242/US20100242046A1-20100923.ZIP'), (2775521, '2011/I20110623.tar', 'I20110623/UTIL0153/US20110153778A1-20110623.ZIP'), (1946766, '2008/I20081211.ZIP', 'project/pdds/ICEApplication/I20081211/UTIL0307/US20080307084A1-20081211.ZIP'), (2840466, '2011/I20110908.tar', 'I20110908/UTIL0218/US20110218704A1-20110908.ZIP'), (2601001, '2010/I20101209.tar', 'I20101209/UTIL0312/US20100312484A1-20101209.ZIP')]


In [7]:
# Reuse a pickled copy of the statistics when one exists; otherwise compute
# them from the lazy corpus and cache the result.
# Note stats may be for a different set of 10,000 records
filename = "first10kg06.pkl"

if not os.path.isfile(filename):
    stats = lzy.get_statistics()
    with open(filename, "wb") as cache_file:
        pickle.dump(stats, cache_file)
else:
    with open(filename, "rb") as cache_file:
        print("Loading data")
        stats = pickle.load(cache_file)
        print("Stats loaded")

Loading data
Stats loaded


Can get an unfiltered token dictionary as first item in lzy.get_statistics() (unfiltered_vocabulary).  

This will have a mixture of upper and lower case. We can thus post process to combine upper and lower case entries. 

In [8]:
def lowerise(counter_in, log_every=100):
    """ Convert a term counter in mixed case to a counter in lower case.
        Combine any entries that are the same after removing case.

        Parameters:
            counter_in: mapping of token -> count (e.g. a collections.Counter).
                It is only read, never modified.
            log_every: emit a progress log line on every `log_every`-th token.
                Pass 0 or None to disable progress logging. Defaults to 100.

        Returns:
            A new Counter keyed by the lower-cased tokens, where the counts of
            tokens that differ only by case have been summed.
    """
    filtered_counter = Counter()
    for i, (token, count) in enumerate(counter_in.items()):
        # A Counter yields 0 for missing keys, so += covers both the first
        # occurrence of a lower-cased token and later merges.
        filtered_counter[token.lower()] += count
        if log_every and (i % log_every) == 0:
            logging.info("Processing {0}th token - {1}".format(i, token))
    return filtered_counter

In [9]:
from collections import Counter

# Quick sanity check of lowerise() on a small mixed-case sample:
# show the counter before and after case folding.
test_list = list('aaabbcAAC')
c = Counter(test_list)
print(c)
c2 = lowerise(c)
print(c2)

Counter({'a': 3, 'A': 2, 'b': 2, 'C': 1, 'c': 1})
Counter({'a': 5, 'b': 2, 'c': 2})


In [10]:
# First element of the loaded statistics: the mixed-case token counts.
# NOTE(review): this displays the whole Counter; stats[0].most_common(20)
# would give a much shorter view.
stats[0]

Counter({'communize': 2,
         'maintenance/check': 1,
         '1,108,125': 2,
         'S1218': 12,
         'ADF': 123,
         'Mixer/amplifier': 1,
         'L2V0F2': 2,
         '35.489': 1,
         '72.884': 1,
         '5,987,425': 1,
         'DoseIndex_For_Case_xxx': 2,
         '8.52': 1,
         'fiis': 1,
         '230B': 57,
         'cted': 1,
         'siganalname3': 1,
         'media/transport': 4,
         'tobe': 4,
         'Korfmacher': 2,
         '0k': 3,
         'FrameOvf': 1,
         'operation/processing': 1,
         '60/265,875': 1,
         '1879': 13,
         'positioners': 34,
         'ST826': 1,
         'IWBWrap1': 2,
         '1/RC': 1,
         '09/327,966': 2,
         'nanoprocessors': 4,
         'Subscriptions': 21,
         'old_posM1': 1,
         'row-pulse': 2,
         'pricelist': 2,
         'Cherokee': 5,
         'codeCWC001.class': 1,
         'same-direction': 1,
         'Characters': 94,
         '13X.XXX.2.13': 1,
        

For initial learning - maybe take 100,000 most common tokens as data is very noisy.

Issues:
- Lots of numbers as separate tokens;
    - Also mixture of digits and symbols e.g. '7\*29,352\*29' or '3-5-3' or '0..255'.
    - Patent numbers of various types.
    - Reference numerals such as '100', '74a', '300a'.
- Variable names such as 'c1C.sub.r' or 'cfgFile';
- Web addresses or paths such as '//sahara.biospace.com/idev_cybermall/plsql'.

Do we filter these out of the learning?

Most of these are dataset specific.

May want to perform the following processing:
- punctuation split;
- replace patent numbers.

Can we use POS tags to identify numbers?

Using embeddings for numbers seems foolish - each number is treated as a separate token. But embedding as \_NUMBER\_ also seems to lose some of the information in the number.


In [11]:
# Materialise the distinct mixed-case tokens, preview the first 20 and
# report how many there are in total.
unique_tokens = list(stats[0])
print(unique_tokens[:20])

print(len(unique_tokens))

['communize', 'maintenance/check', '1,108,125', 'S1218', 'ADF', 'Mixer/amplifier', 'L2V0F2', '35.489', '72.884', '5,987,425', 'DoseIndex_For_Case_xxx', 'feedback/measurements', 'support/laser', 'CurMorph.vert_arr', 'cted', 'siganalname3', 'media/transport', 'sender-information', 'Korfmacher', '0k']
464724


In [12]:
# Merge case variants of the mixed-case token counts into lower-case counts.
lower_counts = lowerise(stats[0])

In [13]:
# Display the lower-cased token counts.
lower_counts

Counter({'request.form': 3,
         'communize': 2,
         'maintenance/check': 1,
         '1,108,125': 2,
         'ft/f2': 3,
         'yhi': 1,
         'medical/health': 3,
         '35.489': 1,
         '72.884': 1,
         '5,987,425': 1,
         'feedback/measurements': 1,
         'welbilt': 2,
         'cted': 1,
         'siganalname3': 1,
         'media/transport': 4,
         'tobe': 4,
         'dt2e': 3,
         'animationnode': 2,
         '0k': 6,
         'ho-4': 7,
         's1425': 7,
         'operation/processing': 1,
         '60/265,875': 1,
         '1879': 13,
         'positioners': 34,
         'drivera': 1,
         'systemout': 1,
         '15v': 2,
         'nbc': 54,
         '09/327,966': 2,
         'nanoprocessors': 4,
         '5,734,152': 1,
         'pricelist': 2,
         'rvs': 3,
         'requiredresources': 6,
         'same-direction': 1,
         '49.607': 2,
         "billers'statements": 1,
         'causal-based': 1,
         'nt*

In [14]:
# Compare how the differently-cased spellings of "the" appear before and
# after lower-casing the vocabulary.
case_variants = ('the', 'The', 'THe', 'THE')

# Mixed-case vocabulary: membership first, then the raw counts.
for variant in case_variants:
    print(variant in stats[0])
for variant in case_variants:
    print(stats[0][variant])

# Lower-cased vocabulary: membership, then the merged count for 'the'.
for variant in case_variants:
    print(variant in lower_counts)
print(lower_counts['the'])


True
True
True
True
7683723
800541
2
1116
True
False
False
False
8485384


In [15]:
# Sum of the counts printed above for 'the', 'The', 'THe' and 'THE' in
# stats[0], for comparison with lower_counts['the'].
7683723+800541+2+1116

8485382

In [16]:
# Vocabulary size before and after merging case variants.
n_mixed_case = len(stats[0])
n_lower_case = len(lower_counts)
print("Number of original mixed case tokens: {0}".format(n_mixed_case))
print("Number of lower case tokens: {0}".format(n_lower_case))

Number of original mixed case tokens: 464724
Number of lower case tokens: 410697


In [19]:
# Vocabulary overlap: lower-cased patent tokens that also have a GloVe vector.
shared_terms = [term for term in lower_counts if term in embeddings_index]
total = len(shared_terms)
not_shared = len(lower_counts) - total

print("There are {0} terms in our patent count that are in the GloVe embeddings".format(total))
print("There are {0} terms in our patent count that are NOT in the GloVe embeddings".format(not_shared))
print("A sample of shared terms: {0}".format(random.sample(shared_terms, 100)))

There are 77364 terms in our patent count that are in the GloVe embeddings
There are 333333 terms in our patent count that are NOT in the GloVe embeddings
A sample of shared terms: ['46.57', '4.45', 'x87', 'ambiguity', 'up-down', 'diadem', 'decommissioning', 'gof', 'affluent', 'vhf', '1476', 'assimilated', 'aprs', '43.35', 'tmds', 're-appear', 'lsn', 'rollerblade', 'illusion', 'smg', 'lc3', 'dvd-ram', 'avows', 'clan', 'telemarketer', '91.7', 'hinote', 'grail', 'ajka', 'otcbb', 'locater', 'krawczyk', 'impermissible', 'cushioning', 'abc', 'plunkett', '2890', 're-writing', 'am1', '2220', 'slamon', 'gritz', '83.58', 'rinsing', 'edmondson', 'branching', 'alcs', 'substrings', 'counteroffers', 'rans', '050', 'immature', 'reduces', 'arash', 'power-law', 'etf', '1737', 'creatives', '60.57', 'wounds', 'warsaw', 'xray', 'agronomist', 'comprehend', 'duels', 'limitless', 'conformably', 'convolutional', 'serendipitous', 'buckle', '.1', 'duryea', 'protesting', 'formulary', 'autoclave', 'cryptosystems

In [20]:
# Draw another random sample of 100 shared terms for inspection.
random.sample(shared_terms, 100)

['rifai',
 'amalgams',
 'wash',
 'airfare',
 'liquified',
 'demographically',
 'delimiter',
 'cuvette',
 'succinic',
 'plf',
 'open-source',
 'visitable',
 'fulfillment',
 'am3',
 'ya',
 'scpc',
 'solidification',
 'coordinator',
 'v-4',
 '2216',
 'svr',
 'torok',
 'decision-makers',
 'redundantly',
 'hla',
 'soeder',
 'one-by-one',
 'plug-and-play',
 '42.92',
 'color-coded',
 '45.14',
 'seafood',
 'levinthal',
 '125-1',
 'beli',
 'become',
 'pusic',
 'synergies',
 'similar',
 'psos',
 'pecuniary',
 'brinch',
 'unravels',
 'bathes',
 '2181',
 'embraces',
 'vois',
 'hp-ux',
 'lrs',
 'palettes',
 'provo',
 'indispensable',
 'equitation',
 'vil',
 'expense',
 'fbs',
 'arty',
 'everywhere',
 'pilot',
 '93.33',
 'symptomatic',
 'undisputed',
 'climates',
 'expectancies',
 'kraemer',
 'well-timed',
 'res',
 'lace',
 '72.1',
 '5-0',
 'tempered',
 'puccini',
 'caglayan',
 'overtones',
 'masaoka',
 'kotler',
 'mbw',
 'dispatching',
 'offset',
 'cautiously',
 'ofe',
 '80c',
 'blooded',
 'yoshiha

In [21]:
# Most frequent lower-cased token together with its count.
lower_counts.most_common(1)

[('the', 8485384)]

In [23]:
# How many of the 100,000 most frequent patent terms also have a GloVe vector?
# The result keeps the frequency ordering produced by most_common().
common_terms = [
    term
    for term, _ in lower_counts.most_common(100000)
    if term in embeddings_index
]
total = len(common_terms)

print("There are {0} of the top 100,000 terms in our patent count that are in the GloVe embeddings".format(total))

There are 45298 of the top 100,000 terms in our patent count that are in the GloVe embeddings


It looks like we will need to generate our own set of word embeddings! Maybe we can use gensim to do this.

However, can we use the overlapping terms to seed our weights?

In [24]:
# Display the shared terms; they were collected while iterating most_common(),
# so they appear from most to least frequent in the patent counts.
common_terms

['the',
 ',',
 '.',
 'a',
 'of',
 'to',
 'and',
 'in',
 'is',
 'for',
 ')',
 '(',
 'data',
 'be',
 'an',
 'as',
 'by',
 'or',
 'that',
 'information',
 'with',
 'are',
 'from',
 'on',
 'fig',
 'system',
 'said',
 'which',
 ';',
 'at',
 'may',
 'user',
 'can',
 'one',
 'this',
 'step',
 'invention',
 'image',
 'each',
 'it',
 '1',
 'device',
 'not',
 'first',
 'method',
 'such',
 'if',
 'present',
 'processing',
 'when',
 'server',
 'wherein',
 'computer',
 'memory',
 'unit',
 'claim',
 'embodiment',
 ':',
 'other',
 'example',
 'control',
 'second',
 'means',
 'according',
 'number',
 'signal',
 'has',
 'network',
 'time',
 'process',
 'used',
 '2',
 'will',
 'program',
 'value',
 'shown',
 'also',
 'input',
 'set',
 'then',
 'further',
 'apparatus',
 'object',
 'display',
 'more',
 'address',
 'using',
 'into',
 'operation',
 'stored',
 'storage',
 'application',
 'code',
 "'s",
 'described',
 'block',
 '3',
 'between',
 'output',
 'includes',
 'circuit',
 'any',
 'plurality',
 'servi

In [None]:
# Tutorial dataset location, left disabled (presumably until the patent data
# loader replaces the newsgroup loader -- TODO confirm).
#TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000  # sequences are padded/truncated to this many tokens
MAX_NB_WORDS = 20000        # vocabulary size cap passed to the tokenizer
# Must match the dimensionality of the GloVe file that is loaded
# (glove.6B.50d.txt -> 50). A mismatch makes filling the embedding matrix
# fail, because a 50-d vector cannot be assigned to a 100-d row.
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2      # fraction of samples held out for validation

In [None]:
# Replace below with lines to get patent data

# NOTE(review): TEXT_DATA_DIR only exists as a commented-out assignment in the
# configuration cell, so it must be defined before this cell can run.

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Python 2's built-in open() has no `encoding` argument.
open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    # Use a dedicated name so the data-source `path` set earlier in the
    # notebook is not overwritten.
    class_dir = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(class_dir):
        # Each sub-directory is one label; ids are assigned in sorted order.
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(class_dir)):
            if fname.isdigit():
                fpath = os.path.join(class_dir, fname)
                # The context manager closes the file even if read() raises.
                with open(fpath, **open_kwargs) as f:
                    t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))

In [None]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Replace word_index 

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# (shuffle samples and labels with the same permutation so they stay aligned)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice at an explicit position: negative slicing (data[:-n]) would return an
# empty training set and put every sample in validation when n == 0.
split_at = data.shape[0] - num_validation_samples

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [None]:
print('Preparing embedding matrix.')

# prepare embedding matrix
# Keras Tokenizer word indices start at 1 (index 0 is reserved), so the matrix
# needs len(word_index) + 1 rows to hold the highest index when the vocabulary
# is smaller than MAX_NB_WORDS.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
# NOTE(review): every GloVe vector must have exactly EMBEDDING_DIM components.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        # Index falls outside the capped vocabulary.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

In [None]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three convolution + max-pooling stages; the final pooling window is 35.
conv_1 = Conv1D(128, 5, activation='relu')(embedded_sequences)
pool_1 = MaxPooling1D(5)(conv_1)
conv_2 = Conv1D(128, 5, activation='relu')(pool_1)
pool_2 = MaxPooling1D(5)(conv_2)
conv_3 = Conv1D(128, 5, activation='relu')(pool_2)
pool_3 = MaxPooling1D(35)(conv_3)

# Dense classifier head with one softmax output per label.
flattened = Flatten()(pool_3)
hidden = Dense(128, activation='relu')(flattened)
preds = Dense(len(labels_index), activation='softmax')(hidden)

model = Model(sequence_input, preds)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          epochs=10,
          batch_size=128)