In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%env KAGGLE_USERNAME=
%env KAGGLE_KEY=

In [None]:
!kaggle datasets download -d danielwillgeorge/glove6b100dtxt -p data/ --unzip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

plt.style.use("fivethirtyeight")


In [None]:
# Seed passed to the shuffle that precedes the train/validation split.
RANDOM_STATE = 50
# Maximum number of epochs handed to model.fit.
EPOCHS = 150
# Batch size handed to model.fit.
BATCH_SIZE = 2048
# Number of tokens in each input sequence; the token that follows is the label.
TRAINING_LENGTH = 50
# Share of the shuffled examples used for training; the rest is validation.
TRAIN_FRACTION = 0.7
# Number of units in each LSTM layer of the model.
LSTM_CELLS = 64
# Verbosity level forwarded to model.fit.
VERBOSE = 1
# Whether make_callbacks adds a ModelCheckpoint callback.
SAVE_MODEL = True

In [None]:
# Read the patent table, parse the date column and discard every row that
# contains a missing value.
data = pd.read_csv(
    "data/machine-learning.csv",
    parse_dates=['patent_date'],
).dropna()
len(data)


In [None]:
from datetime import datetime

# Bucket every patent into the first day of the month of its patent_date.
# The period round-trip is vectorised, so no per-row Python loop is needed.
data["year-month"] = data["patent_date"].dt.to_period("M").dt.to_timestamp()

# Count patents per month and plot the resulting series.
monthly = data.groupby("year-month")["patent_number"].count()
monthly.plot(figsize=(16, 8))
plt.ylabel("Number of patents")
plt.xlabel("Date")
plt.title("Machine learning patents over time")


In [None]:
# Year of each patent, taken directly from the datetime accessor.
data["year"] = data["patent_date"].dt.year

# Count patents per year and show them as a bar chart.
yearly = data.groupby("year")["patent_number"].count()
yearly.plot.bar(
    color="red",
    edgecolor="k",
    figsize=(16, 8),
)
plt.xlabel("Year")
plt.ylabel("Number of Patents")
# Title matches the dataset (machine-learning.csv) and the monthly plot.
plt.title("Machine learning patents by year")


In [None]:
import re


def format_patent(patent):
    """Prepare a patent abstract for word-level tokenisation.

    Three substitutions are applied in order:

    1. A space is inserted in front of ``.``, ``,``, ``;`` and ``?`` whenever
       the preceding character is neither whitespace nor a digit, so decimal
       numbers such as ``3.5`` stay intact.
    2. Parenthesised integers such as ``(12)`` are removed (presumably
       figure/reference markers).
    3. Every run of two or more whitespace characters is collapsed into a
       single space.

    Args:
        patent: The abstract text as a string.

    Returns:
        The reformatted string.
    """

    # Split punctuation off the word it is attached to
    patent = re.sub(r"(?<=[^\s0-9])(?=[.,;?])", r" ", patent)

    # Remove parenthesised numbers
    patent = re.sub(r"\((\d+)\)", r"", patent)

    # Collapse whole whitespace runs; replacing only pairs would leave a
    # double space behind for any run of three characters
    patent = re.sub(r"\s{2,}", " ", patent)
    return patent


def remove_spaces(patent):
    """Drop any whitespace that sits directly in front of ``.``, ``,``, ``;`` or ``?``.

    Returns the text with each such punctuation mark re-attached to whatever
    precedes the removed whitespace.
    """
    return re.sub(r"\s+([.,;?])", lambda match: match.group(1), patent)


In [None]:
# Keep the untouched abstracts and build a formatted copy of each one.
original_abstracts = data["patent_abstract"].to_list()

formatted_abstracts = []
for abstract in original_abstracts:
    formatted_abstracts.append(format_patent(abstract))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer


def make_sequences(
    texts,
    training_length=50,
    lower=True,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    min_extra_tokens=20,
):
    """Turn a set of texts into fixed-length integer training sequences.

    A Keras ``Tokenizer`` is fitted on ``texts``. Only texts with more than
    ``training_length + min_extra_tokens`` tokens are kept. Each kept text is
    cut into sliding windows of ``training_length`` tokens, and the token
    that follows each window becomes its label.

    Args:
        texts: Re-iterable collection of strings.
        training_length: Number of tokens in each feature window.
        lower: Forwarded to ``Tokenizer(lower=...)``.
        filters: Forwarded to ``Tokenizer(filters=...)``.
        min_extra_tokens: How many tokens beyond ``training_length`` a text
            must exceed in order to be kept.

    Returns:
        A tuple ``(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, training_seq, labels)`` where ``num_words`` is
        ``len(word_idx) + 1``, ``new_texts``/``new_sequences`` are the kept
        texts and their token sequences, and ``training_seq``/``labels`` are
        the parallel lists of feature windows and next-token labels.
    """

    # Create the tokenizer object and train on texts
    tokenizer = Tokenizer(lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)

    # Look-up dictionaries in both directions, plus vocabulary statistics
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f"There are {num_words} unique words.")

    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)

    # Keep only texts that are strictly longer than the minimum length
    min_length = training_length + min_extra_tokens
    new_texts = []
    new_sequences = []
    for text, seq in zip(texts, sequences):
        if len(seq) > min_length:
            new_texts.append(text)
            new_sequences.append(seq)

    training_seq = []
    labels = []

    # Slide a window over every kept sequence: the window is the feature,
    # the token right after it is the label
    for seq in new_sequences:
        for end in range(training_length, len(seq)):
            training_seq.append(seq[end - training_length : end])
            labels.append(seq[end])

    print(f"There are {len(training_seq)} training sequences.")

    # Return everything needed for setting up the model
    return (
        word_idx,
        idx_word,
        num_words,
        word_counts,
        new_texts,
        new_sequences,
        training_seq,
        labels,
    )


In [None]:
# Characters the tokenizer strips. Compared with the make_sequences default,
# this set omits , - . ; ? so those characters are not filtered out.
filters = '!"#$%&()*+/:<=>@[\\]^_`{|}~\t\n'

(
    word_idx, idx_word, num_words, word_counts,
    abstracts, sequences, features, labels,
) = make_sequences(
    texts=formatted_abstracts,
    training_length=TRAINING_LENGTH,
    lower=True,
    filters=filters,
)


In [None]:
from sklearn.utils import shuffle


def create_train_valid(
    features, labels, num_words, train_fraction=TRAIN_FRACTION
):
    """Create training and validation features and one-hot labels.

    Features and labels are shuffled together (seeded with the module-level
    ``RANDOM_STATE``) and split at ``int(train_fraction * len(labels))``.

    Args:
        features: Sequence of equal-length integer token windows.
        labels: Sequence of integer word indices, parallel to ``features``.
        num_words: Width of the one-hot encoding; every label must be
            smaller than this value.
        train_fraction: Fraction of the examples assigned to training.

    Returns:
        ``(X_train, X_valid, y_train, y_valid)``. The ``X`` arrays are built
        with ``np.array`` from the feature windows; the ``y`` arrays are
        ``int8`` one-hot matrices of shape ``(n_examples, num_words)``.
    """

    def one_hot(word_indices):
        # int8 keeps the (n_examples, num_words) matrix as small as possible
        encoded = np.zeros((len(word_indices), num_words), dtype=np.int8)
        rows = np.arange(len(word_indices))
        # Explicit integer dtype so an empty split still indexes correctly
        cols = np.asarray(word_indices, dtype=np.intp)
        encoded[rows, cols] = 1
        return encoded

    # Randomly shuffle features and labels in unison
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Decide on number of samples for training
    train_end = int(train_fraction * len(labels))

    # Convert each split to an array exactly once
    X_train = np.array(features[:train_end])
    X_valid = np.array(features[train_end:])

    y_train = one_hot(labels[:train_end])
    y_valid = one_hot(labels[train_end:])

    return X_train, X_valid, y_train, y_valid


In [None]:
# Shuffle, split and one-hot encode using the default TRAIN_FRACTION.
X_train, X_valid, y_train, y_valid = create_train_valid(
    features=features, labels=labels, num_words=num_words
)


In [None]:
# Display the shapes of the training features and the one-hot training labels.
X_train.shape, y_train.shape

In [None]:
# Load the embedding file as a 2-D array of strings. comments=None stops
# tokens containing "#" from being read as comment markers, and the explicit
# encoding avoids depending on the platform/NumPy default.
glove = np.loadtxt(
    "data/glove.6B.100d.txt",
    dtype='str',
    comments=None,
    encoding="utf-8",
)
glove.shape

In [None]:
import sys

def check_sizes(gb_min=1):
    """Print every module-level object larger than ``gb_min`` gigabytes.

    Sizes are those reported by ``sys.getsizeof`` divided by 1e9.

    Args:
        gb_min: Threshold in GB; only objects strictly above it are printed.
    """
    # Walk a snapshot of the namespace and read values directly instead of
    # evaluating each name as code
    for name, value in list(globals().items()):
        size = sys.getsizeof(value) / 1e9
        if size > gb_min:
            print(f'Object: {name:10}\tSize: {size} GB.')


check_sizes(gb_min=1)

In [None]:
# Column 0 holds the token; the remaining columns are cast to float vectors.
words = glove[:, 0]
vectors = glove[:, 1:].astype('float')

In [None]:
# Map each pre-trained token to its vector.
word_lookup = dict(zip(words, vectors))

# One row per vocabulary index; rows stay all-zero when no vector is found.
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

not_found = 0

for idx, word in idx_word.items():
    vector = word_lookup.get(word)

    if vector is None:
        # Vocabulary word without a pre-trained vector
        not_found += 1
    else:
        embedding_matrix[idx, :] = vector

print(f'There were {not_found} words without pre-trained embeddings.')

In [None]:
# Scale every row to unit length. Rows whose norm is zero are excluded from
# the division and come out as zeros, so no 0/0 is ever evaluated.
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
embedding_matrix = np.divide(
    embedding_matrix,
    row_norms,
    out=np.zeros_like(embedding_matrix),
    where=row_norms > 0,
)

In [None]:
def find_closest(query, embedding_matrix, word_idx, idx_word, n=10):
    """Print the ``n`` vocabulary words most similar to ``query``.

    Similarity is the dot product between the query's row of
    ``embedding_matrix`` and every other row; this equals cosine similarity
    only when the rows are unit-length. Row indices with no entry in
    ``idx_word`` are skipped instead of raising ``KeyError``.

    Args:
        query: Word to look up.
        embedding_matrix: 2-D array with one embedding per row.
        word_idx: Mapping from word to row index.
        idx_word: Mapping from row index to word.
        n: Maximum number of neighbours to print.

    Returns:
        None. Results, or a message explaining why there are none, are
        printed.
    """
    idx = word_idx.get(query, None)
    # Handle case where query is not in vocab
    if idx is None:
        print(f"{query} not found in vocab.")
        return

    vec = embedding_matrix[idx]
    # Handle case where word doesn't have an embedding
    if np.all(vec == 0):
        print(f"{query} has no pre-trained embedding.")
        return

    # Similarity between the query vector and every row
    dists = np.dot(embedding_matrix, vec)

    # Walk the rows from most to least similar, keeping the first n that
    # correspond to an actual word
    closest = []
    sorted_dists = []
    for i in np.argsort(dists)[::-1]:
        if len(closest) >= n:
            break
        word = idx_word.get(int(i))
        if word is None:
            continue
        closest.append(word)
        sorted_dists.append(dists[i])

    print(f"Query: {query}\n")
    # Print out each word with its similarity score
    for word, dist in zip(closest, sorted_dists):
        print(f"Word: {word:15} Cosine Similarity: {round(dist, 4)}")


In [None]:
# Spot-check the embedding matrix by printing the nearest neighbours of 'the'.
find_closest('the', embedding_matrix, word_idx, idx_word)

In [None]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    LSTM,
    Dense,
    Dropout,
    Embedding,
    Masking,
    Bidirectional,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant

from tensorflow.keras.utils import plot_model


In [None]:
def make_word_level_model(
    num_words,
    embedding_matrix,
    lstm_cells=64,
    trainable=False,
    lstm_layers=1,
    bi_direc=False,
):
    """Build and compile a word-level LSTM next-word classifier.

    The network is: an Embedding initialised from ``embedding_matrix``; a
    Masking layer when the embedding is frozen; ``lstm_layers`` LSTM layers,
    the last of which is optionally Bidirectional; a 128-unit ReLU Dense
    layer; Dropout(0.5); and a softmax over ``num_words`` classes. The model
    is compiled with the Adam optimizer, categorical cross-entropy loss and
    an accuracy metric.

    Args:
        num_words: Vocabulary size (embedding input dim and output classes).
        embedding_matrix: 2-D array of initial embedding weights.
        lstm_cells: Units in every LSTM layer.
        trainable: If falsy, the embedding is frozen and zero-masked;
            otherwise it is trainable and not masked.
        lstm_layers: Total number of LSTM layers.
        bi_direc: Wrap the final LSTM layer in ``Bidirectional``.

    Returns:
        The compiled ``Sequential`` model.
    """

    def new_lstm(return_sequences):
        # All recurrent layers share the same size and dropout settings
        return LSTM(
            lstm_cells,
            return_sequences=return_sequences,
            dropout=0.1,
            recurrent_dropout=0.1,
        )

    frozen = not trainable
    model = Sequential()

    # Embedding lookup; index 0 is masked only in the frozen configuration
    model.add(
        Embedding(
            input_dim=num_words,
            output_dim=embedding_matrix.shape[1],
            embeddings_initializer=Constant(embedding_matrix),
            trainable=not frozen,
            mask_zero=frozen,
        )
    )
    if frozen:
        model.add(Masking())

    # Intermediate LSTM layers emit full sequences for the next layer
    if lstm_layers > 1:
        for _ in range(lstm_layers - 1):
            model.add(new_lstm(return_sequences=True))

    # The last recurrent layer emits only its final state
    final_lstm = new_lstm(return_sequences=False)
    model.add(Bidirectional(final_lstm) if bi_direc else final_lstm)

    # Fully connected head with dropout for regularization
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))

    # One probability per vocabulary entry
    model.add(Dense(num_words, activation="softmax"))

    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# Single LSTM layer on top of the frozen pre-trained embedding matrix.
model = make_word_level_model(
    num_words=num_words,
    embedding_matrix=embedding_matrix,
    lstm_cells=LSTM_CELLS,
    lstm_layers=1,
    trainable=False,
)
model.summary()


In [None]:
# Draw the model's layer graph, with show_shapes enabled.
plot_model(model, show_shapes=True)

In [None]:
import os

# Import from tensorflow.keras, like every other Keras import in this
# notebook, so callbacks and model come from the same package.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# BATCH_SIZE is defined once in the configuration cell and is not repeated here.

model_name = 'pre-trained-rnn'
model_dir = 'models'


def make_callbacks(model_name, save=SAVE_MODEL):
    """Make the list of callbacks used for training.

    Args:
        model_name: Base file name (without extension) for the checkpoint.
        save: When true, a ``ModelCheckpoint`` writing
            ``{model_dir}/{model_name}.h5`` is added; ``model_dir`` is the
            module-level directory name.

    Returns:
        A list holding an ``EarlyStopping`` callback (monitoring ``val_loss``
        with patience 5) and, if requested, the checkpoint callback.
    """
    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    if save:
        # Make sure the checkpoint directory exists before training starts
        os.makedirs(model_dir, exist_ok=True)
        callbacks.append(
            ModelCheckpoint(
                f'{model_dir}/{model_name}.h5',
                save_best_only=True,
                save_weights_only=False))
    return callbacks


callbacks = make_callbacks(model_name)

In [None]:
# Fit on the training split and evaluate on the validation split each epoch;
# the callbacks list controls early stopping and optional checkpointing.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_valid, y_valid),
    callbacks=callbacks,
    verbose=VERBOSE,
)