## Import packages

In [None]:
import datetime
import numpy as np
import pandas as pd
from rdflib import Graph
from rdflib.namespace import RDF, DC, SKOS
from rdflib.namespace import Namespace
from rdflib.term import URIRef
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, GRU, Dropout, Reshape, Merge, Bidirectional
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.manifold import TSNE

## Initialize global variables

In [None]:
# File names read and written by the cells below.
GLOVE_FILE = 'glove.6B.300d.txt'  # pre-trained GloVe vectors loaded into embeddings_index
LCSH_SUBJECTS_FILE = 'lcsh_subjects.csv'  # tab-separated (concept, description) pairs extracted from the graph
CONCEPTS_FILE = 'concepts.csv'  # tab-separated concepts with all of their labels joined into one description
CONCEPT_EMB_VIZ_FILE = 'lcsh_emb_viz.csv'  # t-SNE x/y coordinates of the concept embeddings
MODEL_WEIGHTS_FILE = 'weights.h5'  # checkpoint file handed to ModelCheckpoint and reloaded after training
CONCEPT_EMBEDDINGS_FILE = 'lcsh_embeddings.txt'  # space-separated export of the learned concept embeddings

# Size limits for the model inputs.
MAX_SEQUENCE_LENGTH = 30  # maxlen for pad_sequences and input_length of the description embedding
MAX_NB_WORDS = 250000  # vocabulary cap handed to the Tokenizer
MAX_NB_CONCEPTS = 500000  # upper bound used when sizing the concept embedding table
EMBEDDING_DIM = 300  # width of both embedding layers; the GloVe file named above is the 300d variant

# Reproducibility and training settings.
RNG_SEED_1 = 1446557  # random_state for shuffling descriptions into negative examples
RNG_SEED_2 = 1337603  # random_state for shuffling the combined training set
VALIDATION_SPLIT = 0.1  # passed to model.fit as validation_split

## Download and clean up LCSH data

In [None]:
# Download the LCSH SKOS N-Triples dump, unpack it, and rewrite "@EN ." to "@en ." in place.
# NOTE(review): `sed -i ''` (empty backup suffix as a separate argument) is the BSD/macOS
# form; GNU sed expects `-i` with no separate argument -- confirm the target platform.
! wget -q -O authoritiessubjects.nt.skos.gz http://id.loc.gov/static/data/authoritiessubjects.nt.skos.gz
! gunzip -q -f authoritiessubjects.nt.skos.gz
! sed -i '' 's/@EN \./@en \./g' authoritiessubjects.nt.skos

## Import LCSH linked data into in-memory graph

In [None]:
# Load the whole N-Triples dump into an in-memory rdflib graph, printing
# wall-clock timestamps before and after the parse.
parse_started = datetime.datetime.now()
g = Graph()
print("Started", parse_started)
g.parse("authoritiessubjects.nt.skos", format="nt")
parse_ended = datetime.datetime.now()
print("  Ended", parse_ended)

## Extract concepts and labels from graph

In [None]:
# Build one [concept URI, label text] row for every preferred and alternative
# label of every subject typed as skos:Concept in the graph.
print("Started", datetime.datetime.now())
concept_labels = []
for concept in g.subjects(RDF.type, SKOS.Concept):
    # Only subjects identified by a URI are kept; anything that is not a
    # URIRef is skipped.
    if not isinstance(concept, URIRef):
        continue
    # For each concept, preferred labels are emitted before alternative ones.
    for label_predicate in (SKOS.prefLabel, SKOS.altLabel):
        for label in g.objects(concept, label_predicate):
            concept_labels.append([str(concept), str(label)])
print("  Ended", datetime.datetime.now())

## Save concepts and labels to file

In [None]:
# Turn the extracted [concept, label] rows into a two-column table and write it
# out as a tab-separated file so the graph does not have to be parsed again.
df = pd.DataFrame(concept_labels, columns=['concept', 'description'])
df.to_csv(LCSH_SUBJECTS_FILE, sep='\t')

## Load concepts into training dataset

In [None]:
# Reload the (concept, description) pairs from the tab-separated file. Both
# columns hold literal text, so pandas' default NA detection is switched off:
# otherwise a label such as "NA", "null" or "None" would come back as a float
# NaN and break the string operations applied to 'description' further down.
df = pd.read_csv(LCSH_SUBJECTS_FILE, sep='\t', usecols=['concept', 'description'],
                 keep_default_na=False)

In [None]:
# Spot-check: show every row whose label mentions "Tobruk". na=False makes a
# missing description count as "no match" instead of putting NaN into the
# boolean mask, which would make the row selection raise.
df[df['description'].str.contains('Tobruk', na=False)]

In [None]:
# Negative examples: keep the concept column in its original order and pair it
# with the description column taken from a seeded shuffle of the same table.
shuffled_rows = df.sample(frac=1, random_state=RNG_SEED_1)
neg_df = pd.DataFrame({
    'concept': df['concept'].values,
    'description': shuffled_rows['description'].values,
})

In [None]:
# Tag the original pairs with label 1 and the shuffled pairs with label 0,
# stack both tables, then shuffle the combined rows with a fixed seed.
neg_df['label'] = 0
df['label'] = 1
labelled_pairs = pd.concat([df, neg_df])
training_data = labelled_pairs.sample(frac=1, random_state=RNG_SEED_2)

In [None]:
# Preview the first ten rows of the shuffled training set.
training_data.head(10)

In [None]:
# Collapse the table to one row per concept, with all of that concept's labels
# joined into a single comma-separated description string.
grouped = df.groupby(['concept'])['description'].apply(', '.join)
concepts = pd.DataFrame({
    'concept': grouped.index,
    'description': grouped.values,
})

print('Concepts: %d' % len(concepts))

In [None]:
# Write the one-row-per-concept table to a tab-separated file.
concepts.to_csv(CONCEPTS_FILE, sep='\t')

In [None]:
# Reload the per-concept table. Both columns hold literal text, so pandas'
# default NA detection is switched off; otherwise a description consisting only
# of a token such as "NA" or "null" would be read back as a float NaN.
concepts = pd.read_csv(CONCEPTS_FILE, sep='\t', usecols=['concept', 'description'],
                       keep_default_na=False)

In [None]:
# Preview the first ten concepts with their joined descriptions.
concepts.head(10)

## Build concept and description word indexes

In [None]:
# Map every concept URI to a 1-based integer id, following the row order of
# `concepts`. Id 0 is deliberately left unused: the concept embedding table is
# created with `nb_concepts + 1` rows, and the export and t-SNE cells drop row 0
# (`weights[1:]`, `fit[1:]`) before lining the vectors up with
# `concepts['concept']`. Starting the ids at 0 would shift every exported
# vector by one concept.
concept_index = {
    concept: concept_id
    for concept_id, concept in enumerate(concepts['concept'], start=1)
}
nb_concepts = min(MAX_NB_CONCEPTS, len(concept_index))

print("Concepts in index: %d" % nb_concepts)

In [None]:
# Fit the tokenizer on every training description and convert each description
# into its sequence of integer word ids.
descriptions = training_data['description'].values
description_texts = descriptions.tolist()  # converted once, used for both fit and transform
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(description_texts)
desc_word_sequences = tokenizer.texts_to_sequences(description_texts)
word_index = tokenizer.word_index
nb_words = min(MAX_NB_WORDS, len(word_index))

# nb_words is already an integer count, so it is formatted directly; wrapping it
# in len() raises TypeError.
print("Words in index: %d" % nb_words)

## Download GloVe word embeddings

In [None]:
# Download the GloVe 6B archive, unpack it, then delete the archive and the
# 50d/100d/200d files (the 300d file is not in the removal list).
! wget -q http://nlp.stanford.edu/data/glove.6B.zip
! unzip -q -o glove.6B.zip
! rm -f glove.6B.zip glove.6B.50d.txt glove.6B.100d.txt glove.6B.200d.txt

## Load GloVe word embeddings

In [None]:
# Read the GloVe text file into a dict: word -> float32 vector. Each line is
# split on whitespace; the first field is the word, the rest are its components.
embeddings_index = {}
# The encoding is stated explicitly so that reading the file does not depend on
# the platform's default locale encoding.
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

## Prepare word embedding matrix

In [None]:
# Row r of the matrix receives the GloVe vector of the word whose tokenizer
# index is r. Rows for words beyond MAX_NB_WORDS are never written, and words
# without a GloVe entry keep their all-zero row.
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    if row <= MAX_NB_WORDS:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            word_embedding_matrix[row] = glove_vector

# Count the rows whose components sum to exactly zero.
row_sums = np.sum(word_embedding_matrix, axis=1)
print('Null word embeddings: %d' % np.sum(row_sums == 0))

## Prepare training data tensors

In [None]:
# Concept input: the integer id of each training row's concept.
concept_ids = [concept_index[uri] for uri in training_data['concept'].values]
c_data = np.array(concept_ids)

# Description input: word-id sequences brought to a common length.
d_data = pad_sequences(desc_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Two-column targets: label 0 becomes [0, 1], every other label becomes [1, 0].
one_hot_rows = []
for flag in training_data['label'].values:
    one_hot_rows.append([0, 1] if flag == 0 else [1, 0])
labels = np.array(one_hot_rows)

print('Shape of concept tensor:', c_data.shape)
print('Shape of description tensor:', d_data.shape)
print('Shape of label tensor:', labels.shape)

## Define model

In [None]:
nb_concepts = min(MAX_NB_CONCEPTS, len(concept_index))

# Concept branch: a single concept id is looked up in a trainable embedding
# table and reshaped to a flat EMBEDDING_DIM vector.
P = Sequential()
P.add(Embedding(input_dim=nb_concepts + 1, output_dim=EMBEDDING_DIM, input_length=1))
P.add(Reshape((EMBEDDING_DIM,)))

# Description branch: word ids go through a non-trainable embedding initialised
# from the GloVe matrix, then a bidirectional GRU whose two directions are summed.
Q = Sequential()
Q.add(Embedding(input_dim=nb_words + 1,
                output_dim=EMBEDDING_DIM,
                weights=[word_embedding_matrix],
                input_length=MAX_SEQUENCE_LENGTH,
                trainable=False))
description_encoder = GRU(EMBEDDING_DIM, dropout_W=0.5, dropout_U=0.5)
Q.add(Bidirectional(description_encoder, merge_mode='sum'))

# Classifier head: concatenate both branches, then dropout / dense / dropout
# and a two-way softmax output.
model = Sequential()
model.add(Merge([P, Q], mode='concat'))
for head_layer in (Dropout(0.5),
                   Dense(EMBEDDING_DIM*2, activation='relu'),
                   Dropout(0.5),
                   Dense(2, activation='softmax')):
    model.add(head_layer)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])

## Train model

In [None]:
# Write the weights to MODEL_WEIGHTS_FILE whenever validation categorical
# accuracy improves (save_best_only=True).
best_checkpoint = ModelCheckpoint(MODEL_WEIGHTS_FILE,
                                  monitor='val_categorical_accuracy',
                                  save_best_only=True)
callbacks = [best_checkpoint]

print("Starting training at", datetime.datetime.now())

# Train on the (concept, description) input pair against the two-column labels.
fit_options = dict(nb_epoch=40,
                   validation_split=VALIDATION_SPLIT,
                   verbose=1,
                   callbacks=callbacks)
history = model.fit([c_data, d_data], labels, **fit_options)

print("Training ended at", datetime.datetime.now())

## Plot accuracy and loss

In [None]:
# Training vs. validation categorical accuracy per epoch (epochs shown 1-based).
acc = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                    'training': history.history['categorical_accuracy'],
                    'validation': history.history['val_categorical_accuracy']})
# figsize has to be an ordered (width, height) pair; a set literal has no
# defined element order. The frame is plotted directly -- the former
# `.ix[:, :]` selected every row and column and `.ix` no longer exists in
# current pandas.
ax = acc.plot(x='epoch', figsize=(7, 10), grid=True)
ax.set_ylabel("categorical accuracy")
ax.set_ylim([0.0,1.0]);

In [None]:
# Training vs. validation loss per epoch (epochs shown 1-based).
loss = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                     'training': history.history['loss'],
                     'validation': history.history['val_loss']})
# figsize has to be an ordered (width, height) pair; a set literal has no
# defined element order. The frame is plotted directly -- the former
# `.ix[:, :]` selected every row and column and `.ix` no longer exists in
# current pandas.
ax = loss.plot(x='epoch', figsize=(7, 10), grid=True)
ax.set_ylabel("loss")
ax.set_ylim([0.0,2.0]);

## Extract concept embeddings from the best model checkpoint and save them to file

In [None]:
# Restore the checkpointed weights and read the weight matrix of the first
# layer of the concept branch.
model.load_weights(MODEL_WEIGHTS_FILE)
concept_embedding_layer = P.layers[0]
weights = concept_embedding_layer.get_weights()[0]

# Put the concept URIs next to the matrix with its first row removed.
# NOTE(review): dropping row 0 assumes concept ids start at 1 -- verify against
# how concept_index is built.
vector_columns = pd.DataFrame(weights[1:])
embeddings = pd.concat([concepts['concept'], vector_columns], axis=1)

# One concept per line, space-separated, without header or index column.
embeddings.to_csv(CONCEPT_EMBEDDINGS_FILE, sep=' ', header=False, index=False)

## Plot the t-SNE visualization and save it to file

In [None]:
# Project the concept embedding matrix to two dimensions with t-SNE.
tsne2 = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
fit = tsne2.fit_transform(weights)
# The first projected row is left out of the table.
# NOTE(review): this assumes row 0 of `weights` does not belong to a real
# concept -- verify against how concept ids are assigned.
visualization = pd.DataFrame(fit[1:], columns=['x', 'y'])
# figsize has to be an ordered (width, height) pair; a set literal has no
# defined element order.
visualization.plot('x', 'y', kind='scatter', figsize=(7, 10), grid=True);

visualization.to_csv(CONCEPT_EMB_VIZ_FILE)