## Load data

In [1]:
import json
import os
import random
from collections import Counter, OrderedDict
from itertools import chain

import numpy as np
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model

Using TensorFlow backend.


In [2]:
# Path to the newline-delimited JSON dump; the loading cell parses one JSON record per line
raw_data = '../data/found_items_filtered.ndjson'

In [3]:
# Each non-empty line of the NDJSON file is one JSON-encoded article record.
# Decode explicitly as UTF-8 instead of relying on the platform default encoding
# (the records contain accented text), and skip blank lines such as a trailing
# newline, which json.loads would reject.
with open(raw_data, 'r', encoding = 'utf-8') as fin:
    books = [json.loads(line) for line in fin if line.strip()]

# Remove non-book articles: records whose title contains 'Wikipedia:' are set aside
books_with_wikipedia = [book for book in books if 'Wikipedia:' in book[0]]
books = [book for book in books if 'Wikipedia:' not in book[0]]

print(f'Found {len(books)} books.')

Found 4661 books.


In [4]:
# Peek at one record; the two link lists are truncated to their first 5 entries for display
n = 21
books[n][0], books[n][1], books[n][2][:5], books[n][3][:5], books[n][4], books[n][5]
# Record layout, by position:
# 0 title
# 1 information from the Infobox template
# 2 internal wikipedia links
# 3 external links
# 4 date of last edit
# 5 number of characters in the article (a rough estimate of the length of the article)

('WinAPE',
 {'nom': 'WinAPE',
  'développeur': 'Richard Wilson',
  'environnement': 'MS Windows',
  'langues': 'anglais',
  'type': 'Émulateur',
  'licence': 'gratuiciel',
  'site web': 'www.winape.net'},
 ['Microsoft Windows',
  'Émulateur',
  'gratuiciel',
  'Microsoft Windows',
  'Amstrad CPC'],
 ['http://www.winape.net',
  'http://www.clubic.com/telecharger-fiche14853-winape.html',
  'http://cpcrulez.fr/emulateurs_download-WIN-WINAPE.htm',
  'http://cpcrulez.fr/emulateurs_download-WIN-WINAPE.htm',
  'http://www.dosgamers.com/amstrad-cpc/winape-amstrad-emulator'],
 '2019-01-06T14:56:41Z',
 4329)

In [5]:
# Forward map: article title -> position of the record in `books`
book_index = {record[0]: position for position, record in enumerate(books)}
# Reverse map, derived from the forward one: position -> article title
index_book = dict(zip(book_index.values(), book_index.keys()))

## Exploring internal links (wikilinks)

In [6]:
# Flatten the per-article wikilink lists (field 2) into one long list, duplicates included
wikilinks = list(chain.from_iterable(book[2] for book in books))
print(f"There are {len(set(wikilinks))} unique wikilinks.")

There are 37751 unique wikilinks.


In [7]:
# Keep only the wikilinks whose target is itself the title of a loaded article
wikilinks_other_books = [target for target in wikilinks if target in book_index]
print(f"There are {len(set(wikilinks_other_books))} unique wikilinks to other pages.")

There are 3073 unique wikilinks to other pages.


### Most Linked-to Articles
Let's take a look at which pages are most linked to by books on Wikipedia.

We'll make a utility function that takes in a list and returns a sorted ordered dictionary of the counts of the items in the list. The collections module has a number of useful functions for dealing with groups of objects.

In [8]:
def count_items(l):
    """Tally the objects in `l` and return the tallies as an ordered dictionary.

    Entries run from the most frequent item to the least frequent one; items
    with equal counts keep the order in which they were first seen in `l`.
    """
    # most_common() without an argument returns every (item, count) pair,
    # sorted by descending count
    return OrderedDict(Counter(l).most_common())

In [9]:
# Find set of wikilinks for each book and convert to a flattened list
unique_wikilinks = list(chain(*[list(set(book[2])) for book in books]))
wikilinks = [link.lower() for link in unique_wikilinks]

wikilink_counts = count_items(wikilinks)
list(wikilink_counts.items())[:10]

[('logiciel libre', 1450),
 ('microsoft windows', 1395),
 ('linux', 1289),
 ('catégorie:logiciel libre sous licence gpl', 1036),
 ('licence publique générale gnu', 949),
 ('catégorie:logiciel pour windows', 880),
 ('logiciel propriétaire', 691),
 ('mac os x', 657),
 ('open source', 637),
 ('unix', 617)]

Optional: remove some links (those that are too common or too rare)

In [10]:
# Optional pruning hook (disabled): put entries in `to_remove` to drop them
# from both `wikilinks` and `wikilink_counts`.
# to_remove = []
# for t in to_remove:
#     wikilinks.remove(t)
#     _ = wikilink_counts.pop(t)

# To keep only the links that occur at least 4 times, use this instead:
# links = [t[0] for t in wikilink_counts.items() if t[1] >= 4]
# Iterating the ordered dictionary yields its keys, i.e. every link in count order
links = list(wikilink_counts)
print(len(links))

34018


In [11]:
# Find set of page wikilinks for each page (both a page and a link)
unique_wikilinks_books = list(chain(*[list(set(link for link in book[2] if link in book_index.keys())) for book in books]))

# Count the number of pages linked to by other pages
wikilink_book_counts = count_items(unique_wikilinks_books)
list(wikilink_book_counts.items())[:10]

[('Microsoft Windows', 1392),
 ('Linux', 1280),
 ('C++', 468),
 ('C (langage)', 436),
 ('Java (langage)', 408),
 ('Unix', 402),
 ('Android', 390),
 ('Python (langage)', 386),
 ('GNU', 377),
 ('JavaScript', 296)]

### Potential Additional Cleaning Step
If you want to try more data cleaning, one option would be to clean the link entities. For example, both the new york times and new york times are in the links. These could clearly be combined into a single entry because they link to the same exact page. This might require manual inspection of the links, and I decided not to do this because of the time involved!

### Wikilinks to Index
As with the books, we need to map the Wikilinks to integers. We'll also create the reverse mapping.

In [12]:
# Forward map: link name -> integer id (its position in `links`)
link_index = {name: position for position, name in enumerate(links)}
# Reverse map, derived from the forward one: integer id -> link name
index_link = dict(zip(link_index.values(), link_index.keys()))

### Supervised Machine Learning Task
Now that we have clean data, we'll move on to the second step: developing a supervised machine learning task to train an embedding neural network. As a reminder, we'll state the problem as: given a book title and a link, identify if the link is in the book's article.

### Build a Training Set
In order for any machine learning model to learn, it needs a training set. We are going to treat this as a supervised learning problem: given a pair (book, link), we want the neural network to learn to predict whether this is a legitimate pair - present in the data - or not.

To create a training set, for each book, we'll iterate through the wikilinks on the book page and record the book title and each link as a tuple. The final pairs list will consist of tuples of every (book, link) pairing on all of Wikipedia.

In [13]:
# One (book id, link id) tuple per wikilink occurrence on each book page.
# Links are lowercased to match the keys of `link_index`; links without an id are skipped.
pairs = [
    (book_index[book[0]], link_index[link.lower()])
    for book in books
    for link in book[2]
    # Dict membership is a hash lookup; testing against the `links` list would
    # rescan the whole list for every single link occurrence.
    if link.lower() in link_index
]

len(pairs), len(links), len(books)

(165570, 34018, 4661)

In [14]:
# Set of the positive pairs, used by generate_batch for fast membership tests when sampling negatives
pairs_set = set(pairs)

In [15]:
random.seed(100)

def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0, classification = False,
                   n_books = None, n_links = None, positive_pairs = None):
    """Endlessly yield shuffled training batches of positive and negative samples.

    Parameters
    ----------
    pairs : sequence of (book_id, link_id) tuples
        Positive examples. `n_positive` of them are drawn per batch with
        `random.sample`, so `n_positive` must not exceed `len(pairs)`.
    n_positive : int
        Number of positive examples in each batch.
    negative_ratio : int or float
        Number of negative examples per positive example.
    classification : bool
        When True negatives are labelled 0, otherwise -1. Positives are always 1.
    n_books, n_links : int, optional
        Exclusive upper bounds for the randomly drawn negative ids. Default to
        `len(books)` and `len(links)` taken from the notebook globals.
    positive_pairs : container of (book_id, link_id), optional
        Pairs that must never be emitted as negatives. Defaults to the
        notebook global `pairs_set`.

    Yields
    ------
    ({'book': ndarray, 'link': ndarray}, ndarray)
        Input ids keyed by input name, and the matching labels. All arrays
        have length `int(n_positive * (1 + negative_ratio))`.

    Notes
    -----
    Negative sampling retries until an unused pair is found, so it does not
    terminate if every possible (book, link) combination is a positive pair.
    """
    # Fall back to the notebook-level state when the caller does not override it
    if n_books is None:
        n_books = len(books)
    if n_links is None:
        n_links = len(links)
    if positive_pairs is None:
        positive_pairs = pairs_set

    # negative_ratio defaults to a float, but an array dimension must be an integer
    batch_size = int(n_positive * (1 + negative_ratio))

    # Adjust label based on task
    neg_label = 0 if classification else -1

    # This creates a generator
    while True:
        # Allocate a fresh array for every batch: the yielded columns are views of it,
        # so reusing one buffer would overwrite batches a consumer is still holding
        # (for example batches waiting in a prefetch queue).
        batch = np.zeros((batch_size, 3))

        # Randomly choose positive examples
        for idx, (book_id, link_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (book_id, link_id, 1)

        # Negatives start right after the positives (also correct when n_positive is 0)
        idx = n_positive

        # Add negative examples until the batch is full
        while idx < batch_size:
            random_book = random.randrange(n_books)
            random_link = random.randrange(n_links)

            # Only keep combinations that are not real (positive) pairs
            if (random_book, random_link) not in positive_pairs:
                batch[idx, :] = (random_book, random_link, neg_label)
                idx += 1

        # Shuffle rows so positives and negatives are interleaved
        np.random.shuffle(batch)
        yield {'book': batch[:, 0], 'link': batch[:, 1]}, batch[:, 2]

In [16]:
# Draw a single small batch: 2 positives plus 2 negatives per positive
x, y = next(generate_batch(pairs, n_positive = 2, negative_ratio = 2))

# Show a few example training pairs
# (the ids come back as floats from the numpy batch array; the dict lookups still
# work because a float that equals an int hashes and compares like that int)
for label, b_idx, l_idx in zip(y, x['book'], x['link']):
    print(f'Page: {index_book[b_idx]:30} Link: {index_link[l_idx]:40} Label: {label}')

Page: KHTML                          Link: qcow2                                    Label: -1.0
Page: OpenOffice.org                 Link: opentoonz                                Label: -1.0
Page: Qt                             Link: python (langage)                         Label: 1.0
Page: PowerDVD                       Link: ukui                                     Label: -1.0
Page: VirtueMart                     Link: récepteur nucléaire                      Label: -1.0
Page: Wpa supplicant                 Link: catégorie:logiciel pour linux            Label: 1.0


In [17]:
# Number of distinct links; book_embedding_model uses this as input_dim of the link embedding
len(link_index)

34018

In [18]:
def book_embedding_model(embedding_size = 50, classification = False):
    """Build and compile the two-input embedding network with the functional API.

    A book id and a link id are each embedded into `embedding_size` dimensions
    and compared with a normalized dot product. The network is trained to
    tell whether the link is present in the book's article.

    With `classification = False` the normalized dot product is the output and
    the model is compiled with mean squared error. With `classification = True`
    a single sigmoid unit is stacked on top and the model is compiled with
    binary cross-entropy plus an accuracy metric.

    The embedding table sizes come from the notebook globals `book_index`
    and `link_index`.
    """
    # One id per sample on each of the two inputs
    book_input = Input(name = 'book', shape = [1])
    link_input = Input(name = 'link', shape = [1])

    # Look up the vectors (each of shape (None, 1, embedding_size))
    book_vector = Embedding(
        name = 'book_embedding',
        input_dim = len(book_index),
        output_dim = embedding_size,
    )(book_input)
    link_vector = Embedding(
        name = 'link_embedding',
        input_dim = len(link_index),
        output_dim = embedding_size,
    )(link_input)

    # Normalized dot product along the embedding axis (shape (None, 1, 1)),
    # then flattened to one number per sample (shape (None, 1))
    similarity = Dot(name = 'dot_product', normalize = True, axes = 2)([book_vector, link_vector])
    similarity = Reshape(target_shape = [1])(similarity)

    # Regression variant: the similarity itself is the prediction
    if not classification:
        model = Model(inputs = [book_input, link_input], outputs = similarity)
        model.compile(optimizer = 'Adam', loss = 'mse')
        return model

    # Classification variant: squash the similarity through one sigmoid unit
    probability = Dense(1, activation = 'sigmoid')(similarity)
    model = Model(inputs = [book_input, link_input], outputs = probability)
    model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [19]:
# Directory where trained models are written (read as a global inside save_all)
models_dir = '../models/'

# Directory that receives the exported embedding and name TSV files (passed to save_all)
save_emb_dir = '../saved_embeddings/items/'

### Regression model

In [20]:
# Positive examples per training batch; the training cells also use it to derive steps_per_epoch
n_positive = 1024

In [21]:
# Regression variant: 50-dimensional embeddings, labels +1 / -1, compiled with MSE
model = book_embedding_model(50)
# negative_ratio = 2 -> every batch holds n_positive positives plus twice as many negatives
gen = generate_batch(pairs, n_positive, negative_ratio = 2)
# steps_per_epoch is chosen so that one epoch draws about len(pairs) positive samples
h = model.fit_generator(gen, epochs = 11, steps_per_epoch = len(pairs) // n_positive, verbose = 2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/11
 - 4s - loss: 0.9822
Epoch 2/11
 - 4s - loss: 0.9117
Epoch 3/11
 - 4s - loss: 0.7621
Epoch 4/11
 - 4s - loss: 0.5807
Epoch 5/11
 - 4s - loss: 0.5053
Epoch 6/11
 - 4s - loss: 0.4820
Epoch 7/11
 - 4s - loss: 0.4374
Epoch 8/11
 - 4s - loss: 0.4248
Epoch 9/11
 - 4s - loss: 0.4205
Epoch 10/11
 - 4s - loss: 0.4360
Epoch 11/11
 - 4s - loss: 0.4086


### Classification model

In [25]:
# Classification variant: sigmoid output, labels 1 / 0, compiled with binary cross-entropy
model_class = book_embedding_model(50, classification = True)
# NOTE: `gen` and `h` are rebound here, replacing the regression generator and training history
gen = generate_batch(pairs, n_positive, negative_ratio=2, classification = True)
h = model_class.fit_generator(gen, epochs = 20, steps_per_epoch= len(pairs) // n_positive, verbose = 2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
 - 3s - loss: 0.6730 - accuracy: 0.6025
Epoch 2/20
 - 4s - loss: 0.6165 - accuracy: 0.7668
Epoch 3/20
 - 4s - loss: 0.5263 - accuracy: 0.8479
Epoch 4/20
 - 4s - loss: 0.4450 - accuracy: 0.8485
Epoch 5/20
 - 3s - loss: 0.3808 - accuracy: 0.8723
Epoch 6/20
 - 3s - loss: 0.3687 - accuracy: 0.8733
Epoch 7/20
 - 4s - loss: 0.3459 - accuracy: 0.8862
Epoch 8/20
 - 5s - loss: 0.3402 - accuracy: 0.8930
Epoch 9/20
 - 4s - loss: 0.3558 - accuracy: 0.8822
Epoch 10/20
 - 4s - loss: 0.3213 - accuracy: 0.9063
Epoch 11/20
 - 4s - loss: 0.3256 - accuracy: 0.9013
Epoch 12/20
 - 4s - loss: 0.3319 - accuracy: 0.8966
Epoch 13/20
 - 3s - loss: 0.2683 - accuracy: 0.9346
Epoch 14/20
 - 4s - loss: 0.3330 - accuracy: 0.8975
Epoch 15/20
 - 3s - loss: 0.2954 - accuracy: 0.9179
Epoch 16/20
 - 3s - loss: 0.2462 - accuracy: 0.9431
Epoch 17/20
 - 4s - loss: 0.2796 - accuracy: 0.9248
Epoch 18/20
 - 3s - loss: 0.2830 - accuracy: 0.9220
Epoch 19/20
 - 3s - loss: 0.2479 - accuracy: 0.9401
Epoch 20/20
 - 3s - l

In [23]:
def extract_weights(name, model):
    """Return the first weight array of layer `name`, with every row scaled to unit length.

    `model` must provide `get_layer(name)`, and the returned layer must provide
    `get_weights()`; the first array of that list is treated as a 2-D matrix
    whose rows are divided by their own Euclidean norm.
    """
    matrix = model.get_layer(name).get_weights()[0]

    # keepdims keeps the norms as a column so they broadcast across each row
    row_norms = np.linalg.norm(matrix, axis = 1, keepdims = True)
    return matrix / row_norms

In [24]:
def save_all(model, model_name, save_emb_dir, pages, links):
    """Persist a trained model together with its embeddings and their row labels.

    Parameters
    ----------
    model : trained model
        Must provide `save(path)` and contain layers named 'book_embedding'
        and 'link_embedding' (read through `extract_weights`).
    model_name : str
        File name of the saved model inside the global `models_dir`.
    save_emb_dir : str
        Directory receiving `page_embedding.tsv`, `page_names.tsv`,
        `link_embedding.tsv` and `link_names.tsv`. Existing files with these
        names are overwritten.
    pages, links : iterable
        Row labels written next to the page / link embedding matrices.
    """
    # Create the target directories up front so the writes below cannot fail
    # just because the folder is missing (an empty string means the current directory)
    for directory in (models_dir, save_emb_dir):
        if directory:
            os.makedirs(directory, exist_ok = True)

    # os.path.join works whether or not the directory string ends with a separator
    # save model
    model.save(os.path.join(models_dir, model_name))
    # save page embedding
    np.savetxt(os.path.join(save_emb_dir, 'page_embedding.tsv'),
               extract_weights('book_embedding', model), delimiter='\t')
    # save page names
    save_as_tsv(os.path.join(save_emb_dir, 'page_names.tsv'), pages)
    # save links embedding
    np.savetxt(os.path.join(save_emb_dir, 'link_embedding.tsv'),
               extract_weights('link_embedding', model), delimiter='\t')
    # save links names
    save_as_tsv(os.path.join(save_emb_dir, 'link_names.tsv'), links)

def save_as_tsv(path, data):
    """Write the `str()` form of every element of `data` to `path`, one per line, as UTF-8."""
    with open(path, 'w', encoding = 'utf-8') as out:
        out.writelines(str(item) + '\n' for item in data)

In [26]:
# Row labels for the exported embeddings, in the insertion order of the index dictionaries.
# A separate name is used for the link labels so that the global `links` list,
# which generate_batch reads, is not rebound to a different kind of object.
pages = list(book_index)
link_names = list(link_index)

save_all(model, 'first_attempt_items.h5', save_emb_dir, pages, link_names)

In [27]:
# save_all always writes the same four TSV file names, so exporting the classification
# model into `save_emb_dir` would silently overwrite the regression embeddings saved
# there by the previous cell. Use a dedicated sub-directory instead.
# (The trailing '' keeps a path separator at the end of the directory string.)
class_emb_dir = os.path.join(save_emb_dir, 'classification', '')
os.makedirs(class_emb_dir, exist_ok = True)

# Row labels in the insertion order of the index dictionaries; the global `links`
# list is deliberately left untouched.
pages = list(book_index)
link_names = list(link_index)

save_all(model_class, 'first_attempt_class_items.h5', class_emb_dir, pages, link_names)