In [1]:
import errno
import os
import tensorflow as tf
import urllib



def maybe_download(url, local_dir, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    mkdir_p(local_dir)
    local_filename = url.split('/')[-1]
    local_filepath = os.path.join(local_dir, local_filename)
    if not os.path.exists(local_filepath):
        print("Downloading %s..." % local_filename)
        local_filename, _ = urllib.request.urlretrieve(url,
                                                       local_filepath)
        print("Finished!")
    statinfo = os.stat(local_filepath)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', local_filepath)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + local_filename +
                        '. Can you get to it with a browser?')
    return local_filename


def mkdir_p(path):
    """From https://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python"""
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def get_session():
    """Create a session that dynamically allocates memory."""
    # See: https://www.tensorflow.org/tutorials/using_gpu#allowing_gpu_memory_growth
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    return session

In [2]:
import zipfile

import numpy as np

maybe_download('http://nlp.stanford.edu/data/glove.6B.zip', 'datasets', 862182613)
if not os.path.exists(os.path.join("datasets", "glove.6B.50d.txt")):
    with zipfile.ZipFile(os.path.join("datasets", "glove.6B.zip"), "r") as zip_ref:
        zip_ref.extractall("datasets")
    for f in ["glove.6B.100d.txt", "glove.6B.300d.txt", "glove.6B.200d.txt"]:
        os.remove(os.path.join('datasets', f))

Found and verified datasets/glove.6B.zip


In [3]:
import os
import numpy as np
import tensorflow as tf

def load_embeddings(filename):
    vocab = []
    embed = []
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f.readlines():
            row = line.strip().split(' ')
            vocab.append(row[0])
            embed.append(row[1:])
    embed = np.asarray(embed)
    return vocab, embed


# Load the GloVe vectors into numpy
glove_filepath = os.path.join('datasets', 'glove.6B.50d.txt')
vocab, embed = load_embeddings(glove_filepath)
vocab_size = len(vocab)
embed_dim = len(embed[0])
assert vocab_size > 0, "The vocabulary shouldn't be empty; did you download the GloVe weights?"
print('Loaded %d %d-dimensional embeddings.' % (vocab_size, embed_dim))

# word2id = {}
# id2word = vocab
# for i, w in enumerate(id2word):
#     word2id[w] = i

# Ops to load the embeddings into TensorFlow
# embedding = tf.Variable(tf.constant(0.0, shape=[vocab_size, embed_dim]),
#                         trainable=False, name="embedding")
# embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embed_dim])
# embedding_init = embedding.assign(embedding_placeholder)

Loaded 400000 50-dimensional embeddings.


TODO pick general categories later.

In [4]:
def get_embed_for_words(words):
    word_embeds = []
    fin_words = []
    for i in range(400000):
        for word in words:
            if word == vocab[i]: 
                print('Word {} is in the dataset'.format(vocab[i]))
                fin_words.append(word)
                word_embeds.append(embed[i])
                words.remove(vocab[i])
    return fin_words, word_embeds

In [9]:
general_categories = ['police', 'budget', 'meeting', 'election']
category_words, category_embeds = get_embed_for_words(general_categories)

Word police is in the dataset
Word meeting is in the dataset
Word election is in the dataset
Word budget is in the dataset


In [12]:
category_embeds = np.vstack(tuple(category_embeds))

In [13]:
def load_with_repeats(filename): 
    with open(filename, 'r') as f:
        all_cat = f.readline().split(',')
    return all_cat

In [16]:
tag_phrases = load_with_repeats('../categories.txt')

In [17]:
tag_words = []
for phrase in tag_phrases: 
    for x in phrase.strip().split(' '):
        tag_words.append(x)

In [19]:
tag_words, tag_embeddings = get_embed_for_words(tag_words)

Word police is in the dataset
Word police is in the dataset
Word police is in the dataset
Word police is in the dataset
Word business is in the dataset
Word law is in the dataset
Word service is in the dataset
Word plan is in the dataset
Word water is in the dataset
Word age is in the dataset
Word age is in the dataset
Word population is in the dataset
Word community is in the dataset
Word race is in the dataset
Word race is in the dataset
Word action is in the dataset
Word education is in the dataset
Word energy is in the dataset
Word energy is in the dataset
Word reports is in the dataset
Word violence is in the dataset
Word budget is in the dataset
Word stop is in the dataset
Word management is in the dataset
Word cases is in the dataset
Word gas is in the dataset
Word gas is in the dataset
Word gas is in the dataset
Word justice is in the dataset
Word data is in the dataset
Word data is in the dataset
Word green is in the dataset
Word green is in the dataset
Word income is in the d

In [21]:
tag_embed_matrix = np.vstack(tuple(tag_embeddings)).T

array([['0.49725', '0.49725', '0.49725', ..., '0.022897', '0.16528',
        '-0.2446'],
       ['-1.1949', '-1.1949', '-1.1949', ..., '0.49796', '-0.30625',
        '-0.36988'],
       ['0.37137', '0.37137', '0.37137', ..., '0.91822', '0.60297',
        '0.079561'],
       ..., 
       ['0.12378', '0.12378', '0.12378', ..., '1.4032', '0.17151',
        '0.10161'],
       ['0.85397', '0.85397', '0.85397', ..., '0.083728', '-0.36362',
        '0.90325'],
       ['-0.65035', '-0.65035', '-0.65035', ..., '0.30038', '-0.27778',
        '-0.041727']],
      dtype='<U11')

In [15]:
# np.matmul(category_embeds, tag_embed_matrix)

In [22]:
np.shape(tag_embed_matrix)

(50, 91)

In [23]:
np.shape(category_embeds)

(4, 50)

In [24]:
category_embeds = category_embeds.astype(np.float32)
tag_embed_matrix = tag_embed_matrix.astype(np.float32)

In [25]:
np.matmul(category_embeds, tag_embed_matrix).shape

(4, 91)

In [28]:
normalized_tag_embeds = (tag_embed_matrix * (1 / np.linalg.norm(tag_embed_matrix, axis=0)))

In [29]:
normalized_category_embeds = category_embeds * np.expand_dims(1 / np.linalg.norm(category_embeds, axis=1), -1)


In [22]:
sim_matrix = np.matmul(normalized_category_embeds, normalized_tag_embeds)

In [23]:
normalized_category_embeds

array([[ 0.08112953, -0.19495562,  0.0605914 , -0.01332368,  0.11276393,
        -0.11418013, -0.04196873,  0.09696385,  0.0097858 , -0.24457146,
        -0.01162   , -0.16402115, -0.12048285, -0.06687456,  0.070278  ,
        -0.07628705, -0.05954883,  0.04841504, -0.10242144, -0.06782902,
         0.04668558,  0.1911867 , -0.03510474,  0.10120429, -0.1390778 ,
        -0.40253952,  0.02351737,  0.01083603, -0.06186239, -0.10666024,
         0.44838646, -0.04708042, -0.0740877 , -0.22091378,  0.09550198,
         0.16498376,  0.11048137, -0.19102353,  0.0595113 ,  0.19392771,
        -0.04686994,  0.20055187,  0.09542856, -0.03365101,  0.14824229,
        -0.14414707, -0.13882163,  0.0201955 ,  0.13933069, -0.10610878],
       [ 0.12164826,  0.17819379, -0.1556326 ,  0.0909054 ,  0.10886825,
        -0.19266538, -0.08047238,  0.06235011, -0.11080075, -0.17644373,
        -0.08008447, -0.08139878, -0.09180169,  0.13137275,  0.10782318,
         0.08602189, -0.02524294, -0.11464981,  0.

In [27]:
masked_sim_matrix = sim_matrix * (sim_matrix >= 0)
masked_sim_matrix

array([[ 1.        ,  1.        ,  1.        ,  1.        ,  0.37782702,
         0.50888675,  0.51192677,  0.36461431,  0.33621722,  0.27145368,
         0.27145368,  0.3727001 ,  0.47038737,  0.28251207,  0.28251207,
         0.54297966,  0.34372619,  0.20126359,  0.20126359,  0.66294223,
         0.57497567,  0.22185263,  0.62788248,  0.3836084 ,  0.55254734,
         0.39194855,  0.39194855,  0.39194855,  0.5385704 ,  0.34456822,
         0.34456822,  0.32892108,  0.32892108,  0.19693632,  0.30481961,
         0.21267366,  0.21267366,  0.21267366,  0.44482815,  0.62654668,
         0.62654668,  0.62654668,  0.38475591,  0.38475591,  0.56387287,
         0.68423003,  0.37948063,  0.5721826 ,  0.33391023,  0.33391023,
         0.58605242,  0.58605242,  0.12189772,  0.32167047,  0.38043648,
         0.43361136,  0.41414142,  0.68039668,  0.68039668,  0.27552515,
         0.27552515,  0.27552515,  0.19996311,  0.68338013,  0.25361446,
         0.38162985,  0.56028682,  0.65576679,  0.3

In [28]:
np.sum(masked_sim_matrix)/(masked_sim_matrix.shape[0] * masked_sim_matrix.shape[1])

0.34174143613039792