In [1]:
import errno
import os
import tensorflow as tf
import urllib



def maybe_download(url, local_dir, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    mkdir_p(local_dir)
    local_filename = url.split('/')[-1]
    local_filepath = os.path.join(local_dir, local_filename)
    if not os.path.exists(local_filepath):
        print("Downloading %s..." % local_filename)
        local_filename, _ = urllib.request.urlretrieve(url,
                                                       local_filepath)
        print("Finished!")
    statinfo = os.stat(local_filepath)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', local_filepath)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + local_filename +
                        '. Can you get to it with a browser?')
    return local_filename


def mkdir_p(path):
    """From https://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python"""
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def get_session():
    """Create a session that dynamically allocates memory."""
    # See: https://www.tensorflow.org/tutorials/using_gpu#allowing_gpu_memory_growth
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    return session

  from ._conv import register_converters as _register_converters


In [3]:
import zipfile

import numpy as np

# maybe_download('http://nlp.stanford.edu/data/glove.6B.zip', 'datasets', 862182613)
# if not os.path.exists(os.path.join("datasets", "glove.6B.50d.txt")):
#     with zipfile.ZipFile(os.path.join("datasets", "glove.6B.zip"), "r") as zip_ref:
#         zip_ref.extractall("datasets")
#     for f in ["glove.6B.100d.txt", "glove.6B.300d.txt", "glove.6B.200d.txt"]:
#         os.remove(os.path.join('datasets', f))

In [4]:
import os
import numpy as np
# import tensorflow as tf

def load_embeddings(filename):
    vocab = []
    embed = []
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f.readlines():
            row = line.strip().split(' ')
            vocab.append(row[0])
            embed.append(row[1:])
    embed = np.asarray(embed)
    return vocab, embed


# Load the GloVe vectors into numpy
# glove_filepath = os.path.join('datasets', 'glove.6B.50d.txt')
glove_filepath = '/Users/akhiljalan/Desktop/glove.6B.50d.txt'
vocab, embed = load_embeddings(glove_filepath)
vocab_size = len(vocab)
embed_dim = len(embed[0])
assert vocab_size > 0, "The vocabulary shouldn't be empty; did you download the GloVe weights?"
print('Loaded %d %d-dimensional embeddings.' % (vocab_size, embed_dim))

# word2id = {}
# id2word = vocab
# for i, w in enumerate(id2word):
#     word2id[w] = i

# Ops to load the embeddings into TensorFlow
# embedding = tf.Variable(tf.constant(0.0, shape=[vocab_size, embed_dim]),
#                         trainable=False, name="embedding")
# embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embed_dim])
# embedding_init = embedding.assign(embedding_placeholder)

Loaded 400000 50-dimensional embeddings.


TODO pick general categories later.

In [29]:
def get_embed_for_words(words):
    word_embeds = []
    final_words = []
    for i in range(400000):
        for word in words:
            if word == vocab[i]: 
#                 print('Word {} is in the dataset'.format(vocab[i]))
                final_words.append(word)
                word_embeds.append(embed[i])
                words.remove(vocab[i])
    return final_words, word_embeds

In [57]:
def matricize_embed_list(indiv_embeds):
        embeds = np.vstack(tuple(indiv_embeds)).astype(np.float32)
        row_norms = np.repeat(np.expand_dims(1 / np.linalg.norm(embeds, axis=1), 1), 
                              repeats=len(embeds[0]), axis=1)
        normalized_embeds = embeds * row_norms
        return normalized_embeds
    
def load_category_words(category_word_list):
    category_words, indiv_embeds = get_embed_for_words(category_word_list)
    normalized_cat_embeds = matricize_embed_list(indiv_embeds)
    return category_words, normalized_cat_embeds

In [38]:
words, embeds = load_category_words(['happy', 'sad', 'death'])

In [8]:
general_categories = ['police', 'budget', 'meeting', 'election']
# category_words, category_embeds = get_embed_for_words(general_categories)

In [41]:
def load_with_repeats(filename): 
    with open(filename, 'r') as f:
        all_cat = f.readline().split(',')
    return all_cat

In [42]:
tag_phrases = load_with_repeats('../categories.txt')

In [44]:
tag_words = []
for phrase in tag_phrases: 
    for x in phrase.strip().split(' '):
        tag_words.append(x)

In [46]:
tag_words, tag_embeddings = get_embed_for_words(tag_words)

In [58]:
tag_embeds = matricize_embed_list(tag_embeddings)

In [59]:
np.shape(tag_embeds)

(91, 50)

In [60]:
np.shape(embeds)

(3, 50)

In [25]:
np.matmul(category_embeds, tag_embed_matrix).shape

(4, 91)

In [64]:
def category_sim_scores(category_embeds, word_embeds): 
    sim_matrix = np.matmul(category_embeds, word_embeds)
    masked_sim_matrix = sim_matrix * (sim_matrix >= 0)
    return np.sum(masked_sim_matrix, axis = 1) / sim_matrix.shape[1]

In [66]:
def compute_similarites(category_word_list, tag_word_list): 
    # get embedding matrices
    cat_words, cat_embeds = get_embed_for_words(category_word_list)
    cat_embed_matrix = matricize_embed_list(cat_embeds)
    tag_words, tag_embeds = get_embed_for_words(tag_word_list)
    tag_embed_matrix = matricize_embed_list(tag_embeds)
    
    # return category sim scores 
    sim_scores = category_sim_scores(cat_embed_matrix, tag_embed_matrix.T)
    for word, score in zip(cat_words, sim_scores): 
        print('Similitarity to {}: {}'.format(word.upper(), score))
    # need both matrices
    # print the words on top!!

In [88]:
def compute_categorical_similarities(category_word_list_of_lists, tag_word_list): 
    # tag words are "content" words
    '''
    Input: 
    category_word_list_of_lists: List of lists of strings. 
    Each list corresponds to a number of related category words, such as 
    ['park', 'forest', 'nature']
    
    tag_word_list: List of strings. These are the datasets we've scraped. 
    
    Returns: 
    final_cats_nested: A list of lists, which should be identical to category_word_list_of_lists
    in the ideal case. Some of the inner lists might not contain all the original words, because 
    GloVe doesn not have an embedding for every word in the English language. 
    
    final_scores: Numerical scores between [0, 1] for each category-list. 
    Computed as an average over all the tags in the tag_word_list. 
    '''
    tag_words, tag_embeds = get_embed_for_words(tag_word_list)
    tag_embed_matrix = matricize_embed_list(tag_embeds)
    final_scores = []
    final_cats_nested = []
    for cat_word_list in category_word_list_of_lists: 
        cat_words, cat_embeds = get_embed_for_words(cat_word_list)
        cat_embed_matrix = matricize_embed_list(cat_embeds)
        sim_scores = category_sim_scores(cat_embed_matrix, tag_embed_matrix.T)
        final_scores.append(np.mean(sim_scores))
        final_cats_nested.append(cat_words)
        print('Similarity to all words {}: {}'.format(cat_words, final_scores[-1]))
    return final_cats_nested, final_scores

In [87]:
import pandas as pd

In [90]:
tag_words

['police',
 'police',
 'age',
 'race',
 'energy',
 'gas',
 'data',
 'green',
 'natural',
 'crime',
 'housing',
 'census',
 'vehicle',
 'enforcement',
 'electricity']

In [91]:
# list(set(tag_words))
nested_cats = [['safety', 'police'], ['school', 'pupil', 'teacher']]

In [92]:
category_lists, scores = compute_categorical_similarities(nested_cats, list(set(tag_words)))

Similarity to all words ['police', 'safety']: 0.44999271631240845
Similarity to all words ['school', 'teacher', 'pupil']: 0.2811836004257202


In [86]:
pd.DataFrame(scores, columns=category_lists)

ValueError: all arrays must be same length

In [67]:
whole_thing(['happy', 'sad', 'death'], tag_words)

Similitarity to DEATH: 0.3337719440460205
Similitarity to HAPPY: 0.24990543723106384
Similitarity to SAD: 0.15696865320205688


In [65]:
category_sim_scores(embeds, tag_embeds.T)

array([0.34142044, 0.2540078 , 0.15936016], dtype=float32)

In [22]:
sim_matrix = np.matmul(normalized_category_embeds, normalized_tag_embeds)

In [27]:
masked_sim_matrix = sim_matrix * (sim_matrix >= 0)
masked_sim_matrix

array([[ 1.        ,  1.        ,  1.        ,  1.        ,  0.37782702,
         0.50888675,  0.51192677,  0.36461431,  0.33621722,  0.27145368,
         0.27145368,  0.3727001 ,  0.47038737,  0.28251207,  0.28251207,
         0.54297966,  0.34372619,  0.20126359,  0.20126359,  0.66294223,
         0.57497567,  0.22185263,  0.62788248,  0.3836084 ,  0.55254734,
         0.39194855,  0.39194855,  0.39194855,  0.5385704 ,  0.34456822,
         0.34456822,  0.32892108,  0.32892108,  0.19693632,  0.30481961,
         0.21267366,  0.21267366,  0.21267366,  0.44482815,  0.62654668,
         0.62654668,  0.62654668,  0.38475591,  0.38475591,  0.56387287,
         0.68423003,  0.37948063,  0.5721826 ,  0.33391023,  0.33391023,
         0.58605242,  0.58605242,  0.12189772,  0.32167047,  0.38043648,
         0.43361136,  0.41414142,  0.68039668,  0.68039668,  0.27552515,
         0.27552515,  0.27552515,  0.19996311,  0.68338013,  0.25361446,
         0.38162985,  0.56028682,  0.65576679,  0.3

In [28]:
np.sum(masked_sim_matrix)/(masked_sim_matrix.shape[0] * masked_sim_matrix.shape[1])

0.34174143613039792

In [63]:
np.size(np.random.randn(5, 5, 5))

125