In [1]:
import io
import os

import numpy as np
import tensorflow as tf

In [2]:
def load_embedding_weights(text_dir, filename):
    """Loads the word embedding weights from a pre-trained model.

    Each non-blank line of the file is split on single spaces: the first
    field is taken as the word and the remaining fields are parsed as the
    float32 components of its vector. The file is decoded as UTF-8, so the
    returned words are text strings on both Python 2 and Python 3.

    Parameters:
        text_dir: The directory containing the text file with the weights.
        filename: The name of that text file.

    Returns:
        vocabulary: A list containing the words in the vocabulary.
        embedding: A float32 numpy array of the weights with one row per
            word. Its shape is (0, 0) when the file holds no entries, so
            callers can always unpack two dimensions.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    vocabulary = []
    rows = []
    # io.open decodes to text on Python 2 and 3 alike; the previous binary
    # mode made `split(' ')` fail on Python 3 (bytes split by a str).
    with io.open(os.path.join(text_dir, filename), encoding='utf-8') as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            # Strip only ASCII whitespace so a token that is itself an exotic
            # Unicode space character is not swallowed.
            row = line.strip(' \t\r\n').split(' ')
            if row == ['']:
                # Blank line: nothing to record.
                continue
            vocabulary.append(row[0])
            # A real list (not a lazy `map`) so numpy builds a 2-D array.
            rows.append([np.float32(value) for value in row[1:]])
    if rows:
        embedding = np.array(rows, dtype=np.float32)
    else:
        embedding = np.empty((0, 0), dtype=np.float32)
    print('Loaded word embedding weights.')
    return vocabulary, embedding

# Where the pre-trained weights live; the file name suggests 50-dimensional
# GloVe vectors.
text_dir = 'text_model'
filename = 'glove.6B.50d.txt'

vocabulary, embedding = load_embedding_weights(text_dir=text_dir,
                                               filename=filename)
# One row per word, one column per embedding component.
(vocab_size, embedding_dim) = embedding.shape

Loaded word embedding weights.


In [4]:
# Shape shared by the variable and by the placeholder that feeds it.
embedding_shape = [vocab_size, embedding_dim]

# Non-trainable variable that will hold the embedding matrix; it starts as
# all zeros and is overwritten by the assign op below.
W_embedding = tf.Variable(
    initial_value=tf.constant(0.0, shape=embedding_shape),
    trainable=False,
    name='W_embedding')

# The loaded weights are fed in through a placeholder and copied into the
# variable by running the assign op.
embedding_placeholder = tf.placeholder(dtype=tf.float32, shape=embedding_shape)
embedding_init = W_embedding.assign(embedding_placeholder)

sess = tf.Session()
# Last expression of the cell: the value returned by running the assign op
# is what the notebook displays.
sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})

array([[ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.013441  ,  0.23682   , -0.16899   , ..., -0.56656998,
         0.044691  ,  0.30392   ],
       [ 0.15164   ,  0.30177   , -0.16763   , ..., -0.35652   ,
         0.016413  ,  0.10216   ],
       ..., 
       [-0.51181   ,  0.058706  ,  1.09130001, ..., -0.25003001,
        -1.125     ,  1.58630002],
       [-0.75897998, -0.47426   ,  0.47369999, ...,  0.78953999,
        -0.014116  ,  0.64480001],
       [ 0.072617  , -0.51393002,  0.47279999, ..., -0.18907   ,
        -0.59021002,  0.55558997]], dtype=float32)

In [5]:
# Maximum document length handed to the VocabularyProcessor.
MAX_DOCUMENT_LENGTH = 10

vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
    max_document_length=MAX_DOCUMENT_LENGTH)
# NOTE(review): the ids assigned by the processor are not necessarily the row
# indices of `embedding` -- confirm the two line up before using these ids
# for embedding lookups.
vocab_processor.fit(vocabulary)

<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor at 0x104e06190>

In [6]:
# Quick sanity check: map two short sentences to their word-id sequences.
sample_sentences = ['hey there', 'my name is']
# list() materialises the iterable returned by transform() for display.
list(vocab_processor.transform(sample_sentences))

[array([7818,   57,    0,    0,    0,    0,    0,    0,    0,    0]),
 array([184, 301,  12,   0,   0,   0,   0,   0,   0,   0])]

In [53]:
import pandas as pd

# Read the CSV into a DataFrame, decoding the file as UTF-8.
df = pd.read_csv(filepath_or_buffer='data/disgusted.csv', encoding='utf-8')

In [55]:
# Id of the post whose URL is looked up below.
POST_ID = 161651437343

ids = df['id']
# Select rows and column in a single .loc call instead of chained indexing,
# and take the first match by position rather than by the hard-coded row
# label 31, which stops working as soon as the CSV's row order changes.
# NOTE(review): this assumes every row with this id carries the same
# post_url -- confirm if ids can repeat with different URLs.
df.loc[ids == POST_ID, 'post_url'].iloc[0]

u'https://idreamtofflying.tumblr.com/post/161651437343/me-when-i-see-a-couple-expressing-their-affection'