In [1]:
import os
import sys
import time

import re
import random
import collections

import numpy as np
import networkx as nx
import tensorflow as tf

from sklearn.manifold import TSNE

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
# Column delimiter used when splitting lines of the input files.
sep = "\t"
# Number of passes over the graph; each pass starts one walk from every node.
# 2**(16-10) == 2**6 == 64.
repeat_time = 2 ** 6

# 1.1 Build a graph with networkx from event_relation.tsv

In [3]:
sep = "\t"

# Undirected co-occurrence graph: two people are linked when they appear on the
# same line of the file, and the edge weight counts how many lines they share.
g = nx.Graph()

# Text mode keeps every line a str on both Python 2 and Python 3, so splitting
# on the str separator works on either interpreter.
with open("event_relation1.tsv", "r") as in_file:
    # Discard the first line (presumably a header row); tolerate an empty file.
    next(in_file, None)

    for line in in_file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no relation.
            continue

        event, people = line.split(sep)
        people = people.split(",")

        # Visit every unordered pair of people listed on this line.
        for curr_idx, curr_person in enumerate(people):
            for next_person in people[curr_idx+1:]:
                if g.has_edge(curr_person, next_person):
                    # nx.Graph has no edge keys; only the weight attribute is
                    # updated so no stray "key" attribute is stored on the edge.
                    g[curr_person][next_person]['weight'] += 1
                else:
                    g.add_edge(curr_person, next_person, weight=1)

# 1.3 Read Ground-Truth Labels for People

In [None]:
def read_label(filepath="people_label.txt", delimiter=None):
    """Read per-person labels from a delimited text file.

    Every non-blank line must contain exactly two fields:
    ``<person><delimiter><integer label>``.

    Args:
        filepath: path of the label file.
        delimiter: field separator. When None, the module-level ``sep`` is
            used, which is what the function did before this parameter existed.

    Returns:
        dict mapping each person to ``int(label) * 2.0`` (a float).

    Raises:
        ValueError: if a line does not split into exactly two fields or the
            label field is not an integer.
    """
    if delimiter is None:
        delimiter = sep

    labels = {}

    # Text mode keeps lines as str on both Python 2 and 3, so they can be
    # split with a str delimiter.
    with open(filepath, "r") as in_file:
        for line in in_file:
            line = line.strip()
            if not line:
                # Skip blank lines instead of failing on the tuple unpack.
                continue

            people, label = line.split(delimiter)

            labels[people] = int(label)*2.0

    return labels

# Loaded once at module level; VisualizeEmbeddings.plot_embeddings reads this
# global to colour each plotted point by its label.
people_labels = read_label()

# 2.1 Apply Random Walk in Graph to describe the network

In [6]:
class RandomWalkInGraph(object):
    """Generate edge-weighted random walks over a graph.

    The graph object must provide ``nodes()`` (an iterable of nodes) and
    ``graph[node]`` (a mapping ``neighbor -> edge attribute dict``), which is
    the interface the networkx ``Graph`` used in this notebook exposes.

    Args:
        g: the graph to walk on.
        length: maximum number of vertices in one walk, start vertex included.
    """

    def __init__(self, g, length=2**5):
        self.network = g
        self.length = length

        # Transition table, filled by probability_to_next_vertex():
        #   key    -> value
        #   node   -> (neighbors, cumulative_weights)
        # e.g. people_0001 -> ([people_0010, people_0021], array([2., 3.]))
        self.probability = {}

        self.probability_to_next_vertex()

    def probability_to_next_vertex(self):
        """Build the per-node transition table from the edge weights.

        For every node the neighbors are stored next to the running sum of
        their edge weights (a missing ``weight`` attribute counts as 1), so a
        neighbor can be drawn proportionally to its weight with one lookup.
        """
        self.probability = {}

        for node in self.network.nodes():
            neighbors, weights = [], []
            for neighbor, data in self.network[node].items():
                neighbors.append(neighbor)
                weights.append(float(data.get('weight', 1)))

            self.probability[node] = (neighbors, np.cumsum(weights))

    def next_vertex(self, node_start):
        """Walk from ``node_start`` and return the visited vertices.

        The walk stops after ``self.length`` vertices, or earlier when the
        current vertex has no neighbor with positive weight.

        Returns:
            list such as ``[people_0001, people_0004, people_0010, ...]`` whose
            first element is always ``node_start``.
        """
        path = [node_start]
        current = node_start

        while len(path) < self.length:
            neighbors, cumulative = self.probability.get(current, ((), ()))
            if len(neighbors) == 0:
                break

            total = cumulative[-1]
            if total <= 0:
                break

            # Draw in [0, total) and locate the neighbor whose cumulative
            # weight interval contains the draw.
            draw = np.random.random() * total
            pick = int(np.searchsorted(cumulative, draw, side='right'))
            # Defensive clamp against floating-point rounding at the top end.
            pick = min(pick, len(neighbors)-1)

            current = neighbors[pick]
            path.append(current)

        return path

    def random_walk(self):
        """Yield one walk per node, visiting the start nodes in random order."""
        # Copy into a list: nodes() may return a read-only view (networkx 2.x),
        # which np.random.shuffle cannot permute in place.
        nodes = list(self.network.nodes())
        np.random.shuffle(nodes)

        for node_start in nodes:
            yield self.next_vertex(node_start)

In [None]:
# Flat token stream: the concatenation of every random walk, used as the
# training "text" for the embedding model.
words = []

rkig = RandomWalkInGraph(g)

# Report progress four times over the run; max(1, ...) keeps the modulo below
# valid when repeat_time is smaller than 4.
report_every = max(1, repeat_time // 4)

timestamp_start = time.time()

for idx in range(repeat_time):
    for path in rkig.random_walk():
        words.extend(path)

    if idx % report_every == 0:
        timestamp_end = time.time()

        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("Iteration {:4d}| Spend {:.4f} seconds to add {} into the words".format(idx+1, timestamp_end-timestamp_start, len(words)))

        timestamp_start = timestamp_end

print("The size of words is {}".format(len(words)))

# most_common() already returns (word, count) pairs ordered by descending
# count, so the vocabulary is simply the words in that order.
count = collections.Counter(words).most_common(len(set(g.nodes())))
vocabulary = [word for word, _ in count]

print("The size of vocabulary is {}".format(len(vocabulary)))

# 3.1 Define the Language Model with the SKIPGRAM methodology

In [8]:
class Word2Vec(object):
    """Skip-gram embedding model over a flat sequence of tokens.

    The constructor turns ``words`` into integer ids (most frequent token gets
    id 0) and picks a few frequent ids for validation; ``train`` builds and
    runs a TensorFlow 1.x graph and stores the learned embeddings on the
    instance.

    Args:
        words: sequence of hashable tokens, in training order.
        method: stored on the instance; not read anywhere in this class.
        num_sample: number of context tokens drawn for each target token.
        window_size: how many tokens on each side of the target are context.
        embed_dimension: size of each embedding vector.
        num_negative_samples: classes sampled per batch by the sampled softmax.
        validation_sample: how many validation ids to draw.
    """

    # At most SIZE_DICTIONARY-1 distinct tokens are given an id.
    SIZE_DICTIONARY = 2**16
    # Validation ids are drawn among the VALIDATION_WINDOW most frequent tokens.
    VALIDATION_WINDOW = 64

    def __init__(self, words,
                 method="skipgram", num_sample=2, window_size=1, embed_dimension=2**8, num_negative_samples=64,
                 validation_sample=16):

        self.method = method
        self.num_sample = num_sample
        self.window_size = window_size

        self.embed_dimension = embed_dimension
        self.num_negative_samples = num_negative_samples

        # token -> id and id -> token
        self.dictionary = {}
        self.reversed_dictionary = {}

        # Read position inside training_dataset for next_batch().
        self.batch_idx = 0
        self.training_dataset = []

        self.set_dictionaries(words)
        self.set_training_dataset(words)
        self.set_validation_dataset(validation_sample)

    def set_dictionaries(self, words):
        """Assign ids to tokens in order of decreasing frequency."""
        most_common = collections.Counter(words).most_common(self.SIZE_DICTIONARY-1)
        for word, _ in most_common:
            token_id = len(self.dictionary)
            self.dictionary[word] = token_id
            self.reversed_dictionary[token_id] = word

        print("The size of dictionary is {}".format(len(self.dictionary)))

    def set_training_dataset(self, words):
        """Append the id of every token in ``words`` to the training stream.

        Raises:
            KeyError: if a token received no id because the vocabulary exceeds
                SIZE_DICTIONARY-1 distinct tokens.
        """
        self.training_dataset.extend([self.dictionary[word] for word in words])

        print("The shape of training dataset is {}".format(len(self.training_dataset)))

    def set_validation_dataset(self, validation_sample):
        """Draw distinct validation ids among the most frequent tokens.

        The pool is limited to ids that exist in the dictionary and the sample
        size is limited to the pool size, so small vocabularies neither produce
        out-of-range ids nor make ``random.sample`` raise.
        """
        pool_size = min(self.VALIDATION_WINDOW, len(self.dictionary))
        sample_size = min(validation_sample, pool_size)

        self.validation_dataset = np.array(random.sample(range(pool_size), sample_size), dtype=int)

    def next_batch(self, batch_size):
        """Return the next skip-gram batch and advance the read position.

        Args:
            batch_size: number of (target, context) pairs; must be a multiple
                of ``num_sample``.

        Returns:
            ``(targets, contexts)`` as int32 arrays of shape ``(batch_size,)``
            and ``(batch_size, 1)``.

        Raises:
            ValueError: if the training stream is empty.
        """
        assert batch_size%self.num_sample == 0
        assert self.num_sample <= 2*self.window_size

        n_tokens = len(self.training_dataset)
        if n_tokens == 0:
            raise ValueError("The training dataset is empty")

        # Width of the sliding window: the target plus window_size tokens on each side.
        span = 1+2*self.window_size

        window = collections.deque(maxlen=span)
        def move_on():
            # Slide the window one token forward, wrapping at the end of the stream.
            window.append(self.training_dataset[self.batch_idx])
            self.batch_idx = (self.batch_idx+1)%n_tokens

        for _ in range(span):
            move_on()

        datasets, labels = [], []
        for _ in range(batch_size // self.num_sample):
            # The target sits in the middle of the window.
            target_word = self.window_size
            context_word = target_word
            context_words = [context_word]

            for _ in range(self.num_sample):
                # Redraw until the position is neither the target nor already used.
                while context_word in context_words:
                    context_word = np.random.randint(0, span)
                context_words.append(context_word)

                datasets.append(window[target_word])
                labels.append([window[context_word]])

            move_on()

        # Step back by one window so the next batch resumes at the target that
        # follows the last one emitted here; otherwise ``span`` targets are
        # skipped at every batch boundary.
        self.batch_idx = (self.batch_idx+n_tokens-span)%n_tokens

        return np.array(datasets, dtype=np.int32), np.array(labels, dtype=np.int32)

    def train(self, learning_rate=1.0, batch_size=128, n_epoch=2**15, printing_epoch=2**12, top_k=8):
        """Train the embeddings with a sampled-softmax loss and Adagrad.

        On completion ``self.embeddings`` holds the raw embedding matrix and
        ``self.normalized_embeddings`` the row-wise L2-normalised one.

        Args:
            learning_rate: Adagrad learning rate.
            batch_size: pairs per optimisation step (see ``next_batch``).
            n_epoch: number of optimisation steps; each step consumes one batch.
            printing_epoch: the average loss is printed every this many steps;
                nearest neighbours are printed every ``4*printing_epoch`` steps.
            top_k: how many nearest neighbours to print per validation token.
        """
        print("The size of vocabulary is {}, the dimension of embeddings is {}".format(len(self.dictionary), self.embed_dimension))
        print("The shape of validation dataset is {}".format(len(self.validation_dataset)))

        vocabulary_size = len(self.dictionary)

        tf.reset_default_graph()

        train_x = tf.placeholder(tf.int32, shape=[None])
        train_y = tf.placeholder(tf.int32, shape=[None, 1])
        validate_x = tf.constant(self.validation_dataset, dtype=np.int32)

        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, self.embed_dimension], -1.0, 1.0), dtype=tf.float32)
        embeddings_x = tf.nn.embedding_lookup(embeddings, train_x)

        softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dimension],
                                                          stddev=1.0 / np.sqrt(self.embed_dimension)))
        softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

        sampled_softmax_loss = tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                                          biases=softmax_biases,
                                                          inputs=embeddings_x,
                                                          labels=train_y,
                                                          num_sampled=self.num_negative_samples,
                                                          num_classes=vocabulary_size)
        loss = tf.reduce_mean(sampled_softmax_loss)

        optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

        # Row-normalise so the matrix product below is a cosine similarity.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        norm_embeddings = embeddings / norm

        embeddings_validation = tf.nn.embedding_lookup(norm_embeddings, validate_x)
        similarity = tf.matmul(embeddings_validation, tf.transpose(norm_embeddings))

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            average_loss = 0.0
            # Number of steps accumulated in average_loss since the last report.
            steps_since_report = 0
            timestamp_start = time.time()
            for epoch in range(n_epoch):
                batch_x, batch_y = self.next_batch(batch_size)
                fd = {train_x: batch_x, train_y: batch_y}

                _, batch_loss = sess.run([optimizer, loss], feed_dict=fd)
                average_loss += batch_loss
                steps_since_report += 1

                is_last_step = epoch == n_epoch-1

                if epoch > 0 and (is_last_step or epoch%printing_epoch == 0):
                    timestamp_end = time.time()

                    # Divide by the steps really accumulated: the first and the
                    # last report do not cover exactly printing_epoch steps.
                    print("Epoch {:5d}| seconds: {:3.2f}, average_loss: {:2.12f}".\
                          format(epoch, timestamp_end-timestamp_start, average_loss/steps_since_report))

                    timestamp_start = timestamp_end
                    average_loss = 0.0
                    steps_since_report = 0

                if epoch > 0 and (is_last_step or epoch%(printing_epoch*4) == 0):
                    sim = similarity.eval()
                    for idx, word in enumerate(self.validation_dataset):
                        validation_word = self.reversed_dictionary[word]
                        # Position 0 is skipped: it is expected to be the token itself.
                        nearest = (-sim[idx, :]).argsort()[1:top_k+1]
                        log = "Nearest to {}:".format(validation_word)

                        # Iterate over what is available so that vocabularies
                        # with top_k tokens or fewer do not index out of range.
                        for neighbor_id in nearest:
                            close_word = self.reversed_dictionary[neighbor_id]
                            log = "{} {},".format(log, close_word)

                        print(log)

            self.embeddings = embeddings.eval()
            self.normalized_embeddings = norm_embeddings.eval()

In [None]:
# Build the skip-gram model over the random-walk token stream (`words`), then
# train it; train() stores the learned embeddings on the instance.
word2vec = Word2Vec(words, num_sample=2, window_size=1, embed_dimension=2**8)
word2vec.train(learning_rate=1.0, n_epoch=2**16, printing_epoch=2**12, top_k=4)

In [14]:
class VisualizeEmbeddings(object):
    """Project embeddings to a low dimension and draw them as a labelled scatter plot."""

    def __init__(self):
        pass

    def cluster_embeddings(self, counts, embeddings,
                           method=None,
                           plot_only=2**9):
        """Reduce the first ``plot_only`` embeddings and plot them.

        Args:
            counts: sequence of labels; ``counts[i]`` labels row ``i`` of
                ``embeddings``.
            embeddings: 2-D array, one embedding per row.
            method: object with a ``fit_transform`` method. When None, a fresh
                ``TSNE(perplexity=30, n_components=2, init='pca',
                n_iter=2**14)`` is created for this call, so no estimator
                instance is shared between calls.
            plot_only: maximum number of rows to plot; it is clipped to the
                number of labels and rows that are available.
        """
        if method is None:
            method = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2**14)

        # Never ask for more points than there are labels or embedding rows.
        plot_only = min(plot_only, len(counts), len(embeddings))

        low_dim_embs = method.fit_transform(embeddings[:plot_only,:])
        labels = [counts[i] for i in range(plot_only)]

        self.plot_embeddings(low_dim_embs, labels)

    def plot_embeddings(self, low_dim_embs, labels):
        """Scatter the 2-D points, annotate each one and colour it by its label.

        Colours are looked up in the module-level ``people_labels`` mapping.

        Args:
            low_dim_embs: array with one 2-D point per row.
            labels: label of each point; at most one per row.

        Raises:
            AssertionError: if there are more labels than points.
            KeyError: if a label is missing from ``people_labels``.
        """
        assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"

        # Only labelled points are drawn, which keeps the coordinate arrays and
        # the colour list the same length.
        points = low_dim_embs[:len(labels)]

        plt.figure(figsize=(18, 18))
        for (x, y), label in zip(points, labels):
            # label[7:] drops the first 7 characters -- presumably a
            # "people_" prefix; confirm against the input data.
            plt.annotate(label[7:],
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.scatter(points[:,0], points[:,1], c=[people_labels[label] for label in labels])

        plt.colorbar()
        plt.grid()

In [None]:
# Label every embedding row with the token Word2Vec assigned to that row id, so
# the labels cannot drift out of step with the rows of the embedding matrix.
embedding_labels = [word2vec.reversed_dictionary[idx] for idx in range(len(word2vec.reversed_dictionary))]

v = VisualizeEmbeddings()
v.cluster_embeddings(embedding_labels, word2vec.normalized_embeddings, plot_only=len(embedding_labels))