In [11]:
from collections import Counter
from itertools import dropwhile, chain

import pandas as pd
import numpy as np

from gensim.models import Word2Vec as GensimWord2Vec
from gensim.models.word2vec import Vocab
from gensim.utils import tokenize

import time
import logging
import heapq
from copy import deepcopy
import numpy as np
from scipy.spatial.distance import cdist
from math import sqrt

In [12]:
# Load the training split; the CSV has no header row, so the column names are supplied here.
df = pd.read_csv(
    'ag_news_csv/train.csv',
    delimiter=',',
    header=None,
    names=('Label', 'Title', 'Caption'),
)

In [13]:
# Join title and caption of every row and split the result into word tokens.
X_train = df[['Title', 'Caption']].apply(
    lambda row: list(tokenize(' '.join(row))),
    axis=1,
)
# Numeric class ids are replaced by pseudo-word tokens.
label_names = {1:'LABELL1', 2:'LABELL2', 3:'LABELL3', 4:'LABELL4'}
y_train = df['Label'].apply(lambda class_id: label_names[class_id])

# Count every token of the corpus, then drop all tokens seen fewer than `min_count` times.
# most_common() is sorted by descending count, so everything after the first rare entry is rare too.
min_count = 100
vocab_count = Counter(chain.from_iterable(X_train))
rare_tail = dropwhile(lambda key_count: key_count[1] >= min_count, vocab_count.most_common())
for key, count in rare_tail:
    del vocab_count[key]

# Tokens that did not survive the pruning are replaced by the placeholder 'UNK'.
X_train = X_train.apply(lambda tokens: [tok if tok in vocab_count else 'UNK' for tok in tokens])
# NOTE: only the rows returned by head() are kept as training sentences.
sentences = list(X_train.head())

In [34]:
# Logger used by the Word2Vec class below for its progress messages.
logger = logging.getLogger(name="word2vec")

class Word2Vec():
    """
    Word2Vec Model, which can be trained and then contains word embedding that can be used for all kinds of cool stuff.
    """
    def __init__(self, sentences=None, mtype='cbow', embed_dim=10, hs=1, neg=0, thr=0,
                 window=2, min_count=1, alpha=0.025, min_alpha=0.0001, seed=1):
        """
        Initialize Word2Vec model
        Inputs:
            - sentences: (default None) List or generator object supplying lists of (preprocessed) words
                         used to train the model (otherwise train manually with model.train(sentences))
            - mtype: (default 'sg') type of model: either 'sg' (skipgram) or 'cbow' (bag of words)
            - embed_dim: (default 100) dimensionality of embedding
            - hs: (default 1) if != 0, hierarchical softmax will be used for training the model
            - neg: (default 0) if > 0, negative sampling will be used for training the model; 
                   neg specifies the # of noise words
            - thr: (default 0) threshold for computing probabilities for sub-sampling words in training
            - window: (default 5) max distance of context words from target word in training
            - min_count: (default 5) how often a word has to occur at least to be taken into the vocab
            - alpha: (default 0.025) initial learning rate
            - min_alpha: (default 0.0001) if < alpha, the learning rate will be decreased to min_alpha
            - seed: (default 1) random seed (for initializing the embeddings)
        """
        assert mtype in ('sg','cbow'), "unknown model, use 'sg' or 'cbow'"
        self.vocab = {} # mapping from a word (string) to a Vocab object
        self.index2word = []  # map from a word's matrix index (int) to the word (string)
        self.mtype = mtype
        self.embed_dim = embed_dim
        self.hs = hs
        self.neg = neg
        self.thr = thr
        self.window = window
        self.min_count = min_count
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.seed = seed
        # possibly train model
        if sentences:
            self.build_vocab(sentences)
            self.train(sentences)

            
    def reset_weights(self):
        """
        Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.
        """        
        np.random.seed(self.seed)
        # weights
        self.syn1 = np.asarray(
                        np.random.uniform(
                            low=-4*np.sqrt(6. / (len(self.vocab) + self.embed_dim)),
                            high=4*np.sqrt(6. / (len(self.vocab) + self.embed_dim)),
                            size=(len(self.vocab), self.embed_dim)
                        ),
                        dtype=float
                    )

        # embedding        
        self.syn0 = np.asarray(
                        np.random.uniform(
                            low=-4*np.sqrt(6. / (len(self.vocab) + self.embed_dim)),
                            high=4*np.sqrt(6. / (len(self.vocab) + self.embed_dim)),
                            size=(len(self.vocab), self.embed_dim)
                        ),
                        dtype=float
                    )#(np.random.rand(len(self.vocab), self.embed_dim) - 0.5) / self.embed_dim
        self.syn0norm = None
        
    def _create_binary_tree(self):
        """
        Build a binary Huffman tree over the vocabulary and store every word's path in it.
        Called internally from `build_vocab()`.

        The two smallest heap entries are merged repeatedly into a new inner node whose count is
        the sum of both. Afterwards each vocabulary entry receives
            - code: the 0/1 branch choices from the root (0 = left, 1 = right)
            - point: the inner nodes on that path, numbered from 0 (node index minus vocabulary size)
        The heap ordering relies on how Vocab objects compare with each other
        (presumably by count, which makes frequent words end up with shorter codes -- confirm).
        """
        n_words = len(self.vocab)
        logger.info("constructing a huffman tree from %i words"%n_words)

        # build the huffman tree
        pending = list(self.vocab.values())
        heapq.heapify(pending)
        # inner nodes are numbered n_words, n_words + 1, ... so they never clash with word indices
        for inner_index in range(n_words, 2*n_words - 1):
            smallest = heapq.heappop(pending)
            second_smallest = heapq.heappop(pending)
            merged = Vocab(count=smallest.count + second_smallest.count, index=inner_index,
                           left=smallest, right=second_smallest)
            heapq.heappush(pending, merged)

        if not pending:
            # empty vocabulary => nothing to encode
            return

        # walk the tree with an explicit stack, assigning a binary code to each vocabulary word
        deepest = 0
        todo = [(pending[0], [], [])]
        while todo:
            node, codes, points = todo.pop()
            if node.index >= n_words:
                # inner node => remember it on the path and descend into both children
                points = np.array(list(points) + [node.index - n_words], dtype=int)
                todo.append((node.left, np.array(list(codes) + [0], dtype=int), points))
                todo.append((node.right, np.array(list(codes) + [1], dtype=int), points))
            else:
                # leaf node => store its path from the root
                node.code, node.point = codes, points
                deepest = max(len(codes), deepest)
        logger.info("built huffman tree with maximum node depth %i"%deepest)
            
            
    def build_vocab(self, sentences, hs=1, neg=False, thr=0):
        """
        Count the words of `sentences`, add the label tokens and prepare the model for training.
        `sentences` can be a once-only generator stream; each sentence must be a list of strings.

        Inputs:
            - sentences: iterable of lists of words
            - hs, neg, thr: any value that is not literally False replaces the stored setting.
              NOTE: with the defaults (hs=1, thr=0) a plain build_vocab(sentences) call therefore
              resets self.hs to 1 and self.thr to 0, whatever was given to the constructor.
        Side effects:
            - label counts are taken from inject_label_counts(X_train, y_train), i.e. from
              notebook-level names rather than from `sentences`; a label entry replaces any
              word of the same name
            - fills self.vocab / self.index2word with all entries whose count >= self.min_count
            - sets every entry's sub-sampling value `prob` (0. when self.thr is not > 0)
            - builds the Huffman tree if self.hs is truthy and resets all weights
        """
        # chance to change your mind about the training type
        if hs is not False:
            self.hs = hs
        if neg is not False:
            self.neg = neg
        if thr is not False:
            self.thr = thr

        logger.info("collecting all words and their counts")
        counts = {}
        total_words = 0
        sentence_no = -1  # stays -1 when `sentences` is empty, so the summary below reports 0
        for sentence_no, tokens in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i unique words" %
                    (sentence_no, total_words, len(counts)))
            for token in tokens:
                total_words += 1
                if token in counts:
                    counts[token].count += 1
                else:
                    counts[token] = Vocab(count=1)

        # the label tokens enter the vocabulary with externally computed counts
        label_counts = inject_label_counts(X_train, y_train)
        for label, label_count in label_counts.items():
            counts[label] = Vocab(count=label_count)

        logger.info("collected %i unique words from a corpus of %i words and %i sentences" %
            (len(counts), total_words, sentence_no + 1))

        # assign a unique index to each sufficiently frequent word
        self.vocab, self.index2word = {}, []
        for token, entry in counts.items():
            if entry.count >= self.min_count:
                entry.index = len(self.vocab)
                self.index2word.append(token)
                self.vocab[token] = entry
        logger.info("total of %i unique words after removing those with count < %s" % (len(self.vocab), self.min_count))

        # add probabilities for sub-sampling (if self.thr > 0)
        if self.thr > 0:
            total_words = float(sum(entry.count for entry in self.vocab.values()))
            scaled_thr = self.thr * total_words
            for entry in self.vocab.values():
                # formula as used in the reference code (differs from the one in the paper)
                entry.prob = (sqrt(entry.count / scaled_thr) + 1.) * scaled_thr / entry.count
        else:
            # if prob is 0, word wont get discarded
            for entry in self.vocab.values():
                entry.prob = 0.

        # add info about each word's Huffman encoding
        if self.hs:
            self._create_binary_tree()
        # initialize layers
        self.reset_weights()

    def train_sentence_cbow(self, sentence, alpha, sentence_no):
        """
        Update the model by training on a single sentence using hierarchical softmax.
        Called internally from `Word2Vec.train()`.

        The input to the network is not a window of context words: for every word of the sentence
        the embedding of the sentence's label token is used, where the label is looked up in the
        notebook-level `y_train` at position `sentence_no`. `self.window` therefore has no effect.
        When `self.hs` is falsy no weights are updated (negative sampling is not implemented here).

        Inputs:
            - sentence: list of Vocab objects (or None, where the corresponding word is not in the vocabulary)
            - alpha: learning rate to apply for this sentence
            - sentence_no: index of the sentence, used to find its label in `y_train`
        Returns:
            the number of truthy entries in `sentence` (sub-sampled words are still counted)
        Raises:
            KeyError if the sentence's label token is not part of the vocabulary
        """
        # The original drew an unused `reduced_window` via np.random.randint(self.window-1),
        # which raises ValueError for window <= 1; the draw is removed because the window
        # is never consulted.
        label_index = None  # looked up lazily, so sentences without trainable words need no label
        for word in sentence:
            if not word or (word.prob and word.prob < np.random.rand()):
                continue  # OOV word in the input sentence or subsampling => skip
            if label_index is None:
                label_index = self.vocab[y_train[sentence_no]].index
            if self.hs:
                # hidden layer = (copy of) the label's embedding, 1xlayer1_size
                l1 = np.array(self.syn0[label_index])
                # work on the entire tree at once --> 2d matrix, codelen x layer1_size
                l2 = deepcopy(self.syn1[word.point])
                # propagate hidden -> output
                f = 1. / (1. + np.exp(-np.dot(l1, l2.T)))
                # vector of error gradients multiplied by the learning rate
                g = (1. - word.code - f) * alpha
                # learn hidden -> output
                self.syn1[word.point] += np.outer(g, l1)
                # learn input -> hidden: only the label's embedding row is adjusted
                self.syn0[label_index] += np.dot(g, l2)
        return sum(1 for word in sentence if word)

    def train(self, sentences, mtype=False, alpha=False, min_alpha=False):
        """
        Update the model's embedding and weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of strings.

        Inputs:
            - sentences: iterable of lists of words
            - mtype: if 'sg' or 'cbow', replaces the stored model type (any other value is ignored);
                     note that every sentence is trained with `train_sentence_cbow` regardless
            - alpha: if truthy, replaces the stored initial learning rate
            - min_alpha: if truthy, replaces the stored final learning rate
        Side effects:
            updates the weights through `train_sentence_cbow` and stores the unit-length
            version of all embeddings in self.syn0norm
        Raises:
            RuntimeError if the vocabulary has not been built yet
        """
        logger.info("training model on %i vocabulary and %i features" % (len(self.vocab), self.embed_dim))
        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")
        if mtype in ('sg','cbow'):
            self.mtype = mtype
        if alpha:
            self.alpha = alpha
        if min_alpha:
            self.min_alpha = min_alpha

        start, next_report = time.time(), 20.
        total_words = sum(v.count for v in self.vocab.values())
        # Out-of-vocabulary words fall back to the 'UNK' entry when the vocabulary has one and to
        # None otherwise. (Using the string 'UNK' itself as fallback would hand a plain str to
        # train_sentence_cbow, which expects Vocab objects or None.)
        unk_entry = self.vocab.get('UNK')
        word_count = 0
        for sentence_no, sentence in enumerate(sentences):
            # convert input string lists to Vocab objects (UNK entry or None for OOV words)
            no_oov = [self.vocab.get(word, unk_entry) for word in sentence]
            # update the learning rate before every iteration; the progress is capped at 1 so the
            # rate never moves past min_alpha when more words are seen than the vocabulary counted
            progress = min(1., word_count / total_words) if total_words > 0 else 1.
            alpha = self.min_alpha + (self.alpha-self.min_alpha) * (1. - progress)
            # train on the sentence and check how many words did we train on (out-of-vocabulary (unknown) words do not count)
            word_count += self.train_sentence_cbow(no_oov, alpha, sentence_no)
            # report progress
            elapsed = time.time() - start
            if elapsed >= next_report:
                logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                    (100.0 * word_count / total_words, alpha, word_count / elapsed if elapsed else 0.0))
                next_report = elapsed + 20.  # don't flood the log, wait at least 20 seconds between progress reports
        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
            (word_count, elapsed, word_count / elapsed if elapsed else 0.0))
        # for convenience (for later similarity computations, etc.), store all embeddings additionally as unit length vectors
        self.syn0norm = self.syn0 / np.linalg.norm(self.syn0, axis=1, keepdims=True)

    def __getitem__(self, word):
        """
        Return a word's representations in vector space, as a 1D numpy array.
        Example:
          >>> trained_model['woman']
          array([ -1.40128313e-02, ...]
        """
        return self.syn0[self.vocab[word].index]

    def __contains__(self, word):
        return word in self.vocab

    def __str__(self):
        return "Word2Vec(vocab=%s, size=%s, mtype=%s, hs=%i, neg=%i)" % (len(self.index2word), self.embed_dim, self.mtype, self.hs, self.neg)

    def most_similar(self, positive=[], negative=[], topn=10):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.
        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words, and corresponds to the `word-analogy` and
        `distance` scripts in the original word2vec implementation.
        Example::
          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]
        """
        if isinstance(positive, str) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [(word, 1.) if isinstance(word, str) else word for word in positive]
        negative = [(word, -1.) if isinstance(word, str) else word for word in negative]

        # compute the weighted average of all words
        all_words = set()
        mean = np.zeros(self.embed_dim)
        for word, weight in positive + negative:
            try:
                mean += weight * self.syn0norm[self.vocab[word].index]
                all_words.add(self.vocab[word].index)
            except KeyError:
                print ("word '%s' not in vocabulary" % word)
        if not all_words:
            raise ValueError("cannot compute similarity with no input")
        dists = np.dot(self.syn0norm, mean/np.linalg.norm(mean))
        if not topn:
            return dists
        best = np.argsort(dists)[::-1][:topn + len(all_words)]
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], dists[sim]) for sim in best if sim not in all_words]
        return result[:topn]

In [35]:
def inject_label_counts(X_train, y_train, offset=30000):
    """
    Sum up, per label, the number of tokens of all sentences carrying that label,
    then lower every total by `offset`.

    Inputs:
        - X_train: iterable of tokenized sentences (each item must support len())
        - y_train: iterable of labels, aligned with X_train (pairs are formed with zip)
        - offset: (default 30000) constant subtracted from every label's token total
                  NOTE(review): the reason for this shift is not documented in the notebook
    Returns:
        dict mapping label -> (token total - offset); 'LABELL1' ... 'LABELL4' are always
        present, any other label found in y_train is added as well
    """
    num_tokens_per_label = {'LABELL1':0, 'LABELL2':0, 'LABELL3':0, 'LABELL4':0}
    for sent, label in zip(X_train, y_train):
        # .get() lets labels outside the four predefined ones be counted instead of raising KeyError
        num_tokens_per_label[label] = num_tokens_per_label.get(label, 0) + len(sent)

    return {label: total - offset for label, total in num_tokens_per_label.items()}


In [36]:
# Create an untrained model with the default settings, then build its vocabulary in a separate step.
model = Word2Vec()
model.build_vocab(sentences=sentences)


In [37]:
# Train on the same sentences the vocabulary was built from.
model.train(sentences=sentences)

In [38]:
# Show the learned vectors of two ordinary words and of the four label tokens.
for token in ('Back', 'AFP', 'LABELL3', 'LABELL2', 'LABELL1', 'LABELL4'):
    print(model[token])

[ 0.1875403   0.85079667 -0.28322893  0.17550322  0.18167026  0.21235507
 -0.80905348  0.4594136   0.82261049  0.06364737]
[ 0.63512296  0.78475037  0.29449222  0.14347461 -0.18588998  0.85638598
 -0.63487745  0.72511304 -0.34685261  0.08899328]
[-0.09961668  0.77599518 -0.42756761  0.62209722 -0.00303212  0.25651261
  0.25629765  0.50532322 -0.07250288  1.34060398]
[-0.77072495 -0.03359329 -0.09970783  0.31654352 -0.09408384  0.37498167
  0.33337839  0.3618318   0.21767101 -0.64056409]
[ 0.22448335 -0.49242767  0.49578059 -0.72402306  0.10135646 -0.7433876
  0.14863216 -0.45336199  0.2743203   0.39611279]
[ 0.06792012 -0.24596423  0.19143705  0.84205213 -0.19670482 -0.19642943
  0.53341359  0.26626614 -0.19475743  0.34679409]


In [39]:
# Load the test split (no header row) and tokenize it the same way as the training data.
df2 = pd.read_csv(
    'ag_news_csv/test.csv',
    delimiter=',',
    header=None,
    names=('Label', 'Title', 'Caption'),
)

X_test = df2[['Title', 'Caption']].apply(
    lambda row: list(tokenize(' '.join(row))),
    axis=1,
)
label_names = {1:'LABELL1', 2:'LABELL2', 3:'LABELL3', 4:'LABELL4'}
y_test = df2['Label'].apply(lambda class_id: label_names[class_id])

# Words missing from the model's vocabulary are replaced by the placeholder 'UNK'.
X_test = X_test.apply(lambda tokens: [tok if tok in model.vocab else 'UNK' for tok in tokens])
# NOTE(review): `sentences` is rebuilt from X_train here, not from X_test -- confirm this is intended.
sentences = list(X_train.head())

In [40]:
# View over the label tokens stored in label_names ('LABELL1' ... 'LABELL4'); a dict view, not a list.
L = label_names.values()
