In [None]:


# Install the third-party packages this notebook imports.
!pip install spacy
# Fetch the English model data for spaCy.
!python -m spacy.en.download all
!pip install nltk
!pip install gensim
!pip install theano
!pip install keras
import os
# Select Theano as the Keras backend; set here, ahead of the keras imports in the next cell.
os.environ['KERAS_BACKEND'] = 'theano'

In [None]:

from keras.layers import Embedding, Input, Dense
from keras.models import Model, Sequential
import numpy as np
from scipy.special import expit
from scipy.stats import entropy
from collections import defaultdict

import nltk
# Download the NLTK 'punkt' and 'reuters' resources.
# NOTE(review): neither resource appears to be referenced by the cells visible here -- confirm before removing.
nltk.download('punkt')
nltk.download('reuters')
import spacy
# Load spaCy's English pipeline; parse=False is forwarded to the loader
# (presumably to skip the dependency parser -- TODO confirm against the installed spaCy version).
nlp = spacy.load('en', parse = False)

In [None]:
from sklearn.datasets import fetch_20newsgroups

# NOTE(review): the variable is called "train" but the 'test' subset is what gets
# loaded; the name is kept because other cells may refer to it.
newsgroups_train = fetch_20newsgroups(subset='test')

# Keep the first 1000 posts and turn line breaks into spaces. Deleting the newline
# outright glued the last word of each line to the first word of the next one
# ("foo\nbar" -> "foobar"), which corrupted the tokens handed to spaCy.
texts = [post.replace("\n", " ") for post in newsgroups_train.data[:1000]]

In [None]:
def pick_value(token):
    """Map a spaCy token to the string that stands for it in the cleaned corpus.

    Tokens whose lemma is one of a handful of stop words, whitespace tokens and
    punctuation are dropped by returning an empty string. URL-like, e-mail-like
    and number-like tokens are collapsed into the placeholders "URL", "EMAIL"
    and "NUM". Any other token is represented by its lemma. Every kept value is
    followed by the token's own trailing whitespace.
    """
    stop_lemmas = u'the it be for a of and but to in'.split()
    if token.lemma_ in stop_lemmas or token.is_space:
        return ""

    # The checks are ordered: a token matching several categories gets the first one.
    if token.like_url:
        replacement = "URL"
    elif token.like_email:
        replacement = "EMAIL"
    elif token.like_num:
        replacement = "NUM"
    elif token.is_punct:
        return ""
    else:
        replacement = token.lemma_

    return replacement + token.whitespace_
    
# Stream the raw texts through the spaCy pipeline and normalise every token.
tokenized_documents = [
    [pick_value(token) for token in doc]
    for doc in nlp.pipe(texts, n_threads = 8)
]

# Joining and re-splitting on whitespace removes the empty strings produced for
# dropped tokens as well as the trailing whitespace attached to the kept ones.
processed_documents = [" ".join(tokens) for tokens in tokenized_documents]
tokenized_documents = [text.split() for text in processed_documents]

In [None]:

# Corpus-wide occurrence count for every token.
frequency = defaultdict(int)
for document in tokenized_documents:
    for word in document:
        frequency[word] = frequency[word] + 1

# Keep only tokens seen more than five times, then rebuild the joined strings.
tokenized_documents = [
    [word for word in document if frequency[word] > 5]
    for document in tokenized_documents
]
processed_documents = [" ".join(document) for document in tokenized_documents]

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

def flatten(L):
    """Concatenate the items of every sub-iterable of ``L`` into one list (one level deep)."""
    flat = []
    for sublist in L:
        flat.extend(sublist)
    return flat
    
def docsToIDMatrix(tokenized_documents, window = 3):
    """Replace every token of a corpus by an integer id.

    Parameters
    ---------
    tokenized_documents: list of documents, each a list of tokens
        (tokens must be hashable and mutually orderable, e.g. strings)
    window: not used; accepted only so existing calls keep working

    Returns
    -------
    docs_as_ids: the documents with each token replaced by its id
    id_2_token: dict mapping id -> token
    token_2_id: dict mapping token -> id
    """
    # Sorting the vocabulary makes the id assignment reproducible; iterating a
    # bare set gives an order that is an implementation detail of the interpreter.
    unique_words = sorted(set(token for doc in tokenized_documents for token in doc))

    #create token ids
    token_2_id = {token: idx for idx, token in enumerate(unique_words)}
    id_2_token = dict(enumerate(unique_words))

    docs_as_ids = [[token_2_id[token] for token in doc] for doc in tokenized_documents]

    return docs_as_ids, id_2_token, token_2_id


# Encode the filtered corpus as integer ids and keep both lookup tables.
docs_as_ids, id_2_token, token_2_id = docsToIDMatrix(tokenized_documents)

In [None]:
from gensim.models import Word2Vec
# Train a gensim Word2Vec model on the tokenized corpus with the library's default settings.
model = Word2Vec(tokenized_documents)
# Show the words the model ranks as most similar to 'president'.
model.most_similar('president')

In [None]:
def flatten(L):
    """Return one list holding the items of each sub-iterable of ``L``, in order.

    NOTE(review): a function with this name and behaviour is already defined in
    an earlier cell; this definition simply rebinds the name.
    """
    items = []
    for sublist in L:
        for item in sublist:
            items.append(item)
    return items


# Corpus statistics: document count, every token occurrence, and the distinct tokens.
n_docs = len(processed_documents)
all_words = flatten(tokenized_documents)
unique_words = list(set(all_words))
n_words = len(unique_words)

# Lookup tables in both directions; a token's id is its position in unique_words.
token_2_id = dict((word, position) for position, word in enumerate(unique_words))
id_2_token = dict(enumerate(unique_words))

# The corpus re-expressed as lists of ids.
docs_as_ids = [[token_2_id[token] for token in doc] for doc in tokenized_documents]

### Questions for Learning Word Embedding Models:

1) What is a suitable "context"?

* Larger contexts $\rightarrow$ higher-level relationships (thematically related words)
    * E.g., documents, paragraphs
* Smaller contexts $\rightarrow$ lower-level relationships (synonyms)
    * E.g., neighboring words, syntactic dependencies
* Should we employ weights?
    * information-theoretic: tfidf, local/global
    * proximity-based

2) Unsupervised or supervised?

### Code Example: term-document LSA with local/global weighting

* Input: Term-Document Matrix
* Frequency Scaling: Log
* Document Frequency Scaling: Entropy
* Criterion for embedding size: pick the smallest number of dimensions K that explains at least 70% of the overall variance

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
def build_document_count_matrix(documents):
    """
    Count how often each vocabulary term occurs in each document.

    Parameters
    ---------
    documents: list of strings

    Returns
    -------
    (vectorizer, count_matrix, feature_names): the fitted CountVectorizer
    (built with min_df=5), the dense form of its fit_transform output for
    ``documents``, and the feature names reported by the vectorizer.
    """
    vectorizer = CountVectorizer(min_df=5)
    sparse_counts = vectorizer.fit_transform(documents)
    dense_counts = sparse_counts.todense()
    feature_names = vectorizer.get_feature_names()

    return vectorizer, dense_counts, feature_names


def get_scaled_weights(count_matrix):
    """
    Apply log-entropy weighting to a document-term count matrix.

    Every count is scaled locally by log(tf + 1) and multiplied by the global
    entropy weight of its term,

        g_t = 1 + sum_d p_dt * log(p_dt) / log(n_docs),    p_dt = tf_dt / gf_t

    where gf_t is the total count of term t over all documents. g_t is 1 for a
    term that occurs in a single document and 0 for a term spread evenly over
    all documents.

    The earlier version summed (tf + 1) * log(tf + 1) over raw counts,
    normalised by the log of the number of terms and divided by the result,
    which is not an entropy.

    Parameters
    ---------
    count_matrix: numpy matrix of dim (documents, terms)

    Returns
    -------
    Weights of dim (documents, terms); a numpy matrix when a numpy matrix was
    passed in, otherwise an ndarray.
    """
    counts = np.asarray(count_matrix, dtype=float)
    n_docs, n_terms = counts.shape

    #sublinear scaled count of term
    local_weights = np.log(counts + 1)

    #distribution of each term over the documents
    term_totals = counts.sum(axis=0)
    # A term that never occurs would give 0/0; dividing by 1 keeps its column at 0.
    safe_totals = np.where(term_totals > 0, term_totals, 1.0)
    probabilities = counts / safe_totals

    # p * log(p), with the usual convention 0 * log(0) = 0.
    plogp = np.zeros_like(probabilities)
    positive = probabilities > 0
    plogp[positive] = probabilities[positive] * np.log(probabilities[positive])

    #entropy of term
    if n_docs > 1:
        global_weights = 1 + plogp.sum(axis=0) / np.log(n_docs)
    else:
        # log(1) == 0: with one document there is nothing to spread over.
        global_weights = np.ones(n_terms)

    weights = local_weights * global_weights
    if isinstance(count_matrix, np.matrix):
        weights = np.asmatrix(weights)
    return weights




# Document-term counts and the vocabulary the columns refer to.
vectorizer, count_matrix, vocabulary = build_document_count_matrix(processed_documents)
N = len(vocabulary)

weights = get_scaled_weights(count_matrix)

# Factorise with as many components as TruncatedSVD is asked for here (N-1);
# the rows of components_.T are the per-term embeddings.
svd = TruncatedSVD(n_components=N-1)
svd.fit(weights)
embeddings = svd.components_.T

info_threshold = .7
weight_info = np.cumsum(svd.explained_variance_ratio_)
# searchsorted returns the first position whose cumulative ratio is >= the
# threshold; that position must itself be kept, hence the +1. The earlier
# selection (largest index still below the threshold, used as a slice end)
# kept too few components to reach the threshold and raised on an empty array
# when the first component alone already reached it.
first_sufficient = int(np.searchsorted(weight_info, info_threshold))
required_dims = min(first_sufficient + 1, len(weight_info))
k_embeddings = embeddings[:, :required_dims]

print("Original Vocab: {}".format(N))
print("LSA embeddings dimension: {}".format(k_embeddings.shape[1]))

### Determine most similar words with cosine similarity 

In [None]:
#compute vector similarities
from sklearn.metrics.pairwise import cosine_similarity
    
def most_similar(embedding_space, vocabulary, word, n = 10):
    """Yield the ``n`` vocabulary entries most cosine-similar to ``word``.

    Parameters
    ---------
    embedding_space: array with one row per vocabulary entry
        (assumed to be a 2-D ndarray of dim (words, dimensions) -- TODO confirm)
    vocabulary: list of words; position i names row i of ``embedding_space``
    word: query word; it is lower-cased before the lookup
    n: number of entries to yield

    Yields
    ------
    Vocabulary entries in order of decreasing similarity. The query word itself
    is among the candidates.

    Raises
    ------
    AssertionError if the lower-cased word is not in ``vocabulary``. Because
    this is a generator, the check runs when iteration starts, not at call time.
    """
    # Lower-case once and use the same key for the membership test and the
    # lookup; checking word.lower() but indexing with the raw word raised
    # ValueError for capitalised queries that had passed the assertion.
    word = word.lower()
    assert word in vocabulary, "Word needs to be in vocabulary"

    word_idx = vocabulary.index(word)
    word_vector = np.expand_dims(embedding_space[word_idx], axis=0)

    similarities = cosine_similarity(X = embedding_space, Y = word_vector).reshape(embedding_space.shape[0])
    ranked_idx = np.argsort(similarities)[::-1][:n]

    for idx in ranked_idx:
        yield vocabulary[idx]
        
# Print the nearest neighbours of 'bank' in the truncated LSA space.
for neighbour in most_similar(k_embeddings, vocabulary, 'bank'):
    print(neighbour)

### Neural Language Models

In [None]:
import logging

def softmax(x):
    """Softmax of ``x`` along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result unchanged mathematically but keeps exp() from overflowing to
    inf (and the quotient from becoming NaN) for large inputs.
    """
    x = np.asanyarray(x, dtype=float)
    if x.size == 0:
        # Nothing to normalise; max() would raise on an empty array.
        return x
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)

class WordEmbeddingModel(object):

    """CBOW-style embedding model: a word is predicted from the mean of its context embeddings.

    WARNING: INEFFICIENT! Just for demonstration purposes.

    The model keeps a (vocab, embedding) projection matrix, whose rows are the
    word embeddings, and an (embedding, vocab) prediction matrix. Training is
    plain per-example gradient descent on the softmax cross-entropy.
    """

    def __init__(self, tokenized_documents, context_window = 3, embedding_size = 100, n_epochs = 5, learning_rate = 0.1, verbose_mode = 'info', seed = None):
        """
        Parameters
        ---------
        tokenized_documents: list of documents, each a list of tokens
        context_window: number of tokens taken on each side of the target word
        embedding_size: dimensionality of the learned embeddings
        n_epochs: number of passes over the corpus made by train()
        learning_rate: step size of each gradient update
        verbose_mode: 'debug' logs every document; anything else logs per epoch
        seed: optional seed for the weight initialisation; None uses numpy's
            global random state
        """
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.verbose_mode = verbose_mode
        self.logger = self._build_logger()
        self.window = context_window

        #document processing
        self.docs_as_ids, self.id_2_token, self.token_2_id = docsToIDMatrix(tokenized_documents)
        flattened_docs = flatten(self.docs_as_ids)
        self.vocab_size = len(set(flattened_docs))
        self.embedding_size = embedding_size
        self.max_seq_length = 2 * self.window
        self.n_obs = float(len(flattened_docs))

        #initialize data structures for weights
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.projection_matrix = rng.randn(self.vocab_size, self.embedding_size)
        self.prediction_matrix = rng.randn(self.embedding_size, self.vocab_size)
        self.input_matrix = np.eye(self.vocab_size) #convert ids to one hot encodings

    def _build_logger(self):
        """Create a logger that writes timestamped messages to stdout.

        Sets ``self.verbose`` to logging.DEBUG when ``verbose_mode`` is 'debug'
        and to logging.INFO otherwise.
        """
        # Imported here because the cell defining this class imports only `logging`;
        # without it the StreamHandler line raised NameError.
        import sys

        self.verbose = logging.DEBUG if self.verbose_mode == 'debug' else logging.INFO
        logger = logging.Logger('w2v-log')
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(self.verbose)
        formatter = logging.Formatter('%(asctime)s - %(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        return logger


    def update_step(self, word, context):
        """Run one gradient step for a (target word, context words) pair.

        Parameters
        ---------
        word: id of the target word
        context: ids of the context words; an empty context is ignored
        """
        context = list(context)
        if not context:
            # The mean of zero vectors is NaN and would poison every weight.
            return

        #feed forward
        projection = np.mean(self.projection_matrix[context], axis=0) # (embedding,)
        prediction = softmax(np.dot(projection, self.prediction_matrix)) # (vocab,)
        target_vector = self.input_matrix[word]

        #error at the output layer
        output_delta = prediction - target_vector # (vocab,)

        # Back-propagate through the prediction weights BEFORE they are changed,
        # so both gradients come from the same forward pass.
        hidden_delta = np.dot(self.prediction_matrix, output_delta) # (embedding,)

        #adjust prediction weight matrix
        prediction_delta = np.outer(projection, output_delta) # (embedding, vocab)
        self.prediction_matrix = self.prediction_matrix - (self.learning_rate * prediction_delta)

        #adjust the embeddings of the context words; each contributed 1/len(context) to the mean
        step = (self.learning_rate * hidden_delta) / float(len(context))
        for k in context:
            self.projection_matrix[k] = self.projection_matrix[k] - step


    def train(self):
        """Make ``n_epochs`` passes over the corpus, one update per token that has a context."""
        for epoch in range(self.n_epochs):
            self.logger.info("Epoch %s" % epoch)

            for obs_i, sentence in enumerate(self.docs_as_ids):
                self.logger.debug("Epoch %s Obs %s" % (epoch, obs_i))

                text_length = len(sentence)
                for token_idx in range(text_length):
                    word = sentence[token_idx] #word as absolute id
                    # Up to `window` tokens on each side, without the target itself.
                    # The earlier range included the target position and stopped
                    # one token short on the right.
                    left = max(token_idx - self.window, 0)
                    right = min(token_idx + self.window + 1, text_length)
                    context = list(sentence[left:token_idx]) + list(sentence[token_idx + 1:right]) #context as absolute ids
                    if context:
                        self.update_step(word, context)


    def most_similar(self, word,n=10):
        """Return the ``n`` words whose embeddings are most cosine-similar to ``word``.

        The query is lower-cased before the lookup and is itself among the
        candidates. Raises AssertionError when the word is not in the vocabulary.
        Relies on ``cosine_similarity`` being imported in an earlier cell.
        """
        key = word.lower()
        assert key in self.token_2_id, "%s not found in vocabulary" % word

        idx = self.token_2_id[key]
        # cosine_similarity expects 2-D inputs, so the query becomes a single row.
        vec = self.projection_matrix[idx].reshape(1, -1)
        sims = cosine_similarity(X=self.projection_matrix, Y=vec).reshape(-1)

        most_sims = np.argsort(sims)[::-1][:n]
        words = [self.id_2_token[i] for i in most_sims]
        return words
        
# Build the demonstration model on the filtered corpus with its default
# hyper-parameters, train it, and look up the nearest neighbours of one word.
m = WordEmbeddingModel(tokenized_documents, verbose_mode='info')
m.train()
m.most_similar(u'curious')


### LSA with word context

In [None]:
###lsa with word context
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

#create empty matrix for term context
n_words = len(vocabulary)
token_2_id = {j:i for i, j in enumerate(vocabulary)}
id_2_token = {i:j for i, j in enumerate(vocabulary)}
term_frequency = np.zeros((n_words, n_words))

#set window size
window = 5
for text in tokenized_documents:
    # Dict membership is O(1); testing against the vocabulary list was O(V) per token.
    text = [token for token in text if token in token_2_id]
    text_length = len(text)
    for token_idx in range(text_length):
        row_index = token_2_id[text[token_idx]]
        # Up to `window` tokens on each side of the target. The earlier range
        # stopped one token short on the right and counted the target as its
        # own context.
        start = max(token_idx - window, 0)
        stop = min(token_idx + window + 1, text_length)
        for idx in range(start, stop):
            if idx == token_idx:
                continue
            column_index = token_2_id[text[idx]]
            term_frequency[row_index, column_index] += 1

#tfidf weighting
# Number of distinct words each context word co-occurs with.
context_frequency = (term_frequency > 0).sum(axis=0).astype(float)
# Smoothed inverse frequency: always positive, so the weighting can neither flip
# signs nor divide by (almost) zero as the earlier log(df + 1/N) divisor could.
inverse_context_frequency = np.log((1.0 + n_words) / (1.0 + context_frequency)) + 1.0
term_context_matrix = term_frequency * inverse_context_frequency


#build low dimensional representation
svd = TruncatedSVD(n_components=20)
# Was fitted on `doc_context_matrix`, a name that is never defined.
svd.fit(term_context_matrix)
embeddings = svd.components_.T


sim_matrix = cosine_similarity(embeddings)
# Rows of `embeddings` follow `vocabulary` (via token_2_id), not `unique_words`,
# so both the lookup and the printing go through the vocabulary ordering.
word_idx = token_2_id['bank']
row = sim_matrix[word_idx]
# Named so that it does not rebind the most_similar() function defined earlier.
most_similar_idx = np.argsort(row)[::-1]

for idx in most_similar_idx[0:10]:
    print(vocabulary[idx])