In [None]:
import sys
# Extra directory appended to the module search path before the later imports.
# NOTE(review): this is an absolute, machine-specific location -- confirm it
# exists in the environment where the notebook is run.
path = '/home/jupyter/site-packages/'
sys.path.append(path)
# !pip install theano
# !pip install keras
# !pip install nltk
# !pip install plotly
# import os
# os.environ['KERAS_BACKEND'] = 'theano'

In [None]:
import keras
import warnings
warnings.filterwarnings('ignore')


from IPython.display import SVG, HTML

import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline

# `os` is not imported by any active line above (the only `import os` in the
# notebook is commented out), so import it here; otherwise the credential
# lookup below raises NameError on a fresh kernel.
import os

# Plotly credentials are read from environment variables, not hard-coded.
py.sign_in(os.environ['SECRET_ENV_AARON_PLOTLY_USERNAME'], os.environ['SECRET_ENV_AARON_PLOT_API_KEY'])
# Set up plotly's offline mode for rendering inside the notebook.
offline.init_notebook_mode()

In [None]:
import spacy 
# Load the English spaCy pipeline once; later cells reuse `nlp`.
nlp = spacy.load('en')

# Parse a short sentence and peek at the vector attached to one token.
text = u'Word vectors are fantastic!'
doc = nlp(text)
token = doc[1]  # second token of the parsed sentence
# Show only the first 25 components of the token's vector.
print(token.vector[:25])

In [None]:
import numpy as np

# Check that the document vector matches the element-wise mean of the
# vectors of the tokens in `doc`.
average_of_token_vectors = np.mean([token.vector for token in doc], axis=0)
document_vector = doc.vector
# Compare with a floating-point tolerance: two means computed by different
# code paths can differ in the last bits, so exact `== 0` is too strict.
assert np.allclose(average_of_token_vectors, document_vector)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


# Look up one vector per word.
words = [u'cat', u'dog', u'man', u'woman']
vectors = [nlp(word).vector for word in words]

# Pairwise cosine similarities, labelled with the words on both axes.
similarities = cosine_similarity(vectors)
similarity_matrix = pd.DataFrame(similarities, index=words, columns=words)

# Heatmap of the (transposed) similarity table.
data = [
    go.Heatmap(
        z=similarity_matrix.T.values.tolist(),
        colorscale='OrRd',
        x=words,
        y=words,
    )
]
layout = go.Layout(title='Similarity of Word Vectors')
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [None]:
# Vectors for the four words taking part in the analogy.
words = [u'man', u'men', u'woman', u'women']
vectors = [nlp(word).vector for word in words]
man, men, woman, women = vectors

# Vector arithmetic: apply the man -> men offset to "woman".
attempted_women = men - man + woman

# Alternative kept from the original notebook (disabled): average the plural
# offset over two word pairs before applying it.
#plural_men = nlp(u'men').vector - nlp(u'man').vector
#plural_dogs = nlp(u'dogs').vector - nlp(u'dog').vector
#plural = (plural_men + plural_dogs) / 2
#vectors.append(plural + vectors[words.index('woman')])

# Add the synthetic vector as a fifth row/column of the comparison.
vectors.append(attempted_women)
words.append('men - man + woman')

similarities = cosine_similarity(np.array(vectors))
similarity_matrix = pd.DataFrame(similarities, index=words, columns=words)

data = [
    go.Heatmap(
        z=similarity_matrix.T.values.tolist(),
        colorscale='OrRd',
        x=words,
        y=words,
    )
]
layout = go.Layout(title='Men - Man + Woman ≈ Women')
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [None]:
import nltk
from sklearn.cross_validation import train_test_split
from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

def nltk_corpus(corpus_name):
    """Return the corpus object ``nltk.corpus.<corpus_name>``, downloading it if missing.

    Parameters
    ----------
    corpus_name : str
        Attribute name of the corpus on ``nltk.corpus``; the same name is
        passed to ``nltk.download`` when the data is not available locally.

    Returns
    -------
    The object found at ``nltk.corpus.<corpus_name>``.

    Raises
    ------
    AttributeError
        If ``nltk.corpus`` has no attribute called ``corpus_name``.
    """
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except LookupError:
        # NLTK signals "corpus data not installed" with LookupError. Only that
        # case should trigger a download; any other failure (including
        # KeyboardInterrupt) must propagate instead of being swallowed.
        nltk.download(corpus_name)
    return corpus

def corpus_to_x_y(corpus):
    """Turn a categorised corpus into parallel tuples of texts and labels.

    For every file id reported by ``corpus.fileids()`` the file's words are
    joined with single spaces to form the text, and the first entry of
    ``corpus.categories(fileid)`` is used as its label.

    Returns
    -------
    (x, y) : tuple of tuples
        ``x`` holds the joined texts and ``y`` the labels, in file-id order.
    """
    pairs = [
        (" ".join(corpus.words(fileid)), corpus.categories(fileid)[0])
        for fileid in corpus.fileids()
    ]
    # Transpose the (text, label) pairs into one tuple of texts and one of labels.
    texts, labels = zip(*pairs)
    return texts, labels


#prepare data for classification
documents, categories = corpus_to_x_y(nltk_corpus('brown'))
# Seed the shuffle so the document order -- and every score derived from it
# further down -- is the same on each Restart & Run All.
documents, categories = shuffle(documents, categories, random_state=42)
# Encode the category names as integer class ids.
encoder = LabelEncoder()
y = encoder.fit_transform(categories)

#Category Breakdown
c = Counter(categories)
for i in c:
    # One "<category> <count>" line per category.
    print("%s %s" % (i, c[i]))

In [None]:
#classify using DTM
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import log_loss 
from sklearn.metrics import classification_report, f1_score, precision_score

# Result registries filled in by train_and_validate. Each one maps
# feature-set name -> {model.__module__ -> value}.
f1_scores = {}
models = {}
losses = {}


def train_and_validate(name, model_classes, X_train, X_test, y_train, y_test):
    """Fit each estimator and record its test log-loss, weighted F1 and fitted object.

    Parameters
    ----------
    name : hashable
        Label of the feature set; used as the first-level key in the
        module-level ``f1_scores``, ``models`` and ``losses`` dicts.
    model_classes : iterable
        Estimator *instances* providing ``fit``, ``predict`` and
        ``predict_proba``.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Data used for ``predict`` / ``predict_proba`` and for scoring.

    Returns
    -------
    None. Results are stored under ``[name][model.__module__]`` in the three
    registries, so two estimators defined in the same module overwrite each
    other's entries.
    """
    # Reuse the per-name sub-dicts if this name was seen before.
    f1_by_model = f1_scores.setdefault(name, {})
    fitted_by_model = models.setdefault(name, {})
    loss_by_model = losses.setdefault(name, {})

    for model in model_classes:
        model.fit(X_train, y_train)
        predicted_labels = model.predict(X_test)
        predicted_probs = model.predict_proba(X_test)

        key = model.__module__
        loss_by_model[key] = log_loss(y_test, predicted_probs)
        f1_by_model[key] = f1_score(y_test, predicted_labels, average='weighted')
        fitted_by_model[key] = model
        

In [None]:
#Create DTM


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

# Hold out a third of the documents for testing. The split is seeded so the
# same documents land in train/test on every run and the reported scores are
# reproducible.
doc_train, doc_test, y_train, y_test = train_test_split(documents, y, test_size = .33, random_state=42)

#create document term matrix with CountVectorizer
Vectorizer = CountVectorizer(stop_words='english')

#create training and testing DTM
# The vocabulary is fitted on the training documents only; the test documents
# are transformed with that same vocabulary. todense() turns the sparse
# result into a dense matrix.
X_train_dtm = Vectorizer.fit_transform(doc_train).todense()
X_test_dtm = Vectorizer.transform(doc_test).todense()

print("Shape of Document Term Matrix: {}".format(X_train_dtm.shape))

In [None]:
# Estimators to compare on the document-term matrix. train_and_validate calls
# predict_proba on each of them, hence probability=True for the SVC.
model_classes = [
    LogisticRegression(),
    GaussianNB(),
    SVC(kernel='linear', probability=True),
    RandomForestClassifier(n_estimators=100),
    DummyClassifier(),
]

train_and_validate('DTM', model_classes, X_train_dtm, X_test_dtm, y_train, y_test)

In [None]:
# Bar chart with one bar per model, showing the F1 score stored under 'DTM'.
model_names = f1_scores['DTM'].keys()

dtm_f1_trace = go.Bar(
    x=model_names,
    y=[f1_scores['DTM'][model] for model in model_names],
)
layout = go.Layout(
    barmode='group',
    title='F1 Scores for Using Document Term Matrices',
)
fig = go.Figure(data=[dtm_f1_trace], layout=layout)

py.iplot(fig, filename='make-subplots')

In [None]:
# Represent every document by the vector spaCy attaches to the parsed text.
# The parse and entity steps are switched off in the nlp() call.
X_train_doc_vec = np.array(
    [nlp(raw_doc, parse=False, entity=False).vector for raw_doc in doc_train]
)
X_test_doc_vec = np.array(
    [nlp(raw_doc, parse=False, entity=False).vector for raw_doc in doc_test]
)

print("Shape of Document Vector Matrix: {}".format(X_train_doc_vec.shape))

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Estimators to compare on the document vectors. train_and_validate calls
# predict_proba on each of them, hence probability=True for the SVC.
model_classes = [
    LogisticRegression(C=100.),
    BernoulliNB(),
    SVC(kernel='linear', probability=True, C=100.),
    RandomForestClassifier(n_estimators=100),
    DummyClassifier(),
]

train_and_validate('WordVec', model_classes, X_train_doc_vec, X_test_doc_vec, y_train, y_test)

In [None]:
# Scores are keyed by each estimator's defining module, so use those module
# names as the shared x-axis for both feature sets.
model_names = [estimator.__module__ for estimator in model_classes]

dtm_f1_trace = go.Bar(
    x=model_names,
    y=[f1_scores['DTM'][model] for model in model_names],
    name='Document Term Matrix',
)
vect_f1_trace = go.Bar(
    x=model_names,
    y=[f1_scores['WordVec'][model] for model in model_names],
    name='Word Vectors',
)

# Grouped bars: the two feature sets side by side for every model.
layout = go.Layout(
    barmode='group',
    title='F1 Scores for Using Document Term Matrices',
    yaxis=dict(title='F1 Score'),
)
fig = go.Figure(data=[dtm_f1_trace, vect_f1_trace], layout=layout)

py.iplot(fig, filename='make-subplots')

In [None]:
new_doc = u"President Obama enacted sanctions on Russia"

#convert to vectors for prediction
dtm_vector = Vectorizer.transform([new_doc])
doc_vector = nlp(new_doc).vector

#predict using logistic model
dtm_predictions = models['DTM']['sklearn.linear_model.logistic'].predict_proba(dtm_vector)
# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so the
# single document vector is passed as a one-row matrix rather than as a bare
# 1-D array.
docvec_predictions = models['WordVec']['sklearn.linear_model.logistic'].predict_proba(doc_vector.reshape(1, -1))
# ravel() flattens the single row of class probabilities without hard-coding
# the number of categories; the index is the encoder's class names.
docvec_predictions = pd.Series(docvec_predictions.ravel(), encoder.classes_)
dtm_predictions = pd.Series(dtm_predictions.ravel(), encoder.classes_)

#common order for models
# Sort categories by the word-vector model's probability and reuse that order
# for both traces so the bars line up.
order = docvec_predictions.sort_values().index.values


#plot
dtm_f1_trace = go.Bar(
                    y=dtm_predictions.loc[order],
                    x=order,
                    name = 'DTM Predicted'
)
vect_f1_trace = go.Bar(
                    y=docvec_predictions.loc[order],
                    x=order,
                    name = 'Word Vector Predicted'
)
layout = go.Layout(
    barmode='group', title='Predicted Classes of "%s"' % new_doc, yaxis=dict(title = 'P(category)')
)
fig = go.Figure(data=[dtm_f1_trace, vect_f1_trace], layout = layout)
py.iplot(fig, filename='make-subplots')

### Extra: Convolutional NN using Word Vectors

In [None]:
from keras.models import Model
from keras.layers import Embedding, LSTM, Dropout, Activation, Dense, Input, Conv1D, MaxPooling1D, Flatten

def docs_to_seqs(documents):
    """Convert raw documents into lists of per-token ``lemma`` values.

    The documents are streamed through ``nlp.pipe`` (called with
    ``parse=False`` and ``n_threads=8``); each parsed document becomes one
    list containing the ``lemma`` attribute of every token, in order.

    Returns
    -------
    list of list
        One inner list per input document.
    """
    parsed_docs = nlp.pipe(documents, parse=False, n_threads=8)
    return [[token.lemma for token in parsed] for parsed in parsed_docs]


def seq_to_ids_and_embeds(tokenized_sents, embedding_dim = 300):
    """Re-index token ids densely and build the matching embedding matrix.

    Parameters
    ----------
    tokenized_sents : list of list
        One list of token ids per document (as produced by ``docs_to_seqs``).
        Each id is looked up in ``nlp.vocab`` to obtain its vector.
    embedding_dim : int, optional
        Number of columns in the embedding matrix; it must match the length
        of the vectors returned by ``nlp.vocab[...]``.

    Returns
    -------
    keras_tokenized_sents : list of list of int
        ``tokenized_sents`` with every token id replaced by its row index in
        ``embedding_matrix``.
    embedding_matrix : numpy.ndarray, shape (n_words, embedding_dim)
        Row ``r`` holds the vector of the token mapped to index ``r``; a
        token whose vector is ``None`` keeps an all-zero row.
    max_len : int
        Length of the longest document.
    n_words : int
        Number of distinct token ids.

    Raises
    ------
    ValueError
        If ``tokenized_sents`` is empty (there is no longest document).
    """
    max_len = max(map(len, tokenized_sents))
    # Build the set of distinct ids once and derive the count from it.
    unique_words = list({lex_id for sent in tokenized_sents for lex_id in sent})
    n_words = len(unique_words)

    embedding_matrix = np.zeros((n_words, embedding_dim))
    lexid_2_embed_matrix = {}
    for row, lex_id in enumerate(unique_words):
        # Every distinct id gets a row, so the final lookup can never hit a
        # missing key even when a vector is unavailable.
        lexid_2_embed_matrix[lex_id] = row
        embedding_vector = nlp.vocab[lex_id].vector
        if embedding_vector is not None:
            embedding_matrix[row] = embedding_vector
        # else: words without a vector stay all-zeros.

    # An explicit list of lists (not a lazy ``map`` object), so callers can
    # slice and index the result on any Python version.
    keras_tokenized_sents = [[lexid_2_embed_matrix[lex_id] for lex_id in sent]
                             for sent in tokenized_sents]
    return keras_tokenized_sents, embedding_matrix, max_len, n_words


def conv_model_w_dropout(n_words, embedding_dim, embedding_matrix, max_len, n_classes=None):
    """Build and compile a 1-D convolutional classifier over frozen embeddings.

    Layer stack: Embedding (weights fixed to ``embedding_matrix``) ->
    Conv1D(128 filters, width 5, relu) -> MaxPooling1D(35) -> Flatten ->
    Dropout(0.5) -> Dense softmax.

    Parameters
    ----------
    n_words : int
        Vocabulary size (number of rows of ``embedding_matrix``).
    embedding_dim : int
        Width of each embedding vector.
    embedding_matrix : array-like
        Initial weights for the embedding layer; the layer is created with
        ``trainable=False``.
    max_len : int
        Length of the padded input sequences.
    n_classes : int, optional
        Number of units in the softmax output layer. When omitted, the value
        is taken from the module-level ``keras_train_y`` (its second
        dimension), which is what the function always did before this
        parameter existed.

    Returns
    -------
    The compiled Keras ``Model`` (categorical cross-entropy loss, rmsprop
    optimizer, accuracy metric).
    """
    if n_classes is None:
        # Backward-compatible fallback to the notebook-level training labels.
        n_classes = keras_train_y.shape[1]

    embedding_layer = Embedding(n_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)
    input_ = Input(shape=(max_len,), dtype='int32')
    embedded_sequences = embedding_layer(input_)

    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)
    x = Dropout(.5)(x)
    output_ = Dense(n_classes, activation='softmax')(x)
    model = Model(input_ ,output_)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model




from keras.preprocessing.sequence import pad_sequences
# Turn every document into a sequence of embedding-row indices.
seqs = docs_to_seqs(documents)
embedding_dim = 300
keras_X, embedding, max_len, n_words = seq_to_ids_and_embeds(seqs, embedding_dim=embedding_dim)
# One-hot encode the integer class ids.
keras_y = keras.utils.np_utils.to_categorical(y)

# The first 335 documents train the network; the remainder validate it.
n_train = 335
keras_train_y = keras_y[:n_train]
keras_test_y = keras_y[n_train:]
# Pad all sequences to the length of the longest document.
keras_train_x = pad_sequences(keras_X[:n_train], maxlen=max_len, dtype='int32')
keras_test_x = pad_sequences(keras_X[n_train:], maxlen=max_len, dtype='int32')

model = conv_model_w_dropout(n_words, embedding_dim, embedding, max_len)
model.fit(
    keras_train_x,
    keras_train_y,
    nb_epoch=5,
    validation_data=(keras_test_x, keras_test_y),
)
test_predictions = model.predict(keras_test_x)

In [None]:
# Collapse each probability / one-hot row to the index of its largest entry.
preds = [np.argmax(row) for row in test_predictions]
acts = [np.argmax(row) for row in keras_test_y]

In [None]:
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports support for the predicted labels
# instead of the actual ones, so the actual labels must come first.
print(classification_report(acts, preds))

In [None]:
# Same report on the training split, to compare against the held-out scores.
train_predictions = model.predict(keras_train_x)
preds = [np.argmax(row) for row in train_predictions]
acts = [np.argmax(row) for row in keras_train_y]
# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall, so the actual labels must come first.
print(classification_report(acts, preds))