In [1]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 300
np.set_printoptions(precision=3, suppress=True)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler

In [2]:
from sklearn.datasets import load_files

# Load the labelled review files found under the aclImdb training directory.
reviews_train = load_files("../data/aclImdb/train/")
# load_files returns a bunch, containing training texts and training labels
text_train, y_train = reviews_train.data, reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))
# Show one raw review; this runs before the cleanup below, so the printed
# text is still unmodified.
print("text_train[1]:\n{}".format(text_train[1]))
# The reviews are bytes objects (hence the bytes patterns): replace every
# HTML line break with a single space.
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

type of text_train: <class 'list'>
length of text_train: 25000
text_train[1]:
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decisio

In [114]:
from sklearn.feature_extraction.text import CountVectorizer

# Hold out a validation split, stratified on the labels; random_state makes
# the split repeatable.
text_train_sub, text_val, y_train_sub, y_val = train_test_split(
    text_train, y_train, stratify=y_train, random_state=0)
# Bag-of-words counts, dropping English stop words and tokens that appear in
# fewer than two documents.
vect = CountVectorizer(min_df=2, stop_words="english")
# Learn the vocabulary on the training subset only, then reuse it unchanged
# for the validation texts.
X_train = vect.fit_transform(text_train_sub)
X_val = vect.transform(text_val)

In [115]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression on the bag-of-words counts.
lr = LogisticRegression(C=.1).fit(X_train, y_train_sub)

In [116]:
# Accuracy of the bag-of-words baseline on the held-out validation split.
lr.score(X_val, y_val)

0.88095999999999997

# Gensim

In [6]:
# Two toy documents used to demonstrate gensim's corpus format.
docs = ["What is my purpose", "You bring butter"]
# Lower-case each document and split it on whitespace into a token list.
texts = [doc.lower().split() for doc in docs]
print(texts)

[['what', 'is', 'my', 'purpose'], ['you', 'bring', 'butter']]


In [7]:
from gensim import corpora
# Build gensim's token <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)
print(dictionary)

Dictionary(7 unique tokens: ['you', 'bring', 'butter', 'purpose', 'what']...)


In [8]:
# Encode a new document as (token_id, count) pairs using the existing
# dictionary.
new_doc = "what butter"
dictionary.doc2bow(new_doc.lower().split())

[(1, 1), (6, 1)]

In [9]:
# gensim corpus: one list of (token_id, count) pairs per document.
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1)]]

In [10]:
# Top-level gensim import; the gensim.* calls in the following cells rely on it.
import gensim
# Display the corpus again (unchanged from the previous cell).
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1)]]

In [11]:
# Convert the gensim corpus to a scipy sparse matrix. Note the layout is
# terms x documents (7x2 here), the transpose of scikit-learn's convention.
gensim.matutils.corpus2csc(corpus)

<7x2 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Column format>

In [12]:
# The scikit-learn counterpart: a documents x terms count matrix (2x7 here).
X = CountVectorizer().fit_transform(docs)
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [13]:
# Wrap the scikit-learn matrix as a gensim corpus. The matrix is transposed
# first so that documents end up in columns, giving one entry per document.
sparse_corpus = gensim.matutils.Sparse2Corpus(X.T)
# Printing the wrapper only shows the object; list() materializes the
# per-document (token_id, count) pairs.
print(sparse_corpus)
print(list(sparse_corpus))

<gensim.matutils.Sparse2Corpus object at 0x7fcfb38ae940>
[[(4, 1), (3, 1), (2, 1), (5, 1)], [(1, 1), (0, 1), (6, 1)]]


# Corpus transformations with gensim

In [14]:
# Fit a TF-IDF model on the bag-of-words corpus; indexing the model with a
# single document returns that document's (token_id, weight) pairs.
tfidf = gensim.models.TfidfModel(corpus)
tfidf[corpus[0]]

[(0, 0.5), (1, 0.5), (2, 0.5), (3, 0.5)]

In [15]:
# Indexing with the whole corpus returns a TransformedCorpus wrapper rather
# than the values themselves; list() iterates it to produce the weights.
print(tfidf[corpus])
print(list(tfidf[corpus]))

<gensim.interfaces.TransformedCorpus object at 0x7fcfb38aee48>
[[(0, 0.5), (1, 0.5), (2, 0.5), (3, 0.5)], [(4, 0.5773502691896258), (5, 0.5773502691896258), (6, 0.5773502691896258)]]


# Word2Vec with gensim

In [17]:
from gensim import models
# Load the pre-trained GoogleNews word2vec vectors from the binary-format file.
w = models.KeyedVectors.load_word2vec_format(
    '../GoogleNews-vectors-negative300.bin', binary=True)

In [18]:
# Looking up a single word returns its 300-dimensional vector.
w['queen'].shape

(300,)

In [23]:
# Shape of the full embedding matrix: one 300-d row per vocabulary entry.
w.syn0.shape

(3000000, 300)

In [65]:
# Restrict the vectorizer to the word2vec vocabulary, so that every token it
# extracts has a pre-trained vector.
vect_w2v = CountVectorizer(vocabulary=w.index2word)
vect_w2v.fit(text_train_sub)
# inverse_transform gives, for each document, the array of vocabulary words
# that occur in it.
# NOTE: this rebinds `docs`, which earlier held the two toy strings.
docs = vect_w2v.inverse_transform(vect_w2v.transform(text_train_sub))
docs[0]

array(['in', 'for', 'that', 'is', 'the', 'at', 'not', 'as', 'it', 'by',
       'are', 'have', 'an', 'this', 'they', 'but', 'one', 'which', 'do',
       'than', 'over', 'just', 'some', 'like', 'only', 'did', 'because',
       'off', 'being', 'my', 'very', 'much', 'go', 'under', 'does', 'got',
       'top', 'come', 'really', 'lot', 'find', 'thing', 'once', 'offer',
       'feel', 'film', 'medical', 'terms', 'rather', 'certain', 'felt',
       'consider', 'watch', 'parts', 'heavy', 'towards', 'enjoy',
       'feeling', 'maybe', 'piece', 'fear', 'myself', 'stuff', 'handed',
       'brings', 'movies', 'hospitals', 'rare', 'lots', 'skin', 'eating',
       'intense', 'somewhat', 'liked', 'afraid', 'tribute', 'horror',
       'revenge', 'brave', 'whilst', 'sympathy', 'assaulted', 'satisfying',
       'hatred', 'viewer', 'awhile', 'pardon', 'delightful', 'disgust',
       'imitation', 'pudding', 'cringe', 'jaded', 'pun', 'pangs', 'doesn',
       'junctures', 'hellraiser', 'appologize'], 
      

In [54]:
# Represent each document by the mean of its words' vectors and stack the
# results, one row per document.
# NOTE: this rebinds X_train, which previously held the bag-of-words matrix.
X_train = np.vstack([np.mean(w[doc], axis=0) for doc in docs])

In [55]:
# Sanity check: one 300-d averaged vector per training document.
X_train.shape

(18750, 300)

In [56]:
# Build the same averaged-embedding features for the validation texts.
# NOTE: this rebinds X_val, which previously held the bag-of-words matrix.
docs_val = vect_w2v.inverse_transform(vect_w2v.transform(text_val))
X_val = np.vstack([np.mean(w[doc], axis=0) for doc in docs_val])

In [63]:
# Logistic regression on the averaged word2vec features, followed by its
# accuracy on the training data it was fitted on.
lr_w2v = LogisticRegression(C=100).fit(X_train, y_train_sub)
lr_w2v.score(X_train, y_train_sub)

0.86762666666666666

In [64]:
# Validation accuracy of the averaged-word2vec logistic regression.
# (The model is named `lr_w2v`; the previous spelling `lr_w2f` is undefined
# on a fresh kernel and raised a NameError.)
lr_w2v.score(X_val, y_val)

0.85711999999999999

In [69]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 500 trees on the same averaged-embedding features,
# followed by its accuracy on the training data.
rf = RandomForestClassifier(n_estimators=500).fit(X_train, y_train_sub)
rf.score(X_train, y_train_sub)

1.0

In [70]:
# Validation accuracy of the random forest, for comparison with its training
# accuracy above.
rf.score(X_val, y_val)

0.81599999999999995

# Semantic Arithmetic

In [19]:
# Analogy query "king - man + woman": the three most similar words.
w.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399)]

In [20]:
# Analogy query "he - man + woman": the three most similar words.
w.most_similar(positive=['woman', 'he'], negative=['man'], topn=3)

[('she', 0.8492251634597778),
 ('She', 0.6329933404922485),
 ('her', 0.6029669046401978)]

In [21]:
# Analogy query "pizza - Italy + Germany": the three most similar words.
w.most_similar(positive=['Germany', 'pizza'], negative=['Italy'], topn=3)

[('bratwurst', 0.5436394810676575),
 ('Domino_pizza', 0.5133179426193237),
 ('donuts', 0.5121968984603882)]

In [137]:
# Every distinct word that occurs in at least one training document.
words_in_data = list({word for doc in docs for word in doc})

In [139]:
# Look up the word2vec vector of each of those words and stack them into a
# single matrix.
words_train = np.vstack(w[words_in_data])

In [140]:
# One 300-d row per distinct word found in the training documents.
words_train.shape

(45478, 300)

In [None]:
from sklearn.manifold import TSNE
# Embed only the vectors of words that occur in the training documents.
# The previous version passed w.syn0, i.e. all 3,000,000 pre-trained vectors,
# which is impractical for t-SNE and ignored the subset prepared above.
# The input is rebuilt from `words_in_data` (rather than read from
# `words_train`) so that re-running this cell does not feed the 2-D output
# back into t-SNE; random_state makes the layout repeatable.
words_train = TSNE(random_state=0).fit_transform(np.vstack(w[words_in_data]))

# Doc2Vec with gensim
Also see https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb

In [71]:
def read_corpus(text, tokens_only=False):
    """Lazily tokenize an iterable of documents for Doc2Vec.

    Each element of ``text`` is passed through
    ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    text : iterable
        The raw documents, consumed in order.
    tokens_only : bool, default False
        If True, yield the bare token list of each document. Otherwise
        yield a ``TaggedDocument`` whose single tag is the document's
        position in ``text``.

    Yields
    ------
    list or TaggedDocument
        One item per input document, in input order.
    """
    for doc_index, raw_doc in enumerate(text):
        tokens = gensim.utils.simple_preprocess(raw_doc)
        if not tokens_only:
            # Training documents carry their position as a tag.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_index])
        else:
            yield tokens


In [81]:
# Tagged documents for the training subset.
train_corpus = list(read_corpus(text_train_sub))
# Plain token lists for the validation texts (stored under the name
# `test_corpus`).
test_corpus = list(read_corpus(text_val, tokens_only=True))

[TaggedDocument(words=['maybe', 'it', 'just', 'because', 'have', 'an', 'intense', 'fear', 'of', 'hospitals', 'and', 'medical', 'stuff', 'but', 'this', 'one', 'got', 'under', 'my', 'skin', 'pardon', 'the', 'pun', 'this', 'piece', 'is', 'brave', 'not', 'afraid', 'to', 'go', 'over', 'the', 'top', 'and', 'as', 'satisfying', 'as', 'they', 'come', 'in', 'terms', 'of', 'revenge', 'movies', 'not', 'only', 'did', 'find', 'myself', 'feeling', 'lots', 'of', 'hatred', 'for', 'the', 'screwer', 'and', 'lots', 'of', 'sympathy', 'towards', 'the', 'screwee', 'felt', 'myself', 'cringe', 'and', 'feel', 'pangs', 'of', 'disgust', 'at', 'certain', 'junctures', 'which', 'is', 'really', 'rare', 'and', 'delightful', 'thing', 'for', 'somewhat', 'jaded', 'horror', 'viewer', 'like', 'myself', 'some', 'parts', 'are', 'very', 'reminiscant', 'of', 'hellraiser', 'but', 'come', 'off', 'as', 'tribute', 'rather', 'than', 'imitation', 'it', 'heavy', 'handed', 'piece', 'that', 'does', 'not', 'offer', 'the', 'viewer', 'muc

In [83]:
# Configure a Doc2Vec model (50-dimensional vectors, 55 training iterations,
# min_count=2), then build its vocabulary from the tagged training documents.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)
model.build_vocab(train_corpus)

In [None]:
# Train on the tagged documents. gensim releases that take `total_examples`
# in train() also require an explicit `epochs` and raise a ValueError without
# it, so pass the iteration count the model was constructed with (iter=55).
model.train(train_corpus, total_examples=model.corpus_count,
            epochs=model.iter)

In [None]:
# Infer a vector for every training document from its token list, in corpus
# order.
vectors = [model.infer_vector(tagged_doc.words) for tagged_doc in train_corpus]

In [None]:
# Stack the inferred document vectors into one 2-D array, one row per document.
# NOTE: this rebinds X, which earlier held the toy CountVectorizer matrix.
X = np.vstack(vectors)