In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
from gensim.models import word2vec

### Loading the training and test data sets

In [1]:
train_data = "r8-train-all-terms.txt"
test_data = "r8-test-all-terms.txt"

In [2]:
import pandas as pd

In [3]:
inp_train = pd.read_table(train_data, sep="\t", names=['label','text'])
inp_train.head()

FileNotFoundError: [Errno 2] No such file or directory: 'r8-train-all-terms.txt'

In [6]:
inp_train.shape

(5485, 2)

In [7]:
inp_test = pd.read_table(test_data, sep="\t", names=['label','text'])
inp_test.head()

Unnamed: 0,label,text
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,acq,sumitomo bank aims at quick recovery from merg...
4,earn,amatil proposes two for five bonus share issue...


In [18]:
inp_test.shape

(2189, 2)

#### What labels do we have in the data?

In [153]:
inp_train.label.value_counts(normalize=True) * 100

earn        51.777575
acq         29.097539
crude        4.612580
trade        4.576117
money-fx     3.755697
interest     3.463993
ship         1.969006
grain        0.747493
Name: label, dtype: float64

## Using NB methods

First, reconstructring the string text for our vectorizers to work

In [7]:
X_text = [" ".join(val) for val in X]
print(X_text[0] + "\n")
X_test_text = [" ".join(val) for val in X_test]
print(X_test_text[10])

champion products ch approves stock split champion products inc said its board of directors approved a two for one stock split of its common shares for shareholders of record as of april the company also said its board voted to recommend to shareholders at the annual meeting april an increase in the authorized capital stock from five mln to mln shares reuter

vieille montagne reports loss dividend nil year net loss after exceptional charges mln francs vs profit mln exceptional provisions for closure of viviez electrolysis plant mln francs vs exceptional gain mln sales and services billion francs vs billion proposed net dividend on ordinary shares nil vs francs company s full name is vieille montagne sa vmnb br reuter


#### Using count vectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
vect = CountVectorizer(stop_words='english',max_features=5000)
vect.fit(inp_train.text.values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Applying the vectorizer to our test and train sets

In [23]:
X_train_transformed = vect.transform(inp_train.text.values)
X_test_transformed = vect.transform(inp_test.text.values)

In [24]:
# printing the vocabulary
list(vect.vocabulary_.items())[:10]

[('champion', 759),
 ('products', 3488),
 ('approves', 264),
 ('stock', 4337),
 ('split', 4273),
 ('said', 3974),
 ('board', 532),
 ('directors', 1327),
 ('approved', 263),
 ('common', 895)]

### Using Bernoulli NB first

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_transformed,y)

# predict class
pred_train_ys = bnb.predict(X_train_transformed)
pred_test_ys = bnb.predict(X_test_transformed)

# accuracy
print("Train accuracy: ", accuracy_score(y, pred_train_ys))
print("Test accuracy: ", accuracy_score(y_test, pred_test_ys))

Train accuracy:  0.8736554238833182
Test accuracy:  0.8688899040657835


### Using Multinomial NB
 - We expect this to work very well, giving high performance in accuracy

In [27]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

In [28]:
#fit on training data
mnb.fit(X_train_transformed, y)

# predict class
pred_train_ys = mnb.predict(X_train_transformed)
pred_test_ys = mnb.predict(X_test_transformed)

# accuracy
print("Train accuracy: ", accuracy_score(y, pred_train_ys))
print("Test accuracy: ", accuracy_score(y_test, pred_test_ys))

Train accuracy:  0.968094804010939
Test accuracy:  0.9657377798081316


 - As expected, this performed really well
 - Remember that we used 5000 features!

## Using our word embeddings approach

 We have two options here:
 
1. Use pre-trained word vectors (Glove)

2. Train our own vectors

We'll explore both

### Loading the pre-trained GloVe word vectors

In [40]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.200d.txt'
word2vec_output_file = 'glove.6B.200d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 200)

In [41]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("glove.6B.200d.w2vformat.txt", binary=False)

### Getting data in a format conducive for our tasks

In [12]:
X_train = [doc.split() for doc in inp_train.text.values]
y_train = inp_train.label.values

In [19]:
X_test = [doc.split() for doc in inp_test.text.values]
y_test = inp_test.label.values

In [163]:
print(X_test[1])

['china', 'daily', 'says', 'vermin', 'eat', 'pct', 'grain', 'stocks', 'a', 'survey', 'of', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', 'pct', 'of', 'china', 's', 'grain', 'stocks', 'the', 'china', 'daily', 'said', 'it', 'also', 'said', 'that', 'each', 'year', 'mln', 'tonnes', 'or', 'pct', 'of', 'china', 's', 'fruit', 'output', 'are', 'left', 'to', 'rot', 'and', 'mln', 'tonnes', 'or', 'up', 'to', 'pct', 'of', 'its', 'vegetables', 'the', 'paper', 'blamed', 'the', 'waste', 'on', 'inadequate', 'storage', 'and', 'bad', 'preservation', 'methods', 'it', 'said', 'the', 'government', 'had', 'launched', 'a', 'national', 'programme', 'to', 'reduce', 'waste', 'calling', 'for', 'improved', 'technology', 'in', 'storage', 'and', 'preservation', 'and', 'greater', 'production', 'of', 'additives', 'the', 'paper', 'gave', 'no', 'further', 'details', 'reuter']


### Sentence vector by averaging word vectors

In [42]:
stop_words = stopwords.words("english")

In [43]:
def sent_vec(sent):
    wv_res = np.zeros(glove_model.vector_size)
    ctr = 1
    for w in sent:
        if w in glove_model:
            ctr += 1
            wv_res += glove_model[w]
    wv_res = wv_res/ctr
    #return (wv_res, ctr)
    return wv_res

In [44]:
train_doc_vecs = []
for doc in X:    
    doc_words = [term for term in doc if term not in stop_words]
    train_doc_vecs.append(sent_vec(doc_words))

In [45]:
test_doc_vecs = []
for doc in X_test:    
    doc_words = [term for term in doc if term not in stop_words]
    test_doc_vecs.append(sent_vec(doc_words))

### Building a predictive model on the averaged word vectors

#### Using a 'simple' logistic regression

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
logreg = LogisticRegression(penalty="l1", random_state=42, C = 3.5)
logreg.fit(train_doc_vecs,y)

LogisticRegression(C=3.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
pred_train_ys = logreg.predict(train_doc_vecs)
pred_test_ys = logreg.predict(test_doc_vecs)
print("Train accuracy: ", accuracy_score(pred_train_ys, y))
print("Test accuracy: ", accuracy_score(pred_test_ys, y_test))

Train accuracy:  0.9781221513217867
Test accuracy:  0.9680219278209228


### Training our own word vectors on the data
We'll create a combined text file to train our word vectors - more data is better. Although in this case we would still have just 7.7K instances to learn from.

In [49]:
X_comb = np.concatenate([X, X_test])
len(X_comb)

7674

In [50]:
print(X_comb[6000])

['fhlbb', 'changes', 'short', 'term', 'discount', 'note', 'rates', 'the', 'federal', 'home', 'loan', 'bank', 'board', 'adjusted', 'the', 'rates', 'on', 'its', 'short', 'term', 'discount', 'notes', 'as', 'follows', 'maturity', 'new', 'rate', 'old', 'rate', 'maturity', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'reuter']


In [76]:
%%time
w2v = word2vec.Word2Vec(X_comb, window=5, min_count=2, sg = 1, size=200)

Wall time: 9.13 s


In [77]:
w2v.wv.most_similar("future", topn=5)

[('structure', 0.7999764680862427),
 ('foreseeable', 0.7898048162460327),
 ('internal', 0.7653458118438721),
 ('effects', 0.7544820308685303),
 ('course', 0.7535136938095093)]

#### Sentence vectors by averaging vectors for words

In [78]:
def sent_vec_w2v(sent):
    wv_res = np.zeros(w2v.vector_size)
    ctr = 1
    for w in sent:
        if w in w2v.wv:
            ctr += 1
            wv_res += w2v.wv[w]
    wv_res = wv_res/ctr
    return wv_res

#### Getting the sentence vectors for the test and train sets

In [79]:
train_doc_vecs = []
for doc in X:    
    doc_words = [term for term in doc if term not in stop_words]
    train_doc_vecs.append(sent_vec_w2v(doc_words))

In [80]:
test_doc_vecs = []
for doc in X_test:    
    doc_words = [term for term in doc if term not in stop_words]
    test_doc_vecs.append(sent_vec_w2v(doc_words))

#### Logistic regression

In [81]:
from sklearn.linear_model import LogisticRegression

In [88]:
logreg = LogisticRegression(penalty="l1", random_state=42, C = 7)

In [89]:
logreg.fit(train_doc_vecs,y)

LogisticRegression(C=7, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [90]:
pred_train_ys = logreg.predict(train_doc_vecs)
pred_test_ys = logreg.predict(test_doc_vecs)
print("Train accuracy: ", accuracy_score(pred_train_ys, y))
print("Test accuracy: ", accuracy_score(pred_test_ys, y_test))

Train accuracy:  0.9735642661804923
Test accuracy:  0.9712197350388305


## Doc2vec 
- we combined individual terms to form the document/sentence vector
- what if we could learn the embedding for the sentence in the way we did for the words?

How it works - 
https://arxiv.org/pdf/1405.4053v2.pdf

In [91]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [92]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [94]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [111]:
vector = model.infer_vector(["trees"])

In [112]:
model.docvecs.most_similar([vector])

[(0, 0.7911380529403687),
 (7, 0.7751397490501404),
 (5, 0.6241987943649292),
 (3, 0.3657190799713135),
 (1, -0.057240258902311325),
 (6, -0.09942742437124252),
 (2, -0.33615532517433167),
 (8, -0.565274178981781),
 (4, -0.7416948676109314)]

In [102]:
model.docvecs.most_similar([1])

[(3, 0.6897048950195312),
 (4, 0.4533742666244507),
 (0, 0.36425623297691345),
 (6, 0.24099451303482056),
 (8, 0.22011038661003113),
 (5, 0.026929767802357674),
 (7, -0.2675256133079529),
 (2, -0.5373554229736328)]

In [116]:
model[0]

array([ 0.07705729, -0.02333286, -0.09770483,  0.0458269 , -0.02320459],
      dtype=float32)

### Doc2vec on our data

In [140]:
?Doc2Vec

In [139]:
%%time
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]
model = Doc2Vec(documents, vector_size = 100, window=5, min_count=5, workers=4,epochs=50)

Wall time: 23.2 s


In [149]:
" ".join(X_train[20])

'gulf barge freight rates up further on call gulf barge freight rates firmed again on the outlook for steady vessel loadings at the gulf increasing the demand for barges to supply those ships dealers said no barges traded today on the st louis merchants exchange call session versus yesterday quotes included delivery this week on the illinois river joliet pct of tariff bid offered with next week same river ex chicago quoted the same both up percentage points next week mississippi river st louis pct bid offered up five points next week ohio river owensboro south pct bid offered up points on station illinois river south chicago pct bid offered no comparison march illinois ex chicago pct bid offered up points march ohio river bid at yesterday s traded level of pct offered at march lower mississippi river memphis cairo pct bid offered no comparison may illinois river ex chicago pct bid offered no comparison sept nov lower mississippi river memphis cairo pct bid offered with sept dec same se

In [150]:
model.docvecs.most_similar(20)

[(5026, 0.8198413848876953),
 (4687, 0.5485634207725525),
 (1622, 0.5247182846069336),
 (3267, 0.5225585699081421),
 (4040, 0.515109121799469),
 (4041, 0.5137314796447754),
 (718, 0.5006198883056641),
 (3331, 0.4968649744987488),
 (1733, 0.49379822611808777),
 (1853, 0.4937150478363037)]

In [151]:
" ".join(X_train[5026])

'gulf barge freight higher in nearbys on call gulf barge freight rates continued to show a firmer tone in the nearbys on the assumption that changes in the gulf posted prices will encourage increases in both pik and roll activity and barges shipments with a total of barges traded this morning on the st louis merchants exchange call session versus nine yesterday dealers said quotes included this week mississippi river granite city mla if p o traded at pct of tariff five percentage points above yesterday s bid next week mississippi alton granite city mla if p o bid five points higher at pct offered at five barges each week april midmississippi river dubuque south traded at yesterday s bid of pct april illinois river ex chicago pct bid offered points higher at may same section pct bid offered five lower at may midmississippi river dubuque south bid points higher at pct offered at june july lower mississippi river memphis cairo offered at pct no bids june aug upper mississippi river lacros

**Note:**

More detailed and sophisticated example on doc2vec available on Gensim git

https://github.com/RaRe-Technologies/gensim/blob/3c3506d51a2caf6b890de3b1b32a8b85f7566ca5/docs/notebooks/doc2vec-IMDB.ipynb