In [1]:
import os
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
from gensim.models import word2vec

In [2]:
os.getcwd()

'/content'

### Loading the training and test data sets

In [12]:
#This is needed only if you have uploaded data to Google drive
from google.colab import drive
drive.mount('/gdrive')
train_data = "/gdrive/My Drive/Statistical NLP AIML/r8-train-all-terms.txt"
test_data = "/gdrive/My Drive/Statistical NLP AIML/r8-test-all-terms.txt"

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [13]:
X, y = [], []
with open(train_data, "r") as infile:
    for line in infile:
        label, text = line.split("\t")
        X.append(text.split())
        y.append(label)
X, y = np.array(X), np.array(y)
print ("total examples %s" % len(y))

total examples 5485


  import sys


In [14]:
X_test, y_test = [], []
with open(test_data, "r") as infile:
    for line in infile:
        label, text = line.split("\t")
        X_test.append(text.split())
        y_test.append(label)
X_test, y_test = np.array(X_test), np.array(y_test)
print ("total examples %s" % len(y_test))

total examples 2189


  import sys


In [15]:
X.shape, X_test.shape

((5485,), (2189,))

In [16]:
np.unique(y, return_counts=True)

(array(['acq', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship',
        'trade'], dtype='<U8'),
 array([1596,  253, 2840,   41,  190,  206,  108,  251]))

## Using NB methods

First, reconstructring the string text for our vectorizers to work

In [17]:
X_text = [" ".join(val) for val in X]
print(X_text[0] + "\n")
X_test_text = [" ".join(val) for val in X_test]
print(X_test_text[10])

champion products ch approves stock split champion products inc said its board of directors approved a two for one stock split of its common shares for shareholders of record as of april the company also said its board voted to recommend to shareholders at the annual meeting april an increase in the authorized capital stock from five mln to mln shares reuter

vieille montagne reports loss dividend nil year net loss after exceptional charges mln francs vs profit mln exceptional provisions for closure of viviez electrolysis plant mln francs vs exceptional gain mln sales and services billion francs vs billion proposed net dividend on ordinary shares nil vs francs company s full name is vieille montagne sa vmnb br reuter


#### Using count vectorizer

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
vect = CountVectorizer(stop_words='english',max_features=5000)
vect.fit(X_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Applying the vectorizer to our test and train sets

In [20]:
X_train_transformed = vect.transform(X_text)
X_test_transformed =vect.transform(X_test_text)

In [21]:
# printing the vocabulary
list(vect.vocabulary_.items())[:10]

[('champion', 759),
 ('products', 3488),
 ('approves', 264),
 ('stock', 4337),
 ('split', 4273),
 ('said', 3974),
 ('board', 532),
 ('directors', 1327),
 ('approved', 263),
 ('common', 895)]

### Using Bernoulli NB first

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_transformed,y)

# predict class
pred_train_ys = bnb.predict(X_train_transformed)
pred_test_ys = bnb.predict(X_test_transformed)

# accuracy
print("Train accuracy: ", accuracy_score(y, pred_train_ys))
print("Test accuracy: ", accuracy_score(y_test, pred_test_ys))

Train accuracy:  0.8736554238833182
Test accuracy:  0.8688899040657835


### Using Multinomial NB
 - We expect this to work very well, giving high performance in accuracy

In [24]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

In [25]:
#fit on training data
mnb.fit(X_train_transformed, y)

# predict class
pred_train_ys = mnb.predict(X_train_transformed)
pred_test_ys = mnb.predict(X_test_transformed)

# accuracy
print("Train accuracy: ", accuracy_score(y, pred_train_ys))
print("Test accuracy: ", accuracy_score(y_test, pred_test_ys))

Train accuracy:  0.968094804010939
Test accuracy:  0.9657377798081316


 - As expected, this performed really well
 - Remember that we used 5000 features!

## Using our word embeddings approach

 We have two options here:
 
1. Use pre-trained word vectors (Glove)

2. Train our own vectors

We'll explore both

### Loading the pre-trained GloVe word vectors

In [26]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_input_file = 'glove.6B.100d.txt'
# word2vec_output_file = 'glove.6B.100d.w2vformat.txt'
# glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.200d.txt'
word2vec_output_file = 'glove.6B.200d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 200)

In [None]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("glove.6B.200d.w2vformat.txt", binary=False)

### Sentence vector by averaging word vectors

In [None]:
stop_words = stopwords.words("english")

In [None]:
def sent_vec(sent):
    wv_res = np.zeros(glove_model.vector_size)
    ctr = 1
    for w in sent:
        if w in glove_model:
            ctr += 1
            wv_res += glove_model[w]
    wv_res = wv_res/ctr
    #return (wv_res, ctr)
    return wv_res

In [None]:
train_doc_vecs = []
for doc in X:    
    doc_words = [term for term in doc if term not in stop_words]
    train_doc_vecs.append(sent_vec(doc_words))

In [None]:
test_doc_vecs = []
for doc in X_test:    
    doc_words = [term for term in doc if term not in stop_words]
    test_doc_vecs.append(sent_vec(doc_words))

### Building a predictive model on the averaged word vectors

#### Using a 'simple' logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression(penalty="l1", random_state=42, C = 3.5)
logreg.fit(train_doc_vecs,y)

LogisticRegression(C=3.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
pred_train_ys = logreg.predict(train_doc_vecs)
pred_test_ys = logreg.predict(test_doc_vecs)
print("Train accuracy: ", accuracy_score(pred_train_ys, y))
print("Test accuracy: ", accuracy_score(pred_test_ys, y_test))

Train accuracy:  0.978122151322
Test accuracy:  0.968021927821


### Training our own wordvectors on the data
We'll create a combined text file to train our word vectors - more data is better. Although in this case we would still have just 7.7K instances to learn from.

In [None]:
X_comb = np.concatenate([X,X_test])
len(X_comb)

7674

In [None]:
print(X_comb[6000])

['fhlbb', 'changes', 'short', 'term', 'discount', 'note', 'rates', 'the', 'federal', 'home', 'loan', 'bank', 'board', 'adjusted', 'the', 'rates', 'on', 'its', 'short', 'term', 'discount', 'notes', 'as', 'follows', 'maturity', 'new', 'rate', 'old', 'rate', 'maturity', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'days', 'pct', 'pct', 'days', 'reuter']


In [None]:
w2v = word2vec.Word2Vec(X_comb, window=2, min_count=2, sg = 1, size=200)

In [None]:
w2v.most_similar("future", topn=5)

[('near', 0.7925878763198853),
 ('prospects', 0.7876253128051758),
 ('method', 0.7740976810455322),
 ('developments', 0.7626558542251587),
 ('foreseeable', 0.7616651058197021)]

#### Sentence vectors by averaging vectors for words

In [None]:
def sent_vec_w2v(sent):
    wv_res = np.zeros(w2v.vector_size)
    ctr = 1
    for w in sent:
        if w in w2v:
            ctr += 1
            wv_res += w2v[w]
    wv_res = wv_res/ctr
    return wv_res

#### Getting the sentence vectors for the test and train sets

In [None]:
train_doc_vecs = []
for doc in X:    
    doc_words = [term for term in doc if term not in stop_words]
    train_doc_vecs.append(sent_vec_w2v(doc_words))

In [None]:
test_doc_vecs = []
for doc in X_test:    
    doc_words = [term for term in doc if term not in stop_words]
    test_doc_vecs.append(sent_vec_w2v(doc_words))

#### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression(penalty="l1", random_state=42, C = 8)

In [None]:
logreg.fit(train_doc_vecs,y)

LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
pred_train_ys = logreg.predict(train_doc_vecs)
pred_test_ys = logreg.predict(test_doc_vecs)
print("Train accuracy: ", accuracy_score(pred_train_ys, y))
print("Test accuracy: ", accuracy_score(pred_test_ys, y_test))

Train accuracy:  0.971011850501
Test accuracy:  0.966194609411
