In [17]:
from operator import itemgetter
from itertools import islice

import pandas as pd
import numpy as np

import gensim
from gensim.models import Word2Vec

from src.main import word_averaging
from src.main import word_averaging_list
from src.main import w2v_tokenise_text

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# restore objects and unpack them into variables
%store -r object_keep
df_bbc, list_categories, X, y, X_train, X_test, y_train, y_test = itemgetter('df_bbc',
                                                                             'list_categories',
                                                                             'X',
                                                                             'y',
                                                                             'X_train',
                                                                             'X_test',
                                                                             'y_train',
                                                                             'y_test')(object_keep)

## Logistic Regression with Word Embeddings
Thus far, we have been rudimentarily counting words. We can instead compute word embeddings to capture the relatedness of words. The point of *word embeddings* is that they let you take the context of a word into account, because each word is understood through the other words that surround it. In this regard, *word embeddings* belong to the text pre-processing stage. There are two predominant types we can choose from:
- Word2Vec
- GloVe

We will use the word2vec model by Google, which is pre-trained on 100 billion words from the Google News corpus.

In [8]:
# Load the pre-trained Google News word2vec vectors from the gzipped binary file.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    fname="../data/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)
# Pre-compute normalised vectors; per the gensim docs, replace=True overwrites
# the raw vectors in place instead of keeping a second copy in memory.
wv.init_sims(replace=True)

In [9]:
# Peek at 20 vocabulary entries, from position 13030 up to (not including) 13050.
[*islice(wv.vocab, 13030, 13050)]

['Memorial_Hospital',
 'Seniors',
 'memorandum',
 'elephant',
 'Trump',
 'Census',
 'pilgrims',
 'De',
 'Dogs',
 '###-####_ext',
 'chaotic',
 'forgive',
 'scholar',
 'Lottery',
 'decreasing',
 'Supervisor',
 'fundamentally',
 'Fitness',
 'abundance',
 'Hold']

We have a `word_averaging()` function which averages two word vectors. This is the common way to average two word vectors. 

More generally, Bag-of-Words (BOW)-based approaches include averaging, summation, and weighted addition.

We have also created the `w2v_tokenise_text()` function, which tokenises text. We will apply this function to the `article_text_clean` column and then apply word vector averaging to the tokenised text.

In [19]:
# Split the cleaned article text and its label 70/30; the fixed random_state
# keeps the split reproducible across runs.
train, test = train_test_split(df_bbc[['article_text_clean', 'category']],
                               test_size = 0.3,
                               random_state = 42)

# Tokenise each cleaned article. Applying the function to the column directly
# avoids a row-wise DataFrame.apply with a lambda; .values yields one
# tokenised result per article.
train_tokenised = train['article_text_clean'].apply(w2v_tokenise_text).values
test_tokenised = test['article_text_clean'].apply(w2v_tokenise_text).values

# Build the word-averaged document features used by the logistic-regression
# cell. These assignments must be live: when they were commented out, the
# names only existed as leftover kernel state and the notebook failed with a
# NameError on Restart & Run All.
X_train_word_average = word_averaging_list(wv, train_tokenised)
X_test_word_average = word_averaging_list(wv, test_tokenised)

## Logistic Regression
Now let's see how the logistic regression classifier performs on these word-averaging document features.

In [32]:
# Configure and fit a multinomial logistic regression on the word-averaged
# training features. C is scikit-learn's inverse regularisation strength, so
# C = 1e5 means the model is only very weakly regularised.
model_logreg = LogisticRegression(
    multi_class='multinomial',
    n_jobs=1,
    C=1e5,
    max_iter=4000,
).fit(X_train_word_average, train['category'])

# Predict the category of each held-out article.
y_pred = model_logreg.predict(X_test_word_average)

In [33]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the printed value is unchanged, but the call now matches the
# documented signature and the classification_report call below.
print('Accuracy: {:.3f}'.format(accuracy_score(test['category'], y_pred)))

# NOTE(review): classification_report pairs target_names positionally with the
# *sorted* unique labels. If `category` holds string labels, list_categories
# must be in sorted order or the per-class rows will be mislabelled — confirm
# how `category` is encoded.
print(classification_report(test['category'], y_pred, target_names = list_categories))

Accuracy: 0.572
               precision    recall  f1-score   support

entertainment       0.59      0.51      0.55       163
     business       0.57      0.43      0.49       120
        sport       0.60      0.61      0.60       112
     politics       0.55      0.78      0.65       148
         tech       0.56      0.50      0.53       125

     accuracy                           0.57       668
    macro avg       0.57      0.57      0.56       668
 weighted avg       0.57      0.57      0.57       668



Wow! Our accuracy has drastically fallen from 98.4% to 57.2%! That's pitiful!

In [None]:
object_keep = {'df_bbc': df_bbc,
               'list_categories': list_categories,
               'X': X,
               'y': y,
               'X_train': X_train,
               'X_test': X_test,
               'y_train': y_train,
               'y_test': y_test}
%store object_keep