In [1]:
from operator import itemgetter

import pandas as pd
import numpy as np

from tqdm import tqdm

from gensim.models import Doc2Vec

from sklearn import utils

import re

from src.main import label_sentences
from src.main import get_vectors

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# restore objects and unpack them into variables
%store -r object_keep
df_bbc, list_categories, X, y, X_train, X_test, y_train, y_test = itemgetter('df_bbc',
                                                                             'list_categories',
                                                                             'X',
                                                                             'y',
                                                                             'X_train',
                                                                             'X_test',
                                                                             'y_train',
                                                                             'y_test')(object_keep)

## Logistic Regression with Document Embeddings
Previously, we used the Word2Vec method to compute word vectors. Here, we obtained the mathematical average of the word vector representations for all the words in each document. What we want to do now is take this idea but apply it at the document level, where instead of capturing the relationship between words, we want to capture the relationship between documents.

To train a Doc2Vec model, we will take a similar approach to what was done [here](https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4), namely:

1. Label the sentences
    + This is because Gensim's implementation of Doc2Vec requires each document/paragraph to have a label associated to it.
    + Will achieve by using the `TaggedDocument` method.
1. The format will be `TRAIN_i` and `TEST_i` where `i` represents the dummy index of the text.

In [2]:
X_train, X_test, y_train, y_test = train_test_split(df_bbc['article_text_clean'], 
                                                    df_bbc['category'],
                                                    random_state = 42,
                                                    test_size = 0.3)
X_train = label_sentences(corpus = X_train, label_type = 'Train')
X_test = label_sentences(corpus = X_test, label_type = 'Test')

In [3]:
# views some docs in our train set
data_all = X_train + X_test
data_all[:2]

[TaggedDocument(words=['malik', 'rejects', 'black', 'mp', 'lists', 'call', 'ethnic', 'minority', 'shortlists', 'boost', 'number', 'black', 'asian', 'mps', 'rejected', 'one', 'labour', 'senior', 'asians', 'shahid', 'malik', 'labour', 'ruling', 'nec', 'accepted', 'people', 'frustration', 'said', 'targets', 'lists', 'boost', 'representation', 'minorities', '13', 'britain', '659', 'mps', 'ethnic', 'minority', 'groups', 'added', 'commission', 'racial', 'equality', 'chief', 'trevor', 'phillips', 'argued', 'sunday', 'time', 'come', 'shortlists', 'came', 'emerged', 'one', 'britain', 'ethnically', 'diverse', 'constituency', 'west', 'ham', 'get', 'women', 'shortlist', 'next', 'election', 'following', 'nec', 'ruling', 'mr', 'phillips', 'said', 'changes', 'race', 'relations', 'legislation', 'might', 'allow', 'political', 'parties', 'reserve', 'seats', 'represented', 'groups', 'example', 'west', 'ham', 'might', 'allow', 'women', 'minorities', 'seek', 'candidates', 'get', 'side', 'general', 'electio

When training Doc2Vec, will vary the following parameters:

- `dm = 0`: Distributed Bag of Words (DBOW) is used.
- `vector_size = 300`: 300 dimensional feature vectors.
- `negative = 5`: specifies how many *noise* words should be drawn
- `min_count = 1`: ignores all word with total frequencies less than this
- `alpha = 0.065`: the inital learning rate

Initialise the model and train for 30 epochs.

In [4]:
model_dbow = Doc2Vec(dm = 0, vector_size = 300, negative = 5, min_count = 1, alpha = 0.065)
model_dbow.build_vocab([x for x in tqdm(data_all)])

100%|██████████| 2225/2225 [00:00<00:00, 1535679.84it/s]


In [5]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(data_all)]),
                     total_examples = len(data_all),
                     epochs = 1)
    model_dbow.alpha -= 0.02
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 2225/2225 [00:00<00:00, 1662923.45it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1771512.22it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1898743.93it/s]
100%|██████████| 2225/2225 [00:00<00:00, 2371620.43it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1924984.82it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1921022.31it/s]
100%|██████████| 2225/2225 [00:00<00:00, 2351896.77it/s]
100%|██████████| 2225/2225 [00:00<00:00, 2272851.05it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1903002.94it/s]
100%|██████████| 2225/2225 [00:00<00:00, 2315139.27it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1923001.52it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1916288.79it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1938580.47it/s]
100%|██████████| 2225/2225 [00:00<00:00, 2191715.92it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1427179.45it/s]
100%|██████████| 2225/2225 [00:00<00:00, 1915109.05it/s]
100%|██████████| 2225/2225 [00:00<00:00, 2274512.89it/s]
100%|██████████| 2225/2225 [00:

Extract vectors from trained model.

In [7]:
train_vectors_dbow = get_vectors(model = model_dbow, 
                                 corpus_size = len(X_train),
                                 vector_size = 300,
                                 vector_type = 'Train')
test_vectors_dbow = get_vectors(model = model_dbow,
                                corpus_size = len(X_test),
                                vector_size = 300,
                                vector_type = 'Train')

Now use these document vectors for our logistic regression.

In [14]:
model_logreg = LogisticRegression(multi_class = 'multinomial', n_jobs = 1, C = 1e5, max_iter = 15000)
model_logreg = model_logreg.fit(train_vectors_dbow, y_train)
y_pred = model_logreg.predict(test_vectors_dbow)

In [15]:
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, target_names = list_categories))

accuracy 0.18413173652694612
               precision    recall  f1-score   support

entertainment       0.28      0.24      0.26       163
     business       0.12      0.13      0.13       120
        sport       0.17      0.21      0.19       112
     politics       0.20      0.20      0.20       148
         tech       0.13      0.11      0.12       125

     accuracy                           0.18       668
    macro avg       0.18      0.18      0.18       668
 weighted avg       0.19      0.18      0.18       668



Ouch, this is even worse!

In [16]:
object_keep = {'df_bbc': df_bbc,
               'list_categories': list_categories,
               'X': X,
               'y': y,
               'X_train': X_train,
               'X_test': X_test,
               'y_train': y_train,
               'y_test': y_test}
%store object_keep

Stored 'object_keep' (dict)
