# Vectorisation (turning text into numbers)
There are a lot of ways to turn text into numbers. Here, we will use Doc2Vec, but other algorithms do this well including BERT, LDA, LSI etc.

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# read in our data
df = pd.read_csv('data/s2_cr_data.csv', dtype=str)
df.shape

(12617, 13)

In [4]:
# df.sample(10)

# Pre-process
Different algorithms and different text corpora call for different kinds of pre-processing. What we are doing here is trying to make the text as easy as possible for the computer to 'understand' without destroying any valuable features in the data.

In [5]:
from gensim.models import TfidfModel, Phrases
from gensim.models.phrases import Phraser
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_short, strip_punctuation, remove_stopwords, strip_multiple_whitespaces, stem_text
from gensim.parsing import preprocess_string
from gensim import corpora
from gensim import utils

## Ngramming
We will start by building Ngrammers. These are very simple models which learn when words tend to appear together. The typical example here is 'New York', which is 2 words, but which we treat as one. Is therefore a 'bigram'. We might also be interested in 'New York City', which is 3 words that often appear together, so that's a "trigram".

We will run 2 processes where we first search our text for bigrams and then trigrams. Essentially, it's just the same process run twice.

In [6]:
bigram_phraserpath = 'models/bigram_d2v'
trigram_phraserpath = 'models/trigram_d2v'

CUSTOM_FILTERS = [lambda x: x.lower(),
                    deaccent,
                    strip_punctuation,
                    strip_multiple_whitespaces,
                    # strip_short, #(minsize=2),
                    ] # stem or lemmatize?

# define Iterator

# retrieve each text entry
def extract_text():

    for i, row in df.iterrows():
        doi = str(row['doi'])
        textdata = str(row['tiabs'])
        textdata = preprocess_string(textdata, CUSTOM_FILTERS)
        yield doi, textdata
        if i%1000==0:
            print(i, 'iterations done')


# iteration 1
print()
print('Training bigram detection')
documents = extract_text()
phrases = Phrases(
                (document[1]
                 for document in documents),
                    # min_count = 10,
                    threshold = 10,
                    common_terms = ["of", "with", "without",
                                "and", "or", "the", "a"]
                    # max_vocab_size = 1000000
                    )
bigram = Phraser(phrases)
bigram.save(bigram_phraserpath)


# iteration 2
print()
print('Training trigram detection')
documents = extract_text()
phrases = Phrases((bigram[document[1]]
                    for document in documents),
                    # min_count = 10,
                    threshold = 10,
                    # max_vocab_size = 2000000
                    )
trigram = Phraser(phrases)
# save model
trigram.save(trigram_phraserpath)
print('trigrams trained and saved')

2020-03-19 17:04:37,670 : INFO : collecting all words and their counts
2020-03-19 17:04:37,673 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types



Training bigram detection
0 iterations done
1000 iterations done
2000 iterations done
3000 iterations done
4000 iterations done
5000 iterations done
6000 iterations done
7000 iterations done
8000 iterations done
9000 iterations done


2020-03-19 17:04:46,617 : INFO : PROGRESS: at sentence #10000, processed 2559767 words and 979691 word types


10000 iterations done
11000 iterations done
12000 iterations done


2020-03-19 17:04:48,772 : INFO : collected 1175333 word types from a corpus of 3193208 words (unigram + bigrams) and 12617 sentences
2020-03-19 17:04:48,773 : INFO : using 1175333 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10, max_vocab_size=40000000>
2020-03-19 17:04:48,773 : INFO : source_vocab length 1175333
2020-03-19 17:04:56,764 : INFO : Phraser built with 16470 phrasegrams
2020-03-19 17:04:56,765 : INFO : saving Phraser object under models/bigram_d2v, separately None
2020-03-19 17:04:56,795 : INFO : saved models/bigram_d2v
2020-03-19 17:04:56,796 : INFO : collecting all words and their counts
2020-03-19 17:04:56,797 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types



Training trigram detection
0 iterations done
1000 iterations done
2000 iterations done
3000 iterations done
4000 iterations done
5000 iterations done
6000 iterations done
7000 iterations done
8000 iterations done
9000 iterations done


2020-03-19 17:05:08,768 : INFO : PROGRESS: at sentence #10000, processed 2192159 words and 996553 word types


10000 iterations done
11000 iterations done
12000 iterations done


2020-03-19 17:05:11,701 : INFO : collected 1190095 word types from a corpus of 2736622 words (unigram + bigrams) and 12617 sentences
2020-03-19 17:05:11,701 : INFO : using 1190095 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10, max_vocab_size=40000000>
2020-03-19 17:05:11,721 : INFO : source_vocab length 1190095
2020-03-19 17:05:21,789 : INFO : Phraser built with 38863 phrasegrams
2020-03-19 17:05:21,790 : INFO : saving Phraser object under models/trigram_d2v, separately None
2020-03-19 17:05:21,852 : INFO : saved models/trigram_d2v


trigrams trained and saved


In [7]:
# pre process for d2v
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_short, strip_punctuation, remove_stopwords, strip_multiple_whitespaces, stem_text
from gensim.parsing import preprocess_string
from gensim.models import Phrases


bigram = Phrases.load(bigram_phraserpath)
trigram = Phrases.load(trigram_phraserpath)

def pre_d2v_search(s, bigram, trigram):
    
    CUSTOM_FILTERS = [lambda x: x.lower(),
                    deaccent,
                    strip_punctuation,
                    strip_multiple_whitespaces] 
    
    return trigram[bigram[preprocess_string(s,filters= CUSTOM_FILTERS)]]

2020-03-19 17:05:21,859 : INFO : loading Phrases object from models/bigram_d2v
2020-03-19 17:05:21,879 : INFO : loaded models/bigram_d2v
2020-03-19 17:05:21,881 : INFO : older version of Phrases loaded without corpus_word_count
2020-03-19 17:05:21,882 : INFO : Setting it to 0, do not use it in your scoring function.
2020-03-19 17:05:21,882 : INFO : loading Phrases object from models/trigram_d2v
2020-03-19 17:05:21,932 : INFO : loaded models/trigram_d2v
2020-03-19 17:05:21,937 : INFO : older version of Phrases loaded without corpus_word_count
2020-03-19 17:05:21,937 : INFO : Setting it to 0, do not use it in your scoring function.


# Train

In [8]:
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.summarization.textcleaner import get_sentences
from gensim.utils import tokenize
import multiprocessing

# Preparing the corpus

In [9]:
df = df.reset_index()

In [10]:
df.shape

(12617, 14)

This class will allow us to load our data into the doc2vec model.

In [11]:
class ADTaggedDocument(object):
    def __iter__(self):
        
        for i,row in df.iterrows():
            id_ = str(row['index'])
#             doi = str(row['doi'])
            text_data = str(row['tiabs'])
            tokenized = pre_d2v_search(text_data, bigram, trigram)
            yield TaggedDocument(tokenized, [id_])        

This is where we train the doc2vec model. 

In [12]:
# COMMENT/UNCOMMENT FOR TRAINING

cores = multiprocessing.cpu_count()

documents = ADTaggedDocument()
# model.build_vocab(documents)

model = Doc2Vec(documents = documents,
                    dm=0, 
                    dbow_words=1, 
                    vector_size=300, 
                    window=8, 
                    min_count=6, 
                    epochs=8, 
                    workers= cores-1
#                     alpha=0.06,  # comment these lines to use default alpha
#                     min_alpha=0.04
                   )

# re-initialise generator
documents = ADTaggedDocument()
# Now train Doc2Vec on the corpus
model.train(documents, 
            total_examples=model.corpus_count, 
            epochs=model.epochs)


model.save(os.path.join('models', 'd2v.model'), separately=None)

2020-03-19 17:05:21,970 : INFO : collecting all words and their counts
2020-03-19 17:05:21,975 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-03-19 17:05:34,983 : INFO : PROGRESS: at example #10000, processed 2045253 words (157240/s), 80437 word types, 10000 tags
2020-03-19 17:05:38,159 : INFO : collected 90722 word types and 12617 unique tags from a corpus of 12617 examples and 2555041 words
2020-03-19 17:05:38,159 : INFO : Loading a fresh vocabulary
2020-03-19 17:05:38,209 : INFO : effective_min_count=6 retains 39021 unique words (43% of original 90722, drops 51701)
2020-03-19 17:05:38,210 : INFO : effective_min_count=6 leaves 2458818 word corpus (96% of original 2555041, drops 96223)
2020-03-19 17:05:38,288 : INFO : deleting the raw counts dictionary of 90722 items
2020-03-19 17:05:38,290 : INFO : sample=0.001 downsamples 24 most-common words
2020-03-19 17:05:38,291 : INFO : downsampling leaves estimated 1976940 word corpus (80.4% of prior 24588

In [13]:
model = Doc2Vec.load('models/d2v.model')
model

2020-03-19 17:11:11,393 : INFO : loading Doc2Vec object from models/d2v.model
2020-03-19 17:11:11,604 : INFO : loading vocabulary recursively from models/d2v.model.vocabulary.* with mmap=None
2020-03-19 17:11:11,605 : INFO : loading trainables recursively from models/d2v.model.trainables.* with mmap=None
2020-03-19 17:11:11,606 : INFO : loading syn1neg from models/d2v.model.trainables.syn1neg.npy with mmap=None
2020-03-19 17:11:11,644 : INFO : loading wv recursively from models/d2v.model.wv.* with mmap=None
2020-03-19 17:11:11,647 : INFO : loading vectors from models/d2v.model.wv.vectors.npy with mmap=None
2020-03-19 17:11:11,682 : INFO : loading docvecs recursively from models/d2v.model.docvecs.* with mmap=None
2020-03-19 17:11:11,682 : INFO : loaded models/d2v.model


<gensim.models.doc2vec.Doc2Vec at 0x1aae9db0508>

Now we transform our corpus of text data using the doc2vec model to give us a 'd2v_corpus'.  We can visualise this data to get a feel for the dataset.

In [14]:
import numpy as np
from gensim.matutils import any2sparse

def iter_df(df):
    for i, row in df.iterrows():
#         doi = str(row['doi'])
        id_ = str(row['index'])
#             doi = str(row['doi'])
        textdata = str(row['tiabs'])
        ngrams = pre_d2v_search(textdata, bigram, trigram)
        yield id_, ngrams


def iter_arts(df):
    articles = iter_df(df)
    for article in articles:
        textdata = article[1]
        vec = model.infer_vector(textdata) # can't this be done as a batch/vector process? SLOW!
        vec = any2sparse(vec, eps=1e-09)
#         print(np.shape(vec))
        yield vec
        



In [15]:
from gensim import corpora, similarities
transformed = corpora.MmCorpus.serialize('data/d2v_corpus.mm', iter_arts(df))
index_d2v = similarities.Similarity('data/d2v_sims.index',
                                      corpora.MmCorpus('data/d2v_train_corpus.mm'),
                                      num_features=300)
index_d2v.save()

2020-03-19 17:11:11,773 : INFO : storing corpus in Matrix Market format to data/d2v_corpus.mm
2020-03-19 17:11:11,778 : INFO : saving sparse matrix to data/d2v_corpus.mm
2020-03-19 17:11:11,786 : INFO : PROGRESS: saving document #0
2020-03-19 17:11:16,254 : INFO : PROGRESS: saving document #1000
2020-03-19 17:11:20,632 : INFO : PROGRESS: saving document #2000
2020-03-19 17:11:24,925 : INFO : PROGRESS: saving document #3000
2020-03-19 17:11:29,279 : INFO : PROGRESS: saving document #4000
2020-03-19 17:11:33,715 : INFO : PROGRESS: saving document #5000
2020-03-19 17:11:38,043 : INFO : PROGRESS: saving document #6000
2020-03-19 17:11:42,435 : INFO : PROGRESS: saving document #7000
2020-03-19 17:11:46,708 : INFO : PROGRESS: saving document #8000
2020-03-19 17:11:51,337 : INFO : PROGRESS: saving document #9000
2020-03-19 17:11:55,661 : INFO : PROGRESS: saving document #10000
2020-03-19 17:11:59,843 : INFO : PROGRESS: saving document #11000
2020-03-19 17:12:03,933 : INFO : PROGRESS: saving d

ValueError: need more than 0 values to unpack

In [None]:
corpora.MmCorpus.serialize('data/d2v_corpus.mm', iter_arts(df))