In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# `catalog` is not defined in this notebook; the log lines printed by this cell
# come from kedro.io.data_catalog, so it is presumably injected by the kedro
# Jupyter session (TODO confirm how the kernel is launched).
dataset = catalog.load('raw_dataset')
# NOTE(review): `vocab` is rebound to a freshly built Dictionary in the
# pre-processing cell further down, so this loaded dictionary may be unused
# -- confirm before removing.
vocab = catalog.load('dictionary')

2020-04-03 13:51:29,572 - kedro.io.data_catalog - INFO - Loading data from `raw_dataset` (CSVLocalDataSet)...
2020-04-03 13:51:29,819 - kedro.io.data_catalog - INFO - Loading data from `dictionary` (DictionaryDataSet)...
2020-04-03 13:51:29,820 - gensim.utils - INFO - loading Dictionary object from data/05_model_input/dictionary.dict
2020-04-03 13:51:29,822 - gensim.utils - INFO - loaded data/05_model_input/dictionary.dict


## Pre-process docs

In [3]:
def split_by_sentence(docs):
    r"""Flatten documents into one list of sentence strings.

    Each document is first cut on '.\n' (a period that ends a line) and every
    resulting piece is then cut on '. ' (a period followed by a space). The
    separators are dropped and the pieces are returned in their original
    order, including any empty strings the splits produce.

    Args:
        docs: iterable of strings.

    Returns:
        A new flat list of sentence strings.
    """
    return [
        sentence
        for doc in docs
        for chunk in doc.split('.\n')
        for sentence in chunk.split('. ')
    ]

def lowerize(docs):
    """Return the documents converted to lowercase strings.

    Every item is passed through ``str()`` first, so a non-string entry is
    turned into text instead of raising.

    Args:
        docs: iterable of documents (normally strings).

    Returns:
        A new list of lowercase strings. ``docs`` itself is left untouched,
        which keeps the function consistent with the other pre-processing
        helpers that return new lists and makes re-running a cell safe.
    """
    return [str(doc).lower() for doc in docs]

def tokenize(docs):
    r"""Split every document into word tokens.

    Tokens are the matches of the regular expression ``\w+``; whatever lies
    between matches (whitespace, punctuation) is discarded.

    Args:
        docs: iterable of strings.

    Returns:
        A new list with one list of tokens per document. The input is not
        modified (the strings in ``docs`` are no longer overwritten with
        token lists).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(doc) for doc in docs]

def remove_stop_words(docs):
    """Remove English stop words from every tokenized document.

    The stop-word list is NLTK's ``stopwords.words('english')``; comparison
    is exact, so tokens are expected to be lowercase already.

    Args:
        docs: iterable of token lists.

    Returns:
        A new list of token lists with the stop words filtered out. The
        input documents are not modified.
    """
    # Build the lookup once; set membership is a hash lookup per token.
    stop_words = set(stopwords.words('english'))
    return [[w for w in doc if w not in stop_words] for doc in docs]

def remove_numbers(docs):
    """Drop purely numeric tokens from every tokenized document.

    A token is removed when ``str.isnumeric()`` is true for it; tokens that
    merely contain digits alongside other characters are kept.

    Args:
        docs: iterable of token lists.

    Returns:
        A new list of token lists.
    """
    cleaned = []
    for doc in docs:
        kept = [token for token in doc if not token.isnumeric()]
        cleaned.append(kept)
    return cleaned

def remove_word_with_length(docs, length=1):
    """Discard tokens that are ``length`` characters long or shorter.

    With the default ``length=1`` this removes single-character tokens.

    Args:
        docs: iterable of token lists.
        length: largest token length that is still removed.

    Returns:
        A new list of token lists holding only tokens longer than ``length``.
    """
    result = []
    for doc in docs:
        long_enough = [token for token in doc if length < len(token)]
        result.append(long_enough)
    return result

def lemmatize(docs):
    """Lemmatize every token of every document with NLTK's WordNetLemmatizer.

    ``lemmatize`` is called with the token only, i.e. with the lemmatizer's
    default part-of-speech setting.

    Args:
        docs: iterable of token lists.

    Returns:
        A new list of token lists containing the lemmatized tokens.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    lemmatized = []
    for doc in docs:
        lemmatized.append([to_lemma(token) for token in doc])
    return lemmatized

def add_bigram(docs, min_bigram_count=20):
    """Append detected bigrams to each tokenized document.

    A gensim ``Phrases`` model is fitted on ``docs`` and applied once to each
    document; every resulting token that contains '_' is appended after the
    document's original tokens. Only a single Phrases pass is applied, so
    this adds bigrams (not trigrams).

    Args:
        docs: list of token lists (it is iterated more than once).
        min_bigram_count: passed to ``Phrases`` as ``min_count``.

    Returns:
        A new list of token lists. Neither ``docs`` nor its inner lists are
        modified, so calling the function twice on the same input does not
        append the bigrams twice.
    """
    bigram = Phrases(docs, min_count=min_bigram_count)
    # NOTE(review): the '_' test assumes Phrases joins word pairs with '_'
    # (its default delimiter -- confirm). A plain token that already contains
    # an underscore would also match and be appended a second time.
    return [
        list(doc) + [token for token in bigram[doc] if '_' in token]
        for doc in docs
    ]

def remove_vocab(docs, vocab):
    """Keep only the tokens of each document that are present in ``vocab``.

    Args:
        docs: iterable of token lists.
        vocab: container of allowed tokens. A list or tuple is converted to a
            frozenset once so that each membership test is a hash lookup
            instead of a linear scan; any other container (set, dict, ...)
            is used as given.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``object`` with one filtered token
        list per document. The array is filled element by element because
        ``np.array`` on lists of unequal length raises on current NumPy, and
        would silently produce a 2-D string array whenever all filtered
        documents happened to have the same length.
    """
    lookup = frozenset(vocab) if isinstance(vocab, (list, tuple)) else vocab
    filtered = [[w for w in doc if w in lookup] for doc in docs]

    out = np.empty(len(filtered), dtype=object)
    for idx, doc in enumerate(filtered):
        out[idx] = doc
    return out

In [4]:
# Pre-processing parameters.
min_bigram_count = 20  # min_count for the bigram step (currently disabled below)
length = 1             # tokens with this many characters or fewer are removed
no_below = 100         # filter_extremes: keep tokens found in at least this many sentences
no_above = 0.70        # filter_extremes: ...and in at most this fraction of all sentences

docs = dataset['text'].values

print('\nSplitting by sentence...')
docs = split_by_sentence(docs)

print('\nLowerizing...')
docs = lowerize(docs)

print('\nTokenizing...')
docs = tokenize(docs)

# The bigram step is currently disabled.
#print('\nAdding bigrams...')
#docs = add_bigram(docs, min_bigram_count=min_bigram_count)

print('\nRemoving stop words...')
docs = remove_stop_words(docs)

print('\nRemoving unique numbers (not words that contain numbers)...')
docs = remove_numbers(docs)

print('\nRemoving words that contain only one character...')
docs = remove_word_with_length(docs, length=length)

print('\nLemmatizing...')
docs = lemmatize(docs)

# Build the vocabulary from the cleaned sentences and prune rare / very common
# tokens. NOTE: this rebinds `vocab`, replacing the dictionary that was loaded
# from the catalog at the top of the notebook.
vocab = Dictionary(docs)
vocab.filter_extremes(no_below=no_below, no_above=no_above)

# Pass the token->id dict itself rather than a list of its keys: membership
# tests on a dict are hash lookups, while a list is scanned linearly for
# every single token of every sentence.
docs = remove_vocab(docs, vocab.token2id)

print('Number of sentences:', len(docs))
print('Number of unique words:', len(vocab))


Splitting by sentence...

Lowerizing...

Tokenizing...

Removing stop words...

Removing unique numbers (not words that contain numbers)...

Removing words that contain only one character...

Lemmatizing...
2020-04-03 13:51:45,336 - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
2020-04-03 13:51:45,518 - gensim.corpora.dictionary - INFO - adding document #10000 to Dictionary(24357 unique tokens: ['anyone', 'car', 'college', 'could', 'day']...)
2020-04-03 13:51:45,685 - gensim.corpora.dictionary - INFO - adding document #20000 to Dictionary(32730 unique tokens: ['anyone', 'car', 'college', 'could', 'day']...)
2020-04-03 13:51:45,889 - gensim.corpora.dictionary - INFO - adding document #30000 to Dictionary(47754 unique tokens: ['anyone', 'car', 'college', 'could', 'day']...)
2020-04-03 13:51:46,062 - gensim.corpora.dictionary - INFO - adding document #40000 to Dictionary(52970 unique tokens: ['anyone', 'car', 'college', 'could', 'day']...)
2020-

## Train doc embeddings (gensim doc2vec)

In [9]:
def read_corpus(docs):
    """Lazily wrap each tokenized document in a ``TaggedDocument``.

    Every document is tagged with a one-element list holding its position in
    ``docs``. This is a generator: nothing is built until it is iterated.

    Args:
        docs: iterable of token lists.

    Yields:
        ``TaggedDocument(tokens, [position])`` for each document, in order.
    """
    yield from (
        TaggedDocument(tokens, [position])
        for position, tokens in enumerate(docs)
    )

In [10]:
# Materialize the generator into a list: the corpus is iterated more than once
# below (by build_vocab and again by train), which a one-shot generator
# could not support.
corpus = list(read_corpus(docs))

In [11]:
# Create an untrained Doc2Vec model: 25-dimensional vectors, min_count=2 as the
# word-frequency threshold, and a single training epoch.
# NOTE(review): epochs=1 is very few passes for doc2vec -- presumably a quick
# smoke run; confirm before relying on the resulting embeddings.
model = Doc2Vec(vector_size=25, min_count=2, epochs=1)



In [12]:
# Scan the corpus to collect word counts and document tags (see the progress
# log printed by this cell) before any training takes place.
model.build_vocab(corpus)

2020-04-03 13:54:38,083 - gensim.models.doc2vec - INFO - collecting all words and their counts
2020-04-03 13:54:38,086 - gensim.models.doc2vec - INFO - PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-03 13:54:38,140 - gensim.models.doc2vec - INFO - PROGRESS: at example #10000, processed 88269 words (1713446/s), 4135 word types, 10000 tags
2020-04-03 13:54:38,164 - gensim.models.doc2vec - INFO - PROGRESS: at example #20000, processed 170352 words (3592848/s), 4190 word types, 20000 tags
2020-04-03 13:54:38,189 - gensim.models.doc2vec - INFO - PROGRESS: at example #30000, processed 262545 words (3808046/s), 4200 word types, 30000 tags
2020-04-03 13:54:38,213 - gensim.models.doc2vec - INFO - PROGRESS: at example #40000, processed 349396 words (3618515/s), 4200 word types, 40000 tags
2020-04-03 13:54:38,236 - gensim.models.doc2vec - INFO - PROGRESS: at example #50000, processed 438275 words (3985932/s), 4200 word types, 50000 tags
2020-04-03 13:54:38,260 - ge

In [13]:
# Train on the same corpus, taking the example count and the number of epochs
# from the model's own attributes instead of hard-coding them here.
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

2020-04-03 13:55:21,439 - gensim.models.base_any2vec - INFO - training model with 3 workers on 4202 vocabulary and 25 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-04-03 13:55:22,542 - gensim.models.base_any2vec - INFO - EPOCH 1 - PROGRESS: at 5.65% examples, 126975 words/s, in_qsize 6, out_qsize 0
2020-04-03 13:55:23,597 - gensim.models.base_any2vec - INFO - EPOCH 1 - PROGRESS: at 11.88% examples, 138744 words/s, in_qsize 6, out_qsize 0
2020-04-03 13:55:24,721 - gensim.models.base_any2vec - INFO - EPOCH 1 - PROGRESS: at 18.19% examples, 139925 words/s, in_qsize 5, out_qsize 0
2020-04-03 13:55:25,770 - gensim.models.base_any2vec - INFO - EPOCH 1 - PROGRESS: at 24.47% examples, 143035 words/s, in_qsize 6, out_qsize 0
2020-04-03 13:55:26,791 - gensim.models.base_any2vec - INFO - EPOCH 1 - PROGRESS: at 30.59% examples, 145536 words/s, in_qsize 6, out_qsize 0
2020-04-03 13:55:27,847 - gensim.models.base_any2vec - INFO - EPOCH 1 - PROGRESS: at 37.08% examples, 146543 words