# Advanced Feature Engineering Models

In [14]:
# Load Bible Corpus
from nltk.corpus import gutenberg
from string import punctuation
import nlp_models as nm
import preprocessing as pre
from importlib import reload
import numpy as np

# Load the King James Bible as tokenised sentences: each sentence is a list of
# word and punctuation tokens (see the sample line printed below).
bible = gutenberg.sents('bible-kjv.txt')

# Characters that carry no lexical content here: punctuation marks and digits.
remove_terms = punctuation + '0123456789'
# `word not in remove_terms` against a str is a *substring* test, so tokens such
# as '10', '21' or '--' (not contiguous substrings of remove_terms) were kept.
# Testing every character against a set drops all pure punctuation/digit tokens.
remove_chars = set(remove_terms)

# Lower-case each token and discard those made up solely of noise characters.
norm_bible = [[word.lower() for word in sent
               if not all(char in remove_chars for char in word)]
              for sent in bible]
# Re-join the surviving tokens into one whitespace-separated string per sentence.
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
# Run the project normaliser, then drop falsy results (e.g. empty strings).
norm_bible = filter(None, pre.normalize_corpus(norm_bible))
# Keep only sentences with more than two whitespace-separated tokens.
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

print('Total lines:', len(bible))
# NOTE: sentences were filtered out above, so norm_bible[10] is not guaranteed
# to be the processed form of bible[10]; the indices can drift apart.
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])


Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


## Robust Word2Vec Models with Gensim

In [8]:
%%time

# build model
# Train on the normalised Bible sentences via the project helper. No
# hyperparameters are passed, so whatever defaults nm.trainword2vec defines apply.
# The %%time magic above reports how long the training call takes.
w2v_model = nm.trainword2vec(norm_bible)

CPU times: user 39.3 s, sys: 273 ms, total: 39.6 s
Wall time: 14.6 s


In [11]:
# inspect the words the trained model reports as similar to each probe term
similar_words = nm.viewsimilarwords(
    w2v_model,
    search_term_list=[
        'god', 'jesus', 'noah', 'egypt',
        'john', 'gospel', 'moses', 'famine',
    ],
)
similar_words

{'god': ['lord', 'predestinated', 'redeemed', 'worldly', 'mercy'],
 'jesus': ['messias', 'peter', 'synagogue', 'apostles', 'immediately'],
 'noah': ['shem', 'japheth', 'ham', 'enosh', 'kenan'],
 'egypt': ['pharaoh', 'egyptians', 'bondage', 'rod', 'flowing'],
 'john': ['james', 'baptist', 'peter', 'baptism', 'galilee'],
 'gospel': ['christ', 'faith', 'godly', 'hope', 'sufferings'],
 'moses': ['congregation', 'aaron', 'joshua', 'sinai', 'children'],
 'famine': ['pestilence', 'peril', 'sword', 'blasting', 'mildew']}

## Applying Word2Vec Features for ML Tasks

In [19]:
# Toy documents declared next to their category so a text and its label
# cannot drift out of alignment; split into two parallel lists afterwards.
labelled_docs = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
    ('I love green eggs, ham, sausages, and bacon!', 'food'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is lazy but the brown fox is quick!', 'animals'),
]

# Parallel lists: corpus[i] is the raw text, labels[i] its category.
corpus = [text for text, _ in labelled_docs]
labels = [label for _, label in labelled_docs]

# Normalize the toy corpus with the same project helper applied to the Bible text.
# NOTE(review): the actual cleaning steps live in preprocessing.normalize_corpus,
# which is not shown here — confirm them before relying on specific behavior.
norm_corpus = pre.normalize_corpus(corpus)

In [25]:
# Fit a small model on the toy corpus.
# NOTE: this rebinds w2v_model, replacing the Bible-trained model from above.
w2v_model = nm.trainword2vec(
    norm_corpus,
    feature_size=10,
    window_context=10,
    iterations=100,
)

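Note: the embedding visualizations produced here do not match those in the book. This is probably because Word2Vec training is not deterministic by default (random initialisation and multi-threaded training), so the learned vectors differ from run to run. TODO: confirm by training with a fixed seed and a single worker thread.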