# Best Practices for NLP

Creating word vectors after preprocessing our natural language data.

#### Load Dependencies

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline

In [None]:
nltk.download('punkt')

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot

In [None]:
nltk.download('stopwords')

#### Load Data

In [None]:
nltk.download('gutenberg')

In [None]:
from nltk.corpus import gutenberg

In [None]:
# Resource constraints: load only four of the 18 Gutenberg books instead of the full corpus.
gberg_sents = gutenberg.sents(
    fileids=[
        'austen-emma.txt',
        'austen-persuasion.txt',
        'austen-sense.txt',
        'carroll-alice.txt',
    ]
)

#### Preprocess a Sentence

In [None]:
gberg_sents[4]

#### Convert to Lowercase

In [None]:
[w.lower() for w in gberg_sents[4]]

#### Remove Stopwords and Punctuation

In [None]:
# The assignment below rebinds the name `stopwords` from the NLTK corpus
# reader to a plain list. Importing the reader under an alias keeps this cell
# safe to re-run (otherwise a second run calls `.words` on the list and fails).
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english') + list(string.punctuation)

In [None]:
stopwords

In [None]:
# Compare the lowercased token against the stopword list: NLTK's English
# stopwords are lowercase, so testing the raw token would let capitalised
# stopwords (e.g. a sentence-initial "The") through the filter.
[w.lower() for w in gberg_sents[4] if w.lower() not in stopwords]

#### Stem Words

In [None]:
stemmer = PorterStemmer()

In [None]:
# Filter on the lowercased token (NLTK's English stopwords are lowercase) so
# capitalised stopwords are dropped too, then stem what remains.
[stemmer.stem(w.lower()) for w in gberg_sents[4] if w.lower() not in stopwords]

#### Handle Bigram Collocations

In [None]:
phrases = Phrases(gberg_sents)

In [None]:
bigram = Phraser(phrases)

In [None]:
bigram.phrasegrams

In [None]:
'Jon lives in New York City'.split()

In [None]:
bigram['Jon lives in New York City'.split()]

#### Preprocess the Corpus

In [None]:
# Lowercase every sentence and drop single-character punctuation tokens.
# The punctuation lookup is built once, as a set, instead of being
# re-created for every token inside the loop.
punctuation = set(string.punctuation)
lower_sents = [
    [w.lower() for w in s if w not in punctuation]
    for s in gberg_sents
]

In [None]:
lower_sents[0:5]

In [None]:
lower_bigram = Phraser(Phrases(lower_sents))

In [None]:
lower_bigram.phrasegrams

In [None]:
lower_bigram['jon lives in new york city'.split()]

In [None]:
lower_bigram = Phraser(Phrases(lower_sents, min_count=20, threshold=50))

In [None]:
lower_bigram.phrasegrams

In [None]:
# Run each lowercased sentence through the bigram phraser.
clean_sents = [lower_bigram[sentence] for sentence in lower_sents]

In [None]:
clean_sents[0:10]

#### Run word2vec

In [None]:
# Train on the cleaned, bigram-merged sentences, then persist the model to disk.
model = Word2Vec(
    sentences=clean_sents,
    size=64,      # dimensionality of each word vector
    sg=1,
    window=10,    # 10 context words on each side, i.e. 20 in total
    min_count=5,
    seed=42,
    workers=2,
)
model.save('clean_gutenberg_model.w2v')

In [None]:
# We don't have to save the model if we don't want to. It's being done here as demonstration.
model = Word2Vec.load('clean_gutenberg_model.w2v')

In [None]:
len(model.wv.vocab)

#### Explore the Model

In [None]:
# Look the vector up on the KeyedVectors object (model.wv); indexing the
# Word2Vec model directly is deprecated in gensim 3.x and removed in 4.0.
model.wv['miss_taylor']

In [None]:
# Query through model.wv; Word2Vec.most_similar is deprecated in gensim 3.x
# and removed in 4.0.
model.wv.most_similar(positive=['miss_taylor', 'man'], negative=['woman'])

In [None]:
# Query through model.wv; Word2Vec.most_similar is deprecated in gensim 3.x
# and removed in 4.0.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

#### Reduce Dimensionality

In [None]:
# Stack the vector of every vocabulary word into one matrix. The lookup goes
# through model.wv because indexing the model directly is deprecated.
X = model.wv[model.wv.vocab]
# scikit-learn documents that n_iter should be at least 250 and rejects
# smaller values, so use the minimum rather than 200.
tsne = TSNE(n_components=2, n_iter=250)
X_2d = tsne.fit_transform(X)

In [None]:
# One row per vocabulary word: its 2-D t-SNE coordinates plus the token itself.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
# Materialise the vocabulary keys as a list before assigning; a dict_keys view
# is set-like and not reliably accepted as column data across pandas versions.
coords_df['token'] = list(model.wv.vocab)

In [None]:
coords_df.head()

In [None]:
coords_df.to_csv('clean_gutenberg_tsne.csv', index=False)

#### Visualise 2D representation of word vectors

In [None]:
# We don't have to reload the coordinates if we don't want to. It's being done here as demonstration.
# Read back the CSV written above into `coords_df`, the name the later cells use.
coords_df = pd.read_csv('clean_gutenberg_tsne.csv')
coords_df.head()

In [None]:
_ = coords_df.plot.scatter('x', 'y', figsize=(8,8), marker='.', s=10, alpha=0.2)

In [None]:
output_notebook()

In [None]:
# Plot a random subset of tokens as text labels at their 2-D coordinates.
# Cap the sample at the number of rows available: DataFrame.sample raises if
# asked for more rows than exist (without replacement).
sample_size = min(1000, len(coords_df))
subset_df = coords_df.sample(n=sample_size)
p = figure(plot_width=600, plot_height=600)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)
show(p)

In [None]:
# NOTE(review): per the Bokeh docs, output_file only configures where later
# show()/save() calls write; no show() follows in the visible notebook, so this
# presumably writes nothing by itself. The filename also has no '.html'
# extension — confirm both are intended.
output_file('vector_space')