 # Best Practices for Preprocessing Natural Language Data
In this notebook, we improve the quality of our Project Gutenberg word vectors by adopting best practices for preprocessing natural language data.

# Load dependencies

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline

In [None]:
# Fetch the 'punkt' resource through NLTK's downloader.
# NOTE(review): presumably required for sentence splitting of the corpus below — confirm.
nltk.download('punkt')

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot

In [None]:
# Fetch the 'stopwords' resource that backs `nltk.corpus.stopwords`,
# which is used later to build the stop-word list.
nltk.download('stopwords')

# Load data

In [None]:
# Fetch the 'gutenberg' resource that backs `nltk.corpus.gutenberg`.
nltk.download('gutenberg')

In [None]:
from nltk.corpus import gutenberg
# Sentences of the Gutenberg corpus; the cells below index each sentence
# as a sequence of string tokens.
gberg_sents = gutenberg.sents()

# Iteratively preprocess a sentence:

In [None]:
# Show the sentence at index 4; it is the running example for the
# preprocessing steps that follow.
gberg_sents[4]

# to lowercase:

In [None]:
# Lowercase every token of the example sentence.
list(map(str.lower, gberg_sents[4]))

# remove stopwords and punctuation:

In [None]:
# Tokens to discard: NLTK's English stop words followed by every single
# ASCII punctuation character.
stpwrds = [*stopwords.words('english'), *string.punctuation]

In [None]:
# Inspect the combined stop-word / punctuation list.
stpwrds

In [None]:
# Lowercase each token and drop stop words and punctuation.
# The membership test uses the *lowercased* token: the stop-word list holds
# lowercase entries, so testing the raw token would let capitalised stop
# words (e.g. a sentence-initial "The") slip through.
[w.lower() for w in gberg_sents[4] if w.lower() not in stpwrds]

# stem words:

In [None]:
# Porter stemmer instance, used by the stemming example below.
stemmer = PorterStemmer()

In [None]:
# Lowercase, drop stop words and punctuation, then stem what remains.
# The filter compares the lowercased token so that capitalised stop words
# are removed as well (the stop-word list holds lowercase entries).
[stemmer.stem(w.lower()) for w in gberg_sents[4] if w.lower() not in stpwrds]

# handle bigram collocations:

In [None]:
# Fit a gensim Phrases model on the raw sentences (original casing,
# punctuation still present) with default parameters.
phrases = Phrases(gberg_sents)

In [None]:
# Wrap the fitted Phrases model in a Phraser, which is what the cells below
# use to look up and apply the detected bigrams.
bigram = Phraser(phrases)

In [None]:
# Inspect the bigrams the model detected.
# NOTE(review): the exact structure of `phrasegrams` depends on the gensim version — confirm.
bigram.phrasegrams

In [None]:
# Whitespace-tokenise a test sentence (before any phrase merging).
"Jon lives in New York City".split()

In [None]:
# Apply the bigram model to the same tokens to see which adjacent pairs it merges.
bigram["Jon lives in New York City".split()]

# Preprocess the corpus

In [None]:
# Lowercase the whole corpus and drop tokens that are a single punctuation
# character. Stop words are deliberately kept here, as in the original cell.
# The punctuation set is built once up front instead of rebuilding a list for
# every token, and set membership is O(1). A token matches only if it equals
# one punctuation character (same semantics as the original list lookup).
punctuation_chars = set(string.punctuation)
lower_sents = [
    [w.lower() for w in s if w not in punctuation_chars]
    for s in gberg_sents
]

In [None]:
# Spot-check the first five cleaned sentences.
lower_sents[0:5]

In [None]:
# Fit a bigram model on the lowercased, punctuation-free sentences using
# default Phrases parameters.
lower_bigram = Phraser(Phrases(lower_sents))

In [None]:
# Inspect the bigrams detected on the lowercased corpus.
lower_bigram.phrasegrams

In [None]:
# Apply the lowercase bigram model to a lowercase test sentence.
lower_bigram["jon lives in new york city".split()]

In [None]:
# Refit the bigram model with explicit min_count=32 and threshold=64, replacing
# the default-parameter model bound to `lower_bigram` above, then inspect it.
# NOTE(review): per the gensim docs both values are stricter than the defaults,
# so fewer bigrams should be kept — confirm against the pinned gensim version.
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))
lower_bigram.phrasegrams

In [None]:
# Run every lowercased sentence through the bigram model so that detected
# word pairs are merged into single tokens.
clean_sents = [lower_bigram[sentence] for sentence in lower_sents]

In [None]:
# Spot-check the first nine fully preprocessed sentences.
clean_sents[0:9]

In [None]:
# Look at the preprocessed sentence at index 6 on its own.
clean_sents[6]

# Run word2vec

In [None]:
# Train word2vec on the preprocessed sentences and save the model to the
# current working directory.
# Arguments as passed: size=64, sg=1, window=10, min_count=10, seed=42, workers=8.
# NOTE(review): `size=` is the gensim 3.x spelling (gensim 4 renamed it to
# `vector_size`) — confirm the pinned gensim version.
# NOTE(review): gensim documents that a fixed seed alone does not make training
# reproducible when several worker threads are used — confirm whether that matters here.
model = Word2Vec(sentences=clean_sents, size=64, sg=1, window=10, min_count=10, seed=42, workers=8)
model.save('clean_gutenberg_model.w2v')

# Explore model

In [None]:
# Load a saved model from the input dataset directory, rebinding `model`.
# NOTE(review): the training cell saves to 'clean_gutenberg_model.w2v' in the
# working directory, while this reads from '../input/dataset/' — confirm the
# two files are meant to be the same model.
model = gensim.models.Word2Vec.load('../input/dataset/clean_gutenberg_model.w2v')

In [None]:
# Number of entries in the model's vocabulary.
len(model.wv.vocab)

In [None]:
# Look up the vector for the bigram token 'ma_am'.
# Goes through `model.wv` (as the vocabulary access elsewhere in this notebook
# does): indexing the model object directly is deprecated and was removed in gensim 4.
model.wv['ma_am']

In [None]:
# Nearest neighbours of 'ma_am' in the embedding space.
# Called on `model.wv`: `most_similar` on the model object itself is
# deprecated and was removed in gensim 4.
model.wv.most_similar('ma_am')

In [None]:
# Analogy query: 'ma_am' + 'man' - 'woman'.
# Called on `model.wv`: `most_similar` on the model object itself is
# deprecated and was removed in gensim 4.
model.wv.most_similar(positive=['ma_am', 'man'], negative=['woman'])

In [None]:
# Analogy query: 'father' + 'woman' - 'man'.
# Called on `model.wv`: `most_similar` on the model object itself is
# deprecated and was removed in gensim 4.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

# Reduce word vector dimensionality with t-SNE

In [None]:
# Configure t-SNE to reduce the word vectors to 2 components, with n_iter=1000.
# NOTE(review): no random_state is passed, so the layout is presumably not
# repeatable between runs — confirm whether a fixed seed is wanted.
tsne = TSNE(n_components=2, n_iter=1000)

In [None]:
# Project the vector of every vocabulary token down to two dimensions.
# Vectors are fetched via `model.wv`: indexing the model object directly is
# deprecated and was removed in gensim 4. Iterating the vocab yields its tokens,
# so the rows come out in vocabulary order.
X_2d = tsne.fit_transform(model.wv[model.wv.vocab])

In [None]:
# Put the 2-D coordinates in a DataFrame and label each row with its token.
# The tokens are materialised as a list, in the same vocabulary iteration order
# used to build X_2d, rather than relying on pandas to coerce a dict-keys view.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
coords_df['token'] = list(model.wv.vocab.keys())

In [None]:
# Preview the first rows of the coordinate table.
coords_df.head()

In [None]:
# Write the coordinates and tokens to CSV in the working directory, without
# the DataFrame index column.
coords_df.to_csv('clean_gutenberg_tsne.csv', index=False)

# Visualise

In [None]:
# Load t-SNE coordinates from the input dataset directory, rebinding `coords_df`.
# NOTE(review): the cell above writes 'clean_gutenberg_tsne.csv' to the working
# directory, while this reads from '../input/dataset/' — confirm they match.
coords_df = pd.read_csv('../input/dataset/clean_gutenberg_tsne.csv')

In [None]:
# Static scatter plot of every token's 2-D position. The result is bound to
# `_` so the notebook does not echo the returned object.
_ = coords_df.plot.scatter(
    x='x',
    y='y',
    figsize=(12, 12),
    marker='.',
    s=10,
    alpha=0.2,
)