In [None]:
from gensim.models import Word2Vec, FastText
import pandas as pd
import re

from sklearn.decomposition import PCA

from matplotlib import pyplot

import warnings
# Silence every warning for the rest of the session. No category is given, so
# this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Toy corpus: each sentence is a list of lowercase word tokens.
sentences = [
    'i like apple pie for dessert'.split(),
    'i dont drive fast cars'.split(),
    'data science is fun'.split(),
    'chocolate is my favorite'.split(),
    'my favorite movie is predator'.split(),
]

# Variant corpus: same as `sentences` from the second sentence onwards, but the
# first entry is one whole, un-split string inside a single-element list.
# NOTE(review): a model fed lists of tokens will treat that string as a single
# token -- confirm this is intentional.
sentences2 = [['I like apple pie']] + [list(tokens) for tokens in sentences[1:]]

In [None]:
# Fit Word2Vec on the variant corpus: 5-dimensional vectors, and min_count=1
# so that words seen only once are still kept. Print the model summary.
w2v = Word2Vec(sentences=sentences2, size=5, min_count=1)
print(w2v)

# Fit FastText with the same settings on the fully tokenised corpus.
ft = FastText(sentences=sentences, size=5, min_count=1)
print(ft)

In [None]:
# Collect the vocabulary each model learned, then print both word lists
# (Word2Vec first, FastText second).
words1 = [token for token in w2v.wv.vocab]
words2 = [token for token in ft.wv.vocab]
print(words1)
print(words2)

In [None]:
# Look up the learned vector for one word in each model.
# Index through `.wv` (the model's keyed vectors): subscripting the model
# object itself is deprecated in gensim 3.x and removed in gensim 4.0.
print(w2v.wv['chocolate'])
print(ft.wv['chocolate'])

In [None]:
# Project the Word2Vec vectors onto their first two principal components and
# label every point with its word.

# Take the word list once and use it for both the vectors and the labels, so
# row i of `result` is guaranteed to belong to words[i].
words = list(w2v.wv.vocab)

# Index through `.wv`; subscripting the model object directly is deprecated in
# gensim 3.x and removed in gensim 4.0.
X = w2v.wv[words]
pca = PCA(n_components=2)

result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))

pyplot.title('Word2Vec vectors (PCA projection)')
pyplot.xlabel('PC 1')
pyplot.ylabel('PC 2')
pyplot.show()

In [None]:
# Project the FastText vectors onto their first two principal components and
# label every point with its word.

# Take the word list once and use it for both the vectors and the labels, so
# row i of `result` is guaranteed to belong to words[i].
words = list(ft.wv.vocab)

# Index through `.wv`; subscripting the model object directly is deprecated in
# gensim 3.x and removed in gensim 4.0.
X = ft.wv[words]
pca = PCA(n_components=2)

result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))

pyplot.title('FastText vectors (PCA projection)')
pyplot.xlabel('PC 1')
pyplot.ylabel('PC 2')
pyplot.show()

In [None]:
# Load the emails dataset from `emails.csv` (path is relative to the working
# directory). Later cells read its `text` column.
df = pd.read_csv('emails.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Patterns used by `clean_text`, compiled once rather than on every row.
# A tag is matched in its literal form (<...>) and in its HTML-escaped form
# (&lt;...&gt;); `.` does not cross newlines, so a match stays on one line.
TAG_PATTERN = re.compile(r"(?:<|&lt;)/?.*?(?:>|&gt;)")
# Any run of characters that are not lowercase ASCII letters.
NON_LETTER_PATTERN = re.compile(r"[^a-z]+")


def clean_text(raw):
    """Normalise one document to lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. remove markup tags -- this must happen *before* punctuation is
         stripped, otherwise the angle brackets are already gone and the tag
         names end up in the output as ordinary words;
      3. replace every run of non-letter characters (digits, punctuation,
         whitespace, non-ASCII) with one space;
      4. trim leading/trailing whitespace.

    Parameters
    ----------
    raw : object
        The cell value to clean. Anything that is not a ``str`` (for example a
        missing value) yields an empty string.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(raw, str):
        return ""
    text = raw.lower()
    text = TAG_PATTERN.sub(" ", text)
    text = NON_LETTER_PATTERN.sub(" ", text)
    return text.strip()


# Iterate over the column's values directly instead of indexing by label with
# a positional counter, so this also works when the index is not 0..n-1.
clean_txt = [clean_text(raw) for raw in df['text']]


In [None]:
# Store the cleaned documents as a `clean` column next to the raw `text`.
df['clean'] = clean_txt

In [None]:
# Tokenise every cleaned document into a list of words (one list per row).
# `str.split()` with no argument splits on runs of whitespace and never yields
# empty strings. `split(" ")` turns a leading or trailing space into a ''
# token, which Word2Vec would then keep as a vocabulary word.
corpus = [doc.split() for doc in df['clean']]

In [None]:
# Train Word2Vec on the tokenised emails: 56-dimensional vectors, with
# min_count=1 so that no word is dropped for being rare.
model = Word2Vec(sentences=corpus, size=56, min_count=1)

In [None]:
# Project the email Word2Vec vectors onto their first two principal components
# and label every point with its word. `result` and `words` are reused by the
# cells that follow.

# Take the word list once and use it for both the vectors and the labels, so
# row i of `result` is guaranteed to belong to words[i].
words = list(model.wv.vocab)

# Index through `.wv`; subscripting the model object directly is deprecated in
# gensim 3.x and removed in gensim 4.0.
X = model.wv[words]

pca = PCA(n_components=2)

result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])

# NOTE(review): one annotation is drawn per vocabulary word; for a large
# vocabulary this is likely slow and unreadable.
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))

pyplot.title('Email Word2Vec vectors (PCA projection)')
pyplot.xlabel('PC 1')
pyplot.ylabel('PC 2')
pyplot.show()

In [None]:
# Tabulate the 2-D projection: coordinates in `x` / `y`, plus the word each
# row belongs to. Show the first rows as the cell output.
test = pd.DataFrame(result, columns=['x', 'y']).assign(word=words)
test.head()

In [None]:
import plotly.graph_objects as go
import numpy as np

# Interactive WebGL scatter of the 2-D word projection held in `test`.

# One colour value per plotted point. Sizing the array from the data (rather
# than a fixed 1,000,000) keeps unused values out of the figure.
N = len(test)
words = list(model.wv.vocab)
fig = go.Figure(data=go.Scattergl(
    x=test['x'],
    y=test['y'],
    # Markers only, so the `text` labels are not drawn on the canvas.
    mode='markers',
    marker=dict(
        # Random values: the colouring is decorative and carries no meaning.
        color=np.random.randn(N),
        colorscale='Viridis',
        line_width=1
    ),
    text=test['word'],
    textposition="bottom center"
))

fig.show()

In [None]:
# Preview the vocabulary instead of dumping every word into the cell output.
words[:50]
