In [None]:
from gensim.models import Word2Vec, FastText
import pandas as pd
import re

from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
import plotly.graph_objects as go

import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
#!pip install gensim


In [None]:
# Toy corpus: five short, already-tokenized sentences used to demo the embedding models.
sentences = [
    text.split()
    for text in (
        'i like apple pie for dessert',
        'i dont drive fast cars',
        'data science is fun',
        'chocolate is my favorite',
        'my favorite movie is predator',
    )
]


In [None]:
# Fit a Word2Vec model on the toy corpus: 5-dimensional vectors, no word
# dropped for being rare (min_count=1).
w2v = Word2Vec(sentences=sentences, size=5, min_count=1)
print(w2v)

# Fit a FastText model with the same settings so the two can be compared.
ft = FastText(sentences=sentences, size=5, min_count=1)
print(ft)

In [None]:
# List every word the Word2Vec model learned, then show the raw vocab mapping.
words1 = [token for token in w2v.wv.vocab]
print(words1)
w2v.wv.vocab

In [None]:
# Access the vector for one word in each model.
# Look vectors up through `.wv` (the KeyedVectors): indexing the model object
# itself (`w2v['chocolate']`) is deprecated in gensim 3.x.
print(w2v.wv['chocolate'])
print(ft.wv['chocolate'])

In [None]:
# Project the Word2Vec vectors of the toy corpus to 2-D with PCA and plot them.
# Build the word list once so the rows of X and the annotation labels are
# guaranteed to share one ordering; go through `.wv` because indexing the model
# object directly is deprecated.
words = list(w2v.wv.vocab)
X = w2v.wv[words]

pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection, one labelled point per word
plt.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))

plt.show()

In [None]:
# Project the FastText vectors of the toy corpus to 2-D with PCA and plot them.
# Build the word list once so the rows of X and the annotation labels are
# guaranteed to share one ordering; go through `.wv` because indexing the model
# object directly is deprecated.
words = list(ft.wv.vocab)
X = ft.wv[words]

pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection, one labelled point per word
# (matplotlib's pyplot is imported as `plt` in this notebook; the bare name
# `pyplot` is never defined and raised NameError here)
plt.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))

plt.show()

In [None]:
# Load the email dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='emails.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Pre-compiled patterns for the clean-up below.
# Matches an opening or closing markup tag, written either raw (<b>) or
# HTML-escaped (&lt;b&gt;).
TAG_PATTERN = re.compile(r"(?:<|&lt;)/?.*?(?:>|&gt;)")
# Matches any run of characters other than lowercase ASCII letters
# (digits, punctuation, whitespace, non-ASCII characters).
NON_LETTER_PATTERN = re.compile(r"[^a-z]+")


def clean_text(text):
    """Normalise one raw document to lowercase ASCII words separated by single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lowercased text with markup tags removed, every run of non-letter
        characters collapsed to one space, and no leading/trailing whitespace.
    """
    lowered = text.lower()
    # Tags have to go first: once '<', '>', '&' and ';' have been replaced by
    # spaces (as the letters-only filter does) a tag pattern can never match.
    without_tags = TAG_PATTERN.sub(" ", lowered)
    return NON_LETTER_PATTERN.sub(" ", without_tags).strip()


# Iterate over the column's values rather than looking rows up by the labels
# 0..n-1, which raises KeyError whenever the frame does not carry a default
# RangeIndex. Missing text is treated as an empty document.
clean_txt = [clean_text(raw) for raw in df['text'].fillna('')]

df['clean'] = clean_txt

In [None]:
# Preview the first five rows again, now including the 'clean' column.
df.head(n=5)

In [None]:
# Tokenize every cleaned document into a list of words: one token list per row.
# `str.split()` with no argument splits on any whitespace run and drops empty
# strings; `split(" ")` turned leading/trailing spaces into '' tokens, which
# then became a spurious vocabulary entry in the embedding model.
corpus = [doc.split() for doc in df['clean']]
corpus[0:1]

In [None]:
# Train Word2Vec on the tokenized email corpus: 56-dimensional vectors,
# keeping every word regardless of frequency (min_count=1).
model = Word2Vec(sentences=corpus, size=56, min_count=1)

In [None]:
# Project the email-corpus Word2Vec vectors to 2-D with PCA and plot them.
# Build the word list once so the rows of X and the annotation labels are
# guaranteed to share one ordering; go through `.wv` because indexing the model
# object directly is deprecated.
words = list(model.wv.vocab)
X = model.wv[words]

pca = PCA(n_components=2)

result = pca.fit_transform(X)

# create a scatter plot of the projection
# (matplotlib's pyplot is imported as `plt` in this notebook; the bare name
# `pyplot` is never defined and raised NameError here)
plt.scatter(result[:, 0], result[:, 1])

# NOTE(review): this labels every vocabulary word; for a large corpus that may
# be slow and unreadable -- consider annotating only a subset.
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))

plt.show()

In [None]:
# Build the word list in this cell instead of relying on a `words` variable
# left over from an earlier cell: a stale list of a different length would
# make the 'word' column assignment below fail or mislabel points.
words = list(model.wv.vocab)

# pass the embeddings to PCA (look vectors up through `.wv`; indexing the
# model object directly is deprecated)
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create df from the pca results
pca_df = pd.DataFrame(result, columns=['x', 'y'])

# add the words for the hover effect (row i of `result` belongs to words[i])
pca_df['word'] = words
pca_df.head()

In [None]:
# Interactive scatter of the PCA projection; hovering a marker shows its word.
words = list(model.wv.vocab)

# One random colour value per plotted point. The previous hard-coded
# N = 1000000 generated a million values regardless of how many points exist,
# all of which were embedded in the figure.
N = len(pca_df)

fig = go.Figure(data=go.Scattergl(
    x=pca_df['x'],
    y=pca_df['y'],
    mode='markers',
    marker=dict(
        color=np.random.randn(N),
        colorscale='Viridis',
        line_width=1
    ),
    text=pca_df['word'],
    textposition="bottom center"
))

fig.show()

In [None]:
# Nearest neighbours of 'eric' in the embedding space.
model.wv.most_similar(positive=['eric'])

In [None]:
# Nearest neighbours of 'start' in the embedding space.
model.wv.most_similar(positive=['start'])

In [None]:
# Analogy-style query: words close to 'phone' and 'number' but far from 'call'.
model.wv.most_similar_cosmul(negative=['call'], positive=['phone', 'number'])

In [None]:
# Ask the model which of the four words fits least with the others.
model.wv.doesnt_match(['phone', 'number', 'prison', 'cell'])

In [None]:
# Write the trained vectors to disk in the plain-text (non-binary) word2vec format.
file = 'email_embd.txt'
model.wv.save_word2vec_format(fname=file, binary=False)

In [None]:
# import os
# embeddings_index = {}

# f = open(os.path.join('', 'email_embd.txt'), encoding = 'utf-8')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

In [None]:
#embeddings_index

In [None]:
#https://www.kaggle.com/reiinakano/basic-nlp-bag-of-words-tf-idf-word2vec-lstm
import nltk

class MyTokenizer:
    """Scikit-learn-style transformer that splits raw documents into NLTK word tokens.

    The tokenizer is stateless: ``fit`` learns nothing and ``fit_transform``
    simply delegates to ``transform``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for the fit/transform protocol)."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            1-D object array with one entry per document; each entry is a 1-D
            array holding that document's word tokens in order.
        """
        tokenized_docs = []
        for document in X:
            tokens = []
            # Split into sentences first, then into words within each sentence.
            for sent in nltk.sent_tokenize(document):
                tokens.extend(nltk.word_tokenize(sent))
            tokenized_docs.append(np.array(tokens))

        # Fill an object array element by element. Calling np.array() on a list
        # of arrays with differing lengths raises ValueError on NumPy >= 1.24,
        # and collapses to a 2-D string array when the lengths happen to match.
        result = np.empty(len(tokenized_docs), dtype=object)
        for i, tokens in enumerate(tokenized_docs):
            result[i] = tokens
        return result

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its words' embedding vectors.

    Parameters
    ----------
    word2vec : trained gensim model
        Any model exposing its vectors through ``.wv`` (word lookup with
        ``wv[word]``, membership with ``word in wv``, and ``wv.vector_size``).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors.
        # `vector_size` replaces reading the length of `wv.syn0[0]`; `syn0` is
        # a deprecated alias in gensim 3.x.
        self.dim = word2vec.wv.vector_size

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for the fit/transform protocol)."""
        return self

    def transform(self, X):
        """Tokenize the raw documents in ``X`` and average their word vectors.

        Words missing from the model are skipped. A document with no known
        words maps to a zero vector of length ``self.dim``.

        Returns
        -------
        numpy.ndarray
            One row per document.
        """
        X = MyTokenizer().fit_transform(X)
        keyed_vectors = self.word2vec.wv

        doc_vectors = []
        for words in X:
            known = [keyed_vectors[w] for w in words if w in keyed_vectors]
            if known:
                doc_vectors.append(np.mean(known, axis=0))
            else:
                doc_vectors.append(np.zeros(self.dim))
        return np.array(doc_vectors)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)


In [None]:
def transform(X, word2vec=None):
    """Average the word vectors of each already-tokenized document in ``X``.

    Standalone counterpart of ``MeanEmbeddingVectorizer.transform`` that skips
    the tokenization step.

    Parameters
    ----------
    X : iterable
        Documents. Each item is either an iterable of tokens or a single
        string, which is treated as a one-token document (not as a sequence of
        characters).
    word2vec : trained gensim model, optional
        Model whose ``.wv`` supplies the vectors. Defaults to the notebook's
        global ``model``.

    Returns
    -------
    numpy.ndarray
        One row per document; documents with no known token map to zeros.
    """
    # The previous body referenced `self.word2vec` / `self.dim`, which do not
    # exist in a module-level function and raised NameError on the first call.
    if word2vec is None:
        word2vec = model
    keyed_vectors = word2vec.wv
    dim = keyed_vectors.vector_size

    doc_vectors = []
    for words in X:
        tokens = [words] if isinstance(words, str) else words
        known = [keyed_vectors[w] for w in tokens if w in keyed_vectors]
        if known:
            doc_vectors.append(np.mean(known, axis=0))
        else:
            doc_vectors.append(np.zeros(dim))
    return np.array(doc_vectors)

# Run the standalone transform() over `words`, the vocabulary list assigned in an earlier cell.
t = transform(words)

In [None]:
# MeanEmbeddingVectorizer tokenizes with nltk.sent_tokenize / word_tokenize,
# which need NLTK's 'punkt' data. Fetch it here so this cell also runs on a
# fresh kernel (the call is a quick no-op check when the data is already present).
nltk.download('punkt', quiet=True)

# Turn every cleaned email into the mean of its word vectors.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(model)
mean_embedded = mean_embedding_vectorizer.fit_transform(df['clean'])

In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): presumably this is the data behind nltk.sent_tokenize / word_tokenize
# used by MyTokenizer -- on a fresh kernel it has to be available before any cell that tokenizes.
nltk.download('punkt')

In [None]:
# Inspect the first row of the array returned by fit_transform (the first document's vector).
mean_embedded[0]

In [None]:
# Store each document's mean-embedding row as one cell of a new 'array' column.
df['array'] = [row for row in mean_embedded]

In [None]:
# Display the DataFrame, which by now carries the added 'clean' and 'array' columns.
df