# **Autoencoder Model for Word Embedding**

## I. Import necessary things

In [68]:
import tensorflow as tf
from tensorflow import keras
import re
import nltk
import numpy as np
from scipy.spatial import distance

# nltk.download('stopwords')
# nltk.download('punkt')

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [69]:
def text_cleaner(text):
    # lower case text
    text = text.lower()
    # remove punctuations
    text = re.sub("[^a-zA-Z]", " ", text)
    return text

In [70]:
def preprocess_text(corpus):
  corpus_clean = text_cleaner(corpus)
  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(corpus_clean)
  
  return [w for w in word_tokens if not w in stop_words]

## II. Load corpus and preprocess it

In [71]:
corpus = """Donald J. Trump is the 45th President of the United States. He believes the United States has incredible potential and will go on to exceed even its remarkable achievements of the past.
Donald J. Trump defines the American success story. Throughout his life he has continually set the standards of business and entrepreneurial excellence, especially in real estate, sports, and entertainment. Mr. Trump built on his success in private life when he entered into politics and public service. He remarkably won the Presidency in his first ever run for any political office.
A graduate of the University of Pennsylvania’s Wharton School of Finance, Mr. Trump followed in his father’s footsteps into the world of real estate development, making his mark in New York City. There, the Trump name soon became synonymous with the most prestigious of addresses in Manhattan and, subsequently, throughout the world.
Mr. Trump is also an accomplished author. He has written more than fourteen bestsellers.  His first book, The Art of the Deal, is considered a business classic.
Mr. Trump announced his candidacy for the Presidency on June 16, 2015. He then accepted the Republican nomination for President of the United States in July of 2016, having defeated 17 other contenders during the Republican primaries.
On November 8, 2016, Mr. Trump was elected President in the largest Electoral College landslide for a Republican in 28 years. Mr. Trump won more than 2,600 counties nationwide, the most since President Ronald Reagan in 1984. He received the votes of more than 62 million Americans, the most ever for a Republican candidate.
President Trump has delivered historic results in his first term in office despite partisan gridlock in the Nation’s Capital, and resistance from special interests and the Washington Establishment.
He passed record-setting tax cuts and regulation cuts, achieved energy independence, replaced NAFTA with the United-States-Mexico-Canada Agreement, invested $2 trillion to completely rebuild the Military, launched the Space Force, obliterated the ISIS Caliphate, achieved a major breakthrough for peace in the Middle East, passed the most significant Veterans Affairs reforms in half a century, confirmed over 250 federal judges, including 2 Supreme Court Justices, signed bipartisan Criminal Justice Reform, lowered drug prices, protected Medicare and Social Security, and secured our nation’s borders.
To vanquish the COVID-19 global pandemic, President Trump launched the greatest national mobilization since World War II. The Trump Administration enacted the largest package of financial relief in American history, created the most advanced testing system in the world, developed effective medical treatments to save millions of lives, and launched Operation Warp Speed to deliver a vaccine in record time and defeat the Virus.
President Trump has been married to his wife, Melania, for 15 years, and they are parents to their son, Barron. Mr. Trump also has four adult children, Don Jr., Ivanka, Eric, and Tiffany, as well as 10 grandchildren."""

In [72]:
corpus

'Donald J. Trump is the 45th President of the United States. He believes the United States has incredible potential and will go on to exceed even its remarkable achievements of the past.\nDonald J. Trump defines the American success story. Throughout his life he has continually set the standards of business and entrepreneurial excellence, especially in real estate, sports, and entertainment. Mr. Trump built on his success in private life when he entered into politics and public service. He remarkably won the Presidency in his first ever run for any political office.\nA graduate of the University of Pennsylvania’s Wharton School of Finance, Mr. Trump followed in his father’s footsteps into the world of real estate development, making his mark in New York City. There, the Trump name soon became synonymous with the most prestigious of addresses in Manhattan and, subsequently, throughout the world.\nMr. Trump is also an accomplished author. He has written more than fourteen bestsellers.  H

In [73]:
corpus_token =  preprocess_text(corpus)
print(corpus_token)

['donald', 'j', 'trump', 'th', 'president', 'united', 'states', 'believes', 'united', 'states', 'incredible', 'potential', 'go', 'exceed', 'even', 'remarkable', 'achievements', 'past', 'donald', 'j', 'trump', 'defines', 'american', 'success', 'story', 'throughout', 'life', 'continually', 'set', 'standards', 'business', 'entrepreneurial', 'excellence', 'especially', 'real', 'estate', 'sports', 'entertainment', 'mr', 'trump', 'built', 'success', 'private', 'life', 'entered', 'politics', 'public', 'service', 'remarkably', 'presidency', 'first', 'ever', 'run', 'political', 'office', 'graduate', 'university', 'pennsylvania', 'wharton', 'school', 'finance', 'mr', 'trump', 'followed', 'father', 'footsteps', 'world', 'real', 'estate', 'development', 'making', 'mark', 'new', 'york', 'city', 'trump', 'name', 'soon', 'became', 'synonymous', 'prestigious', 'addresses', 'manhattan', 'subsequently', 'throughout', 'world', 'mr', 'trump', 'also', 'accomplished', 'author', 'written', 'fourteen', 'bests

In [74]:
# dictionary is a list of all tokenized words in corpus_token sorted by alphabet
# it can be think of list of unique vocabularies in all documents
dictionary = sorted(list(set(corpus_token)))
# print(dictionary)

In [75]:
mapping = dict((c, i) for i, c in enumerate(dictionary))
# mapping

## III. Convert corpus to one-hot vectors & Define embedding dim

In [76]:
embedding_dim = 20
vocab_size = len(dictionary)
print(vocab_size)

229


In [77]:
corpus_encode = [mapping[x] for x in corpus_token]
onehot_corpus = keras.utils.to_categorical(corpus_encode, num_classes=vocab_size)
print(onehot_corpus)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## IV. Define Autoencoder model

In [78]:
ae_model = keras.Sequential()
# ae_model.add(keras.Input(shape=(vocab_size,)))
ae_model.add(keras.layers.Dense(embedding_dim, input_dim=vocab_size, activation='relu'))
ae_model.add(keras.layers.Dense(vocab_size, input_dim=embedding_dim, activation='softmax'))

ae_model.compile(optimizer='adam', loss='categorical_crossentropy')

ae_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 20)                4600      
_________________________________________________________________
dense_7 (Dense)              (None, 229)               4809      
Total params: 9,409
Trainable params: 9,409
Non-trainable params: 0
_________________________________________________________________


## V. Train AE model

In [79]:
ae_model.fit(x=onehot_corpus, y=onehot_corpus, batch_size=32, epochs=1000)
ae_model.save("ae_model.h5")

2/1000
Epoch 773/1000
Epoch 774/1000
Epoch 775/1000
Epoch 776/1000
Epoch 777/1000
Epoch 778/1000
Epoch 779/1000
Epoch 780/1000
Epoch 781/1000
Epoch 782/1000
Epoch 783/1000
Epoch 784/1000
Epoch 785/1000
Epoch 786/1000
Epoch 787/1000
Epoch 788/1000
Epoch 789/1000
Epoch 790/1000
Epoch 791/1000
Epoch 792/1000
Epoch 793/1000
Epoch 794/1000
Epoch 795/1000
Epoch 796/1000
Epoch 797/1000
Epoch 798/1000
Epoch 799/1000
Epoch 800/1000
Epoch 801/1000
Epoch 802/1000
Epoch 803/1000
Epoch 804/1000
Epoch 805/1000
Epoch 806/1000
Epoch 807/1000
Epoch 808/1000
Epoch 809/1000
Epoch 810/1000
Epoch 811/1000
Epoch 812/1000
Epoch 813/1000
Epoch 814/1000
Epoch 815/1000
Epoch 816/1000
Epoch 817/1000
Epoch 818/1000
Epoch 819/1000
Epoch 820/1000
Epoch 821/1000
Epoch 822/1000
Epoch 823/1000
Epoch 824/1000
Epoch 825/1000
Epoch 826/1000
Epoch 827/1000
Epoch 828/1000
Epoch 829/1000
Epoch 830/1000
Epoch 831/1000
Epoch 832/1000
Epoch 833/1000
Epoch 834/1000
Epoch 835/1000
Epoch 836/1000
Epoch 837/1000
Epoch 838/1000
Epo

# **How to use trained model**

## VI. Load trained model & Get output of 1st FC layer

In [84]:
reconstructed_model = keras.models.load_model("ae_model.h5")
we_model = keras.models.Model(inputs=reconstructed_model.inputs, outputs=reconstructed_model.get_layer('dense_7').output)

In [85]:
def encode_onehot(mapping, list_words):
  output = []
  
  for word in list_words:
    word_vector = [0 for _ in range(vocab_size)]

    if word in mapping.keys():
      word_index = mapping[word]
      word_vector[word_index] = 1
    
    output.append(word_vector)
  
  return output

## VII. Test model

In [86]:
input_sentence = "Trump is the greatest President of American"

preprocess_sentence = preprocess_text(input_sentence)
onehot_sentence = encode_onehot(mapping, preprocess_sentence)

embedded_sentence = we_model.predict(onehot_sentence)
print(embedded_sentence)

[[1.93307301e-10 8.34066358e-11 1.52166649e-07 4.79718835e-07
  5.68480159e-11 1.79508852e-10 4.64595810e-13 5.08789246e-13
  4.50998243e-08 1.09208786e-09 7.04806746e-09 1.11675326e-05
  3.79410420e-10 7.03189153e-06 2.86077517e-09 1.06475284e-09
  1.57708213e-09 2.80699353e-10 7.11554649e-10 5.11228322e-08
  2.87175145e-10 1.64380261e-08 4.07144496e-09 3.82752640e-13
  8.29345055e-08 8.10878795e-11 7.86554877e-10 5.06579119e-11
  1.15511810e-11 2.81668466e-10 3.90626637e-12 1.67684084e-06
  7.14150581e-13 6.20018836e-09 1.03005625e-10 7.01199168e-11
  1.53181944e-14 1.39080275e-11 3.83727745e-13 4.93147678e-13
  1.86924911e-12 6.68042921e-12 1.80362825e-09 5.39019551e-10
  7.63881958e-10 3.04423316e-13 1.38761125e-09 1.98367503e-11
  6.36973425e-08 7.74652131e-12 1.29740108e-09 2.04529046e-10
  1.68020209e-12 2.26358918e-10 1.23812210e-11 9.94884175e-11
  2.78199312e-08 1.46329583e-14 9.69754197e-08 1.70593747e-10
  4.96443562e-08 1.00964390e-10 1.06407026e-06 1.18789336e-11
  5.9483

In [87]:
word_0 = "American"
word_1 = "Trump"
word_2 = "peace"

preprocess_words = preprocess_text(' '.join([word_0, word_1, word_2]))
onehot_words = encode_onehot(mapping, preprocess_words)

word_0_eb, word_1_eb, word_2_eb = we_model.predict(onehot_words)

print("OUTPUT EMBEDDING")
print(word_0_eb)
print(word_1_eb)
print(word_2_eb)

dst_0_1 = distance.euclidean(word_0_eb, word_1_eb)
dst_1_2 = distance.euclidean(word_1_eb, word_2_eb)
dst_2_0 = distance.euclidean(word_2_eb, word_0_eb)

print("OUTPUT DISTANCE")
print("0 vs 1: ", dst_0_1)
print("1 vs 2: ", dst_1_2)
print("2 vs 0: ", dst_2_0)

OUTPUT EMBEDDING
[5.15289766e-09 3.47504100e-12 1.46682908e-07 8.31000957e-09
 3.42772338e-12 1.85682558e-09 1.93532495e-12 1.77080781e-10
 2.39461372e-07 9.62842748e-08 1.03666205e-08 9.99736130e-01
 2.35075213e-08 2.45505507e-05 2.98675279e-10 3.52947438e-07
 2.91715576e-08 1.22094601e-08 1.51009527e-09 3.18424791e-05
 2.31781030e-08 5.38037768e-13 4.31523567e-11 9.51748369e-09
 1.49888692e-07 6.46766390e-11 1.42435397e-07 8.17413899e-13
 2.09107134e-10 2.88423396e-10 4.63647837e-07 1.20904611e-08
 5.13895263e-12 1.64953362e-05 4.62300684e-11 2.43384982e-11
 5.48536402e-14 3.74416997e-10 5.88802451e-10 5.32076716e-10
 1.60343649e-13 2.27587844e-11 1.67082380e-05 1.92418716e-12
 5.65612973e-11 5.16638021e-09 1.94438670e-08 4.83424323e-08
 5.36768141e-08 1.08396229e-07 1.13993690e-06 1.96026466e-08
 7.12117709e-15 1.11710037e-07 4.21817081e-10 1.86842328e-07
 5.58059867e-07 2.31604472e-10 2.79354606e-09 8.84160190e-10
 4.33175182e-06 2.74938543e-06 1.20321620e-05 1.33426560e-11
 3.0754