In [None]:

###################################
##### Preparation Stage ###########
###################################

# Loading the IMDB dataset
#******************************
from tensorflow.keras.datasets import imdb

# num_words=10000 restricts the vocabulary to the 10,000 most frequent words.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Only the LAST expression of a notebook cell is echoed, so values inspected
# in the middle of a cell have to be printed explicitly to become visible.
print("first review (word indices):", train_data[0])
print("first label:", train_labels[0])

# Sanity check: the largest word index should stay below num_words.
print("largest word index:", max(max(sequence) for sequence in train_data))


# Decoding reviews back to text - just to be able to see the original text
#************************************************************************
word_index = imdb.get_word_index()

# Invert the word -> index mapping so that indices can be looked up.
reverse_word_index = {index: word for word, index in word_index.items()}

# The stored indices are offset by 3 relative to word_index (Keras reserves
# the lowest indices for special tokens); anything unknown becomes "?".
decoded_first_review = " ".join(reverse_word_index.get(i - 3, "?") for i in train_data[0])

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(decoded_first_review)

#Preparing the data
#Encoding the integer sequences via multi-hot encoding
#*****************************************************
import numpy as np
def vectorize_sequences(sequences, dimension=10000, dtype=float):
    """Multi-hot encode a collection of integer sequences.

    Args:
        sequences: sized collection of iterables of integer indices; every
            index must be a valid column index for ``dimension``.
        dimension: number of columns of the encoded matrix.
        dtype: element type of the returned array. Defaults to ``float``
            (float64, the previous behaviour); passing ``"float32"`` halves
            the memory footprint of the result.

    Returns:
        A 2-D array of shape ``(len(sequences), dimension)`` in which
        ``results[i, j]`` is 1 when ``j`` occurs in ``sequences[i]`` and 0
        otherwise. Repeated indices still yield a single 1.

    Raises:
        IndexError: if an index is out of bounds for ``dimension``.
    """
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # One indexed assignment per row replaces the per-element Python loop;
        # the explicit integer dtype keeps empty sequences valid as an index.
        results[i, np.fromiter(sequence, dtype=np.intp)] = 1.
    return results
# Multi-hot encode both splits using the default encoding width.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# The first review after multi-hot encoding. Printed explicitly because a
# bare expression in the middle of a cell is never displayed.
print(x_train[0])

# Labels converted to float32 arrays.
y_train = np.asarray(train_labels).astype("float32")
y_test = np.asarray(test_labels).astype("float32")

# Building your model
# Model definition
#******************************
from tensorflow import keras
from tensorflow.keras import layers


###############################################
##### Hyper Parameters tuning phase ###########
###############################################

# Tunable layer widths for the two hidden layers.
first_layer_size = 4
second_layer_size = 4

model = keras.Sequential([
    layers.Dense(first_layer_size, activation="relu"),
    layers.Dense(second_layer_size, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

# Show the structure of the weights,
# this is not a mandatory stage
# **********************************
# Build with an unspecified batch dimension: passing x_train.shape as-is
# would bake the number of training samples into the model as the batch
# size, which then shows up in every "Output Shape" of the summary.
model.build((None, x_train.shape[1]))
model.summary()


# Compiling the model
#******************************
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

# Validating your approach
# Setting aside a validation set
#*******************************
# The first 10,000 samples are held out for validation; everything after
# them is what the model is actually trained on during tuning.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]


# Training your model
#*******************************
# Fit on the reduced training set and monitor the held-out validation set
# after every epoch.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

# history.history maps each tracked quantity to its per-epoch values.
history_dict = history.history

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(history_dict.keys())

# Plotting the training and validation loss
#******************************************
import matplotlib.pyplot as plt

# Per-epoch loss curves recorded by fit(); epochs are numbered from 1.
history_dict = history.history
loss_values = history_dict["loss"]
val_loss_values = history_dict["val_loss"]
epochs = range(1, len(loss_values) + 1)

# Draw on the current axes: dots for training, a solid line for validation.
loss_ax = plt.gca()
loss_ax.plot(epochs, loss_values, "bo", label="Training loss")
loss_ax.plot(epochs, val_loss_values, "b", label="Validation loss")
loss_ax.set_title("Training and validation loss")
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
plt.show()


# Plotting the training and validation accuracy
#**********************************************
# Start from a cleared figure so nothing from a previous plot is reused.
plt.clf()
acc = history_dict["accuracy"]
val_acc = history_dict["val_accuracy"]

# Same layout as the loss plot: dots for training, a solid line for validation.
acc_ax = plt.gca()
acc_ax.plot(epochs, acc, "bo", label="Training acc")
acc_ax.plot(epochs, val_acc, "b", label="Validation acc")
acc_ax.set_title("Training and validation accuracy")
acc_ax.set_xlabel("Epochs")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend()
plt.show()


#################################################
##### Test the result on Test DataSet ###########
#################################################

# Replace the hyper parameters :
#   number of neurons in each layer
#   number of layers
#   activation function
#   number of epochs

# Retraining a model from scratch to the optimal point before overfitting
#********************************
# A fresh model with the chosen hyper parameters, trained on the FULL
# training set (no validation hold-out) for the chosen number of epochs.
model = keras.Sequential([
    layers.Dense(first_layer_size, activation="relu"),
    layers.Dense(second_layer_size, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.fit(x_train, y_train, epochs=4, batch_size=512)

# evaluate() reports the loss followed by the metrics given to compile().
results = model.evaluate(x_test, y_test)

print("***test results***")
# Printed explicitly: as a bare expression in the middle of the cell the
# results were never displayed below their header.
print(results)

# Using a trained model to generate predictions on new data
#**********************************************************
print("**** test prediction *****")
# Last expression of the cell, so the notebook echoes the predictions.
model.predict(x_test)




In [None]:
#https://github.com/nitwmanish/An-Intuitive-Introduction-Of-Word2Vec-By-Building-A-Word2Vec-From-Scratch/blob/master/An-Intuitive-Introduction-Of-Word2Vec-By-Building-A-Word2Vec-From-Scratch.ipynb

import numpy as np

# Toy corpus: four sentences of four words each.
docs = [
    "I like watching movie",
    "I enjoy watching movie",
    "I like viewing movie",
    "I like hearing song",
]
sentance_length = 4


#docs[0][0].split()

# Learn the vocabulary that backs the one-hot / count encoding.
# The token pattern matches words of one or more characters, so the
# single-letter word "I" is kept as a token.

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r"\b\w+\b", min_df=0.0)
vectorizer.fit(docs)
print(vectorizer.vocabulary_)

# Encode the documents and summarise the encoded matrix.
vector = vectorizer.transform(docs)
for label, value in (
    ("shape", vector.shape),
    ("type", type(vector)),
    ("values", vector.toarray()),
):
    print(label, value)

# create the input and output dataset
# For every word position in a sentence, that word is the target (y) and the
# remaining words of the same sentence are the context (x).
x = []
y = []
for doc in docs:
    # Split once per sentence instead of once per inner-loop iteration.
    tokens = doc.split()
    # Iterate over the sentence's real token count rather than a fixed
    # sentence length, so shorter sentences do not raise IndexError and
    # longer ones do not silently lose their trailing words.
    for j, target in enumerate(tokens):
        x.append(tokens[:j] + tokens[j + 1:])
        y.append([target])

# Re-join the token lists into strings, the form the vectorizer expects.
x2 = [' '.join(context) for context in x]
y2 = [' '.join(target) for target in y]

# Transform the input (context) and output (target) strings into vectors,
# first as sparse matrices and then as dense arrays for training.
vector_x = vectorizer.transform(x2)
vector_y = vectorizer.transform(y2)

vx, vy = vector_x.toarray(), vector_y.toarray()

from keras import backend
from keras.layers import Dense, Input
from keras.models import Model, Sequential

vocSize = len(vectorizer.vocabulary_)  # Vocabulary size
hidden_layer_size = 2  # width of the hidden layer

# Reset Keras' global state before creating the layers.
backend.clear_session()

# Network: vocSize inputs -> hidden_layer_size linear units -> vocSize
# sigmoid outputs.
input_layer = Input(shape=(vocSize,))
hidden_layer = Dense(hidden_layer_size, activation='linear')(input_layer)
output_layer = Dense(vocSize, activation='sigmoid')(hidden_layer)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

# Train silently on the dense context -> target arrays.
model.fit(vx, vy, epochs=1000, batch_size=4,verbose=0)

# Predict on the same dense array the model was trained on (rather than the
# SciPy sparse matrix) and print the result: a bare expression in the middle
# of a cell is evaluated but never displayed.
print(model.predict(vx))

#[list(vectorizer.vocabulary_.keys())[0]]

#vectorizer.transform([list(vectorizer.vocabulary_.keys())[1]]).toarray()

# Word vectors are read from the hidden layer: this sub-model takes the same
# input as the trained model but stops at the hidden layer's output.
intermediate_layer_model = Model(model.input, hidden_layer)
layer_name = 'dense'  # not referenced by the code visible in this cell

# Weights of the sub-model (kept in `w` for inspection).
w = intermediate_layer_model.get_weights()

# Extract the hidden-layer output obtained when the one-hot-encoded version
# of each vocabulary word is passed as input.
# Materialise the vocabulary once instead of rebuilding the list twice per
# loop iteration.
words = list(vectorizer.vocabulary_.keys())

# Encoding each word as its own one-word document yields one one-hot row per
# word, so all words go through a single predict() call instead of one call
# per word.
one_hot_rows = vectorizer.transform(words).toarray()
embeddings = intermediate_layer_model.predict(one_hot_rows)

# Keep the previous layout: one (1, hidden_layer_size) array per word.
wordVec = [row.reshape(1, -1) for row in embeddings]
xval = [vec[0][0] for vec in wordVec]
yval = [vec[0][1] for vec in wordVec]

# 2D scatter of the word vectors, each point labelled with its word
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure

figure(figsize=(10, 10), dpi=80)
embedding_ax = plt.gca()
embedding_ax.set_xlabel("Dim 1", fontsize=15)
embedding_ax.set_ylabel("Dim 2", fontsize=15)
embedding_ax.set_xlim([-4, 4])
embedding_ax.set_ylim([-4, 4])

# Write every word next to its coordinates.
for x_pos, y_pos, word in zip(xval, yval, words):
    embedding_ax.text(x_pos, y_pos, word, fontsize=12)

embedding_ax.plot(xval, yval, 'go')
plt.show()

# Measuring similarity between word vectors
# Both words must belong to the fitted vocabulary: an unknown word (such as
# the previous "football") encodes to an all-zero input, so its "vector"
# carries no information about the word (with Keras' default bias it is just
# the hidden layer's bias) and the similarity is meaningless.
w2c = ["movie", "song"]
missing_words = [word for word in w2c if word not in vectorizer.vocabulary_]
if missing_words:
    raise ValueError("words not in the vocabulary: %s" % missing_words)

word_vec_a = intermediate_layer_model.predict(vectorizer.transform([w2c[0]]).toarray())
word_vec_b = intermediate_layer_model.predict(vectorizer.transform([w2c[1]]).toarray())

#word_vec_a
#word_vec_b

# Cosine similarity: dot product divided by the product of the vector norms.
# Stored and printed, since a bare expression in the middle of a cell is
# never displayed.
cosine_similarity = np.sum(word_vec_a * word_vec_b) / (
    np.linalg.norm(word_vec_a) * np.linalg.norm(word_vec_b)
)
print("cosine similarity:", cosine_similarity)

# Euclidean distance between the two vectors: the square root of the summed
# squared differences (the previous expression omitted the square root and
# therefore produced the squared distance).
euclidean_distance = np.linalg.norm(word_vec_a - word_vec_b)
print("euclidean distance:", euclidean_distance)





