In [56]:
import tensorflow as tf
import numpy as np
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt

import tensorflow_hub as hub

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

from tensorflow.keras.layers import Dense
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Dot

from tensorflow.keras import layers

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence

In [57]:
# Load the pre-cleaned train/test sets (judging by the file names: stemmed text
# with a 5000-term bag of words). Both files are read as ISO-8859-1 (Latin-1).
train_limpio = pd.read_csv(
    "train_limpio_con_BOW_de_5000_y_Stemming_noDrops.csv",
    encoding="ISO-8859-1",
)
test_limpio = pd.read_csv(
    "test_limpio_con_BOW_de_5000_y_Stemming_noDrops.csv",
    encoding="ISO-8859-1",
)

In [58]:
# Peek at the first row of the text column to check how the cleaned text looks.
train_limpio["text"].head(n=1)

0    our deed are the reason of this earthquak may ...
Name: text, dtype: object

In [59]:
def addWordsToList(wordsList, string):
    """Append every space-separated token of `string` to `wordsList` in place.

    The text is split on single spaces, so consecutive spaces (or an empty
    string) yield empty-string tokens, which are appended as well.

    Args:
        wordsList: List that receives the tokens (mutated in place).
        string: Text to tokenize.

    Returns:
        None.
    """
    wordsList.extend(string.split(" "))

# Flatten every non-null training text into one long token stream, in row order.
words = []
for tweet_text in train_limpio["text"].dropna():
    addWordsToList(words, tweet_text)
words[:7]

['our', 'deed', 'are', 'the', 'reason', 'of', 'this']

In [60]:
# Cumulative cut points, expressed as fractions of the shuffled frame:
#   rows [0%, 80%)   -> train
#   rows [80%, 90%)  -> validation
#   rows [90%, 100%] -> test
# NOTE: despite their names, these constants are split boundaries, not the
# sizes of the validation/test sets.
VALIDATION_SIZE = .8
TEST_SIZE = .9
SPLIT_SEED = 42  # fixed seed so the shuffle (and therefore the split) is reproducible
df = train_limpio

# Shuffle once, then slice positionally. Slicing with iloc keeps each part a
# DataFrame and avoids np.split, which goes through the deprecated
# DataFrame.swapaxes path on recent pandas versions.
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(VALIDATION_SIZE * len(df))
val_end = int(TEST_SIZE * len(df))
raw_train_df = shuffled_df.iloc[:train_end]
raw_val_df = shuffled_df.iloc[train_end:val_end]
raw_test_df = shuffled_df.iloc[val_end:]

In [61]:
# Separate each split into its feature frame (only the "text" column) and its
# label frame (only the "target" column). Double brackets keep them DataFrames.
raw_train_df_no_target, raw_val_df_no_target, raw_test_df_no_target = (
    split_df[["text"]] for split_df in (raw_train_df, raw_val_df, raw_test_df)
)

raw_train_df_target, raw_val_df_target, raw_test_df_target = (
    split_df[["target"]] for split_df in (raw_train_df, raw_val_df, raw_test_df)
)

In [62]:
# build_dataset (below) represents each word with a unique integer and returns
# the encoded word stream, a list of the most common words with their counts,
# and the word <-> integer lookup dictionaries.
#
# Steps:
#   1. Extract the most common words (up to `vocab_size`) to include in the
#      embedding.
#   2. Index every kept word with a unique integer, stored in a dictionary;
#      this is what an equivalent one-hot style input for the word requires.
#   3. Loop through every word in the dataset and replace it with its integer
#      id from step 2, which allows easy lookup and processing of the word
#      data stream.
vocab_size = 10_000


def build_dataset(words, n_words):
    """Encode a token stream as integer ids over a frequency-ranked vocabulary.

    The vocabulary consists of the reserved ``'UNK'`` entry (id 0) followed by
    the ``n_words - 1`` most frequent tokens, so id 1 is the most common token,
    id 2 the next one, and so on. Tokens outside the vocabulary are encoded
    as 0.

    A token literally spelled ``'UNK'`` is treated as unknown. Previously it
    overwrote the reserved entry, which made two different words share one id
    and left id 0 without a word in the reverse dictionary.

    Args:
        words: Sequence of tokens. It is traversed twice, so it must not be a
            one-shot iterator.
        n_words: Maximum vocabulary size, including the ``'UNK'`` entry.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)`` where
        ``data`` is the list of ids (one per input token), ``count`` is
        ``[['UNK', n_unknown], (word, frequency), ...]`` ordered by id,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word.
    """
    frequencies = Counter(words)
    # Keep id 0 exclusive to the unknown bucket: a literal 'UNK' token must not
    # compete for a regular vocabulary slot.
    frequencies.pop('UNK', None)

    count = [['UNK', -1]]
    count.extend(frequencies.most_common(n_words - 1))

    # Entries in `count` are unique, so a word's position is its id.
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    data = [dictionary.get(word, 0) for word in words]
    # Every token encoded as 0 is, by definition, an unknown token.
    count[0][1] = data.count(0)

    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

# Encode the whole corpus; ids follow frequency rank, with 0 reserved for 'UNK'.
data, count, dictionary, reverse_dictionary = build_dataset(words, n_words=vocab_size)

In [63]:
window_size = 3      # context window passed to skipgrams
vector_dim = 300     # embedding dimensionality
epochs = 1_000_000   # number of training-loop iterations

# Random set of words to evaluate similarity on.
valid_size = 16
# Only pick dev samples in the head of the distribution (the most common words).
valid_window = 100
valid_examples = np.random.choice(valid_window, size=valid_size, replace=False)

In [64]:
# Rank-based sampling table used by skipgrams; it assumes ids are ordered by
# frequency rank, which is how build_dataset assigns them.
sampling_table = sequence.make_sampling_table(vocab_size)

# couples: [target_id, context_id] pairs; labels: one 0/1 flag per pair
# (per the Keras docs, 1 = true context pair, 0 = random negative sample).
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
    sampling_table=sampling_table,
)

# Split the pairs into two parallel int32 arrays: targets and contexts.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

print(couples[:10], labels[:10])

[[6489, 2615], [1778, 1614], [184, 20], [7939, 4448], [9643, 5063], [7682, 2730], [5303, 167], [5789, 5566], [5075, 6244], [1051, 7519]] [0, 1, 1, 0, 0, 0, 1, 0, 0, 0]


In [65]:
#len(list(raw_train_df_no_target.to_numpy(dtype="float32"))[0])

In [66]:
# Skip-gram model: two single-id inputs share one embedding table; training
# predicts whether a (target, context) pair is a true pair or a negative sample.
input_target = keras.Input(shape=(1,))
input_context = keras.Input(shape=(1,))

# A single embedding layer is applied to both inputs, so both words live in the
# same vector space.
embedding = Embedding(input_dim=vocab_size, output_dim=vector_dim, input_length=1, name='embedding')

# Look up each word and turn its vector into a (vector_dim, 1) column.
target = Reshape(target_shape=(vector_dim, 1))(embedding(input_target))
context = Reshape(target_shape=(vector_dim, 1))(embedding(input_context))

# Normalized dot product over the embedding axis; used only by validation_model.
similarity = Dot(axes=1, normalize=True)([target, context])

# Raw dot product, flattened to one value per sample, feeds the classifier.
dot_product = Dot(axes=1)([target, context])
dot_product = Reshape(target_shape=(1,))(dot_product)
# add the sigmoid output layer
output = Dense(units=1, activation='sigmoid')(dot_product)

# Trainable model: outputs the sigmoid score for a (target, context) pair.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

# Shares the same embedding weights as `model`; outputs the normalized similarity.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

In [67]:
class SimilarityCallback:
    """Prints the nearest neighbours of the validation words.

    Relies on the notebook-level names ``valid_size``, ``valid_examples``,
    ``reverse_dictionary``, ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self):
        """Print, for each validation word, its most similar vocabulary words."""
        top_k = 8  # number of nearest neighbors
        for i in range(valid_size):
            valid_idx = valid_examples[i]
            valid_word = reverse_dictionary[valid_idx]
            sim = self._get_sim(valid_idx)
            # Sort by descending similarity. Position 0 is skipped because the
            # best match is expected to be the word itself.
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            # Iterating over `nearest` (rather than range(top_k)) also copes
            # with vocabularies that have fewer than top_k + 1 entries.
            for neighbour_idx in nearest:
                close_word = reverse_dictionary[neighbour_idx]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of one word against every vocabulary id.

        Args:
            valid_word_idx: Integer id of the word to compare.

        Returns:
            A float64 array of shape ``(vocab_size,)`` whose i-th entry is the
            ``validation_model`` output for the pair ``(valid_word_idx, i)``.
        """
        # Score the whole vocabulary in a single batch instead of issuing
        # `vocab_size` separate one-sample predict calls.
        targets = np.full((vocab_size,), valid_word_idx, dtype="int32")
        contexts = np.arange(vocab_size, dtype="int32")
        out = validation_model.predict_on_batch([targets, contexts])
        # The model output carries singleton axes; flatten to one score per id.
        return np.asarray(out, dtype=np.float64).reshape(vocab_size)
sim_cb = SimilarityCallback()

In [None]:
# Single-element buffers reused on every step: each step trains on a batch of
# one (target, context, label) triple.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
n_pairs = len(labels)
for cnt in range(epochs):
    # np.random.randint excludes its upper bound, so it must be n_pairs (not
    # n_pairs - 1); otherwise the last training pair can never be drawn.
    idx = np.random.randint(0, n_pairs)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    if cnt % 10000 == 0:
        # Periodic qualitative check: print nearest neighbours of the validation words.
        sim_cb.run_sim()

Iteration 0, loss=0.6939670443534851
Nearest to go: bokoharam, broken, wbre, elbestia, orchestr, Ã£Â¥, crane, mariomaraczi,
Nearest to one: milkshak, obsolet, kit, yolk, idi, bowknot, farrakhan, zenandemcfen,
Nearest to famili: window, pan, usnwsgov, vita, dampen, mackayim, boatnew, hereÃ¢ÂÃ£ÂÃ¢,
Nearest to as: steep, eg, underground, theblaz, friggin, strain, marginoferror, Ã£Â¥,
Nearest to how: club, Ã£ÂÃ£Â, whirlwind, eri, deck, probe, raung, tree,
Nearest to after: bradley, christycroley, papcrdol, haiyan, director, doctorfluxx, Ã¢ÂÃ£ÂÃ£Â·hoax, worldwid,
Nearest to over: celticinde, drain, present, cnv, xmen, spec, drink, welfar,
Nearest to or: inbetween, thatÃ¢ÂÃ£ÂÃ¢, bethlehem, realest, diss, offend, lez, preview,
Nearest to than: warcraft, recoveri, chart, postbattl, forecast, imagin, hop, nonexist,
Nearest to via: shira, drothvad, iphooey, procedur, cam, season, vagersedolla, runner,
Nearest to at: thereof, vincent, eastward, mpc, fiona, matthew, constel, awesomelov,
Ne

In [None]:
#model1.evaluate(raw_test_df_no_target.to_numpy(dtype="uint"), raw_test_df_target.to_numpy(dtype="uint"))