In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
import pickle
import numpy as np
from matplotlib import pyplot as plt
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Convert the raw GloVe text file to word2vec text format (written to
# 'vectors.txt'), then load that file as gensim KeyedVectors.
glove2word2vec(glove_input_file='glove.6B.300d.txt', word2vec_output_file='vectors.txt')
model = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

In [None]:
# Words whose pre-trained vectors are projected to 2-D for a visual check.
words = ['king', 'queen', 'man', 'woman', 'actor', 'actress']
print('Vector king: {}'.format(model['king']))

# Stack the selected vectors into one matrix; the width is read from the
# loaded model instead of being hard-coded to 300.
vec = np.empty((len(words), model.vector_size))
for i, w in enumerate(words):
    vec[i, :] = model[w]

from sklearn.manifold import TSNE

# t-SNE requires perplexity < n_samples; the default (30) is not valid for
# only six points, so cap it just below the number of words.
x = TSNE(perplexity=min(30, len(words) - 1)).fit_transform(vec)
fig, ax = plt.subplots()
ax.scatter(x[:, 0], x[:, 1])
# Label every projected point with its word.
for i, w in enumerate(words):
    ax.annotate(w, (x[i, 0], x[i, 1]))

plt.show()

## Entrenando Word2Vec

In [None]:
from keras.preprocessing.sequence import skipgrams
# Toy sequence of word ids 1..6 used to show what skipgrams() produces.
sentence = list(range(1, 7))
print(sentence)
# 100 is passed as the vocabulary size; shuffle=False keeps the pairs in the
# order they were generated so the output is easy to read.
x, y = skipgrams(sentence, 100, window_size=2, negative_samples=1.0, shuffle=False)
print('Skipgrams: ')
print(x)
print(y)

In [None]:
!pip install bs4
!pip install tqdm
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import re 

def preprocessor(text):
    """Strip markup and punctuation from ``text`` while keeping emoticons.

    The text is parsed with BeautifulSoup and reduced to its plain text.
    Emoticons such as ``:)``, ``:-P`` or ``=D`` are extracted, the remaining
    text is lower-cased and every run of non-word characters is collapsed to
    a single space. The extracted emoticons are appended at the end,
    separated by spaces, with their ``-`` "nose" removed.

    :param text: raw document, possibly containing HTML.
    :return: the cleaned string (always ends with a space followed by the
        emoticons, if any).
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Raw string literals: '\)' and '\W' are invalid escape sequences in a
    # normal string and trigger warnings on current Python versions.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # lowercase, squash non-word characters, then append the emoticons
    # (replace('-', '') removes the nose of the emoticons)
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text


import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download the NLTK stop-word lists and keep the English one
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Split ``text`` on whitespace, drop stop words and stem what is left.

    Only tokens that start with an ASCII letter and are not in the
    module-level ``stop`` list are kept; each kept token is stemmed with a
    Porter stemmer.

    :param text: pre-processed document string.
    :return: list of stemmed tokens.
    """
    porter = PorterStemmer()
    # Set membership instead of scanning the stop-word list for every token.
    stop_words = set(stop)
    # Raw strings avoid the invalid '\s' escape of the plain literal.
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop_words and re.match(r'[a-zA-Z]+', w)]


def tokenizer_simple(text):
    """Split ``text`` on whitespace and keep tokens starting with a letter.

    :param text: document string.
    :return: list of tokens whose first character is an ASCII letter; no
        stemming or stop-word removal is applied.
    """
    # Raw strings avoid the invalid '\s' escape of the plain literal.
    return [w for w in re.split(r'\s+', text.strip())
            if re.match(r'[a-zA-Z]+', w)]


from sklearn.datasets import fetch_20newsgroups

# Subset of the 20 Newsgroups categories used as the training corpus.
categories = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
]

# Parts of each post that are stripped before the text is returned.
remove = ('headers', 'footers', 'quotes')

# Load train and test posts together ('all'), shuffled with a fixed seed.
newsgroups = fetch_20newsgroups(subset='all', categories=categories,
                                     shuffle=True, random_state=0,
                                     remove=remove)

from collections import Counter, deque

def process_corpus(data, words_id=None, min_reps=5):
    """Tokenise ``data`` and encode each document as a list of word ids.

    Every document goes through ``preprocessor`` and
    ``tokenizer_stem_nostop``. When no vocabulary is supplied, one is built
    from the tokens that occur at least ``min_reps`` times in the whole
    corpus; ids are assigned from 0 in order of first appearance.

    :param data: iterable of raw document strings.
    :param words_id: optional existing ``token -> id`` mapping to reuse.
    :param min_reps: minimum corpus frequency for a token to get an id
        (only used when ``words_id`` is None).
    :return: ``(corpus_id, words_id, id_words)`` - the encoded documents
        (tokens without an id are dropped), the ``token -> id`` mapping and
        its inverse.
    """
    corpus = [tokenizer_stem_nostop(preprocessor(doc)) for doc in tqdm(data)]

    if words_id is None:
        # Corpus-wide token frequencies
        counts = Counter(token for doc in corpus for token in doc)
        # Keep only sufficiently frequent tokens, numbering them consecutively
        frequent = (token for token, n in counts.items() if n >= min_reps)
        words_id = {token: idx for idx, token in enumerate(frequent)}

    id_words = dict((idx, token) for token, idx in words_id.items())

    corpus_id = []
    for doc in corpus:
        corpus_id.append([words_id[token] for token in doc if token in words_id])
    return corpus_id, words_id, id_words

In [None]:
# Accumulators for the (target, context) pairs and their 0/1 labels.
x = deque()
y = deque()

corpus_id, words_id, id_words = process_corpus(newsgroups.data)

# Build the skip-gram training pairs.
# keras' skipgrams() treats index 0 as a non-word: it is skipped in the input
# sequence and never drawn as a negative sample. Our ids start at 0, so they
# are shifted by one for the call (with a vocabulary one larger) and the
# generated pairs are shifted back below.
from keras.preprocessing.sequence import skipgrams
for s in tqdm(corpus_id):
    x1, y1 = skipgrams([w + 1 for w in s], len(id_words) + 1, window_size=3, negative_samples=5)
    x.extend(x1)
    y.extend(y1)


# Split the pairs into two parallel id arrays and undo the +1 shift so the
# ids index the embedding tables of size len(id_words) again.
x1, x2 = zip(*x)
import numpy as np
x1 = np.asarray(x1) - 1
x2 = np.asarray(x2) - 1
y = np.asarray(y)

print('Vocabulario: {}'.format(len(id_words)))
print('Skipgrams: {}'.format(x1.shape[0]))

In [None]:
from tensorflow.keras.layers import dot, Embedding, Input, Activation, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K

# Loss function based on the (negative) log-likelihood
def minus_max_likelihood(y_true, y_pred):
    """Element-wise negative log-likelihood of binary labels given raw scores.

    For a label ``y`` and a score ``z`` this is
    ``y * log(1 + exp(-z)) + (1 - y) * log(1 + exp(z))``.

    :param y_true: tensor of 0/1 labels.
    :param y_pred: tensor of raw (pre-sigmoid) scores.
    :return: tensor of per-element losses (no reduction is applied here).
    """
    # log(1 + exp(z)) is softplus(z); the backend implementation does not
    # overflow to inf for large |z| the way the explicit exp/log form does.
    return y_true * K.softplus(-y_pred) + (1 - y_true) * K.softplus(y_pred)

# Two separate embedding tables (context word / target word), 64 dimensions each
context_emb = Embedding(len(id_words), 64, name='Emb_context')
target_emb = Embedding(len(id_words), 64, name='Emb_target')

context = Input((1,), name='context')
emb = context_emb(context)
target = Input((1,), name='target')
embT = target_emb(target)
# Dot product of the two embeddings; it is kept as a raw score because the
# custom loss is applied to it directly (the sigmoid below stays disabled).
lam = dot([emb, embT], axes=(-1))
lam = Flatten()(lam) 
#lam = Activation('sigmoid')(lam)

model = Model(inputs=[context, target], outputs=lam)
model.compile('adam', minus_max_likelihood)
#model.compile('adam', 'binary_crossentropy')
model.summary()

# Train only briefly (a single epoch)
model.fit([x1, x2], y.astype(np.float32), epochs=1, batch_size=1000)
# Extract the learned embeddings (target table)
vectors = K.get_value(target_emb.embeddings)
# Drop the layer references to recover memory
del context_emb
del target_emb

In [None]:
from tensorflow.keras.utils import plot_model

# Render the model graph to 'model.png', release the model, then show the image.
plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')
del model
from IPython.display import Image
Image(retina=True, filename='model.png')


In [None]:
# Peek at the first ten training triples: the two word-id arrays and the labels.
print(x1[:10])
print(x2[:10])
print(y[:10])

In [None]:
def cos(v1, v2):
    """Cosine similarity between ``v1`` and the vectors stored in ``v2``.

    The norm of ``v2`` is taken along its last axis and the norm of ``v1``
    through its dot product with itself.
    Presumably ``v1`` is a single 1-D vector and ``v2`` a matrix with one
    vector per row - confirm against the callers.

    :return: ``v1 . v2^T`` divided by the product of the norms.
    """
    inner = np.dot(v1, v2.T)
    v1_norm = np.dot(v1, v1.T) ** 0.5
    v2_norm = np.sum(v2 * v2, axis=-1) ** 0.5
    return inner / (v1_norm * v2_norm)


def nearest(voc, wv, top=11):
    """Return the entries of ``voc`` most similar to ``wv``.

    Similarity is computed with ``cos``; indices are ranked from most to
    least similar and the first ``top`` are returned.

    :param voc: collection of vectors to search.
    :param wv: query vector.
    :param top: number of results to return.
    :return: ``(indices, similarities)`` as two parallel lists.
    """
    dist = cos(wv, voc)
    ranked = sorted(range(len(dist)), key=dist.__getitem__, reverse=True)
    best = ranked[:top]
    return best, [dist[idx] for idx in best]

# Print the nearest vocabulary entries (by cosine similarity) for a few probe
# words, each under its own header line.
for header, probe in (('Similares a car:', 'car'),
                      ('Similares a ford:', 'ford'),
                      ('Similares a law:', 'law')):
    print(header)
    neighbours, similarities = nearest(vectors, vectors[words_id[probe], :])
    for i, d in zip(neighbours, similarities):
        print('\t{} {}'.format(id_words[i], d))

## GloVe


Implementación de referencia de GloVe en Keras: https://github.com/erwtokritos/keras-glove

In [None]:
from collections import defaultdict


def bigram_count(token_list, window_size, cache):
    """Accumulate distance-weighted co-occurrence counts for one sequence.

    Each token is paired with the tokens that follow it, up to
    ``window_size`` positions ahead. The pair is stored with the smaller id
    as the outer key and contributes ``1 / distance`` to its entry.

    :param token_list: sequence of word ids.
    :param window_size: how many following positions to pair with.
    :param cache: nested mapping ``cache[low_id][high_id]`` updated in place.
    :return: None.
    """
    n_tokens = len(token_list)
    for pos, center in enumerate(token_list):
        # Last index that is both inside the window and inside the sequence
        last = min(pos + window_size, n_tokens - 1)
        for neighbour_pos in range(pos + 1, last + 1):
            neighbour = token_list[neighbour_pos]
            if center <= neighbour:
                low, high = center, neighbour
            else:
                low, high = neighbour, center
            cache[low][high] += 1.0 / (neighbour_pos - pos)


def build_cooccurrences(sequences, cache, window=3):
    """Run ``bigram_count`` over every sequence, filling ``cache`` in place.

    :param sequences: iterable of word-id sequences.
    :param cache: nested co-occurrence mapping updated by ``bigram_count``.
    :param window: window size forwarded to ``bigram_count``.
    """
    for token_ids in tqdm(sequences):
        bigram_count(token_ids, window, cache)


def process_coocurrence_matrix(sentences, window_size=3):
    """Turn word-id sequences into symmetric co-occurrence training triples.

    The co-occurrence counts are collected with ``build_cooccurrences`` and
    every stored ``(first_id, second_id, x_ij)`` entry is emitted twice: once
    as stored and once with the two ids swapped.

    :param sentences: iterable of word-id sequences.
    :param window_size: co-occurrence window forwarded to the counter.
    :return: three parallel arrays ``(first_ids, second_ids, x_ij values)``.
    """
    cache = defaultdict(lambda: defaultdict(int))
    build_cooccurrences(sentences, cache=cache, window=window_size)

    first, second, x_ijs = [], [], []
    for first_id, row in cache.items():
        for second_id, x_ij in row.items():
            # stored orientation followed by the mirrored one
            first.extend((first_id, second_id))
            second.extend((second_id, first_id))
            x_ijs.extend((x_ij, x_ij))

    return np.array(first), np.array(second), np.array(x_ijs)

In [None]:
# Build the GloVe training triples from the encoded corpus; this rebinds the
# x1, x2 and y names used earlier for the skip-gram data.
x1, x2, y = process_coocurrence_matrix(corpus_id)

In [None]:
# Inspect the first ten triples plus the largest and the mean co-occurrence value.
print(x1[:10])
print(x2[:10])
print(y[:10])
print(max(y))
print(np.mean(y))

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Add
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

def custom_loss(y_true, y_pred, a = 3.0/4.0, X_MAX=100):
    """
    GloVe's weighted least-squares loss.

    Each squared error between the prediction and ``log(y_true)`` is weighted
    by ``clip(y_true / X_MAX, 0, 1) ** a`` and the result is summed over the
    last axis.

    :param y_true: The actual values, in our case the 'observed' X_ij co-occurrence values
    :param y_pred: The predicted (log-)co-occurrences from the model
    :param a: Exponent of the weighting function
    :param X_MAX: Co-occurrence value at which the weight saturates at 1
    :return: The loss associated with this batch
    """
    weight = K.pow(K.clip(y_true / X_MAX, 0.0, 1.0), a)
    squared_error = K.square(y_pred - K.log(y_true))
    return K.sum(weight * squared_error, axis=-1)


def glove_model(vocab_size=10, vector_dim=64):
    """
    A Keras implementation of the GloVe architecture.

    The model takes a central word id and a context word id, looks up one
    embedding vector and one scalar bias for each, and predicts
    ``dot(central, context) + bias_central + bias_context``. It is compiled
    with ``custom_loss`` and the 'adam' optimizer.

    :param vocab_size: The number of distinct words
    :param vector_dim: The vector dimension of each word
    :return: The compiled Keras model
    """
    input_target = Input((1,), name='central_word_id')
    input_context = Input((1,), name='context_word_id')

    # All four tables share one size. Previously the context tables had one
    # row fewer than the central ones, so an id accepted as central word
    # could be out of range as context word.
    n_rows = vocab_size + 1

    central_embedding = Embedding(n_rows, vector_dim, input_length=1, name='central_emb')
    central_bias = Embedding(n_rows, 1, input_length=1, name='central_bias')

    context_embedding = Embedding(n_rows, vector_dim, input_length=1, name='context_emb')
    context_bias = Embedding(n_rows, 1, input_length=1, name='context_bias')

    vector_target = central_embedding(input_target)
    vector_context = context_embedding(input_context)

    bias_target = central_bias(input_target)
    bias_context = context_bias(input_context)

    # Flatten every term to shape (batch, 1) so they can be added together.
    dot_product = Dot(axes=-1)([vector_target, vector_context])
    dot_product = Reshape((1, ))(dot_product)
    bias_target = Reshape((1,))(bias_target)
    bias_context = Reshape((1,))(bias_context)

    prediction = Add()([dot_product, bias_target, bias_context])

    model = Model(inputs=[input_target, input_context], outputs=prediction)
    model.compile(loss=custom_loss, optimizer='adam')

    return model

In [None]:
# Build the GloVe model for the corpus vocabulary with 64-dimensional vectors.
model = glove_model(len(words_id), 64)
model.summary()

# Fit on the (word id, word id) -> co-occurrence value triples.
model.fit([x1, x2], y, epochs=5, batch_size=512)

In [None]:
from tensorflow.keras.utils import plot_model

# Render the model graph to 'model.png' and display it inline.
plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')
from IPython.display import Image
Image(retina=True, filename='model.png')

In [None]:
# Extract the learned embeddings.
# NOTE(review): layers[2] is presumably the 'central_emb' layer - verify
# against model.summary().
# Only the rows that correspond to real vocabulary ids are kept: the table is
# built with extra capacity, and an untrained extra row has no entry in
# id_words, so it would raise KeyError if it ranked among the neighbours.
vectors = K.get_value(model.layers[2].embeddings)[:len(id_words)]
# Freeing the model is left disabled
#del model

# Print the nearest vocabulary entries for a few probe words.
for header, probe in (('Similares a car:', 'car'),
                      ('Similares a ford:', 'ford'),
                      ('Similares a law:', 'law')):
    print(header)
    for i, d in zip(*nearest(vectors, vectors[words_id[probe], :])):
        print('\t{} {}'.format(id_words[i], d))

## ¿Cómo usar los embeddings?


In [None]:
# Pre-trained model: reload the converted GloVe vectors, rebinding `model`
model = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False) 

In [None]:
# Load the train and test splits of 20 Newsgroups as (texts, labels) pairs,
# then clean every document with preprocessor().
x_train, y_train = fetch_20newsgroups(return_X_y=True, subset='train')
x_test, y_test = fetch_20newsgroups(return_X_y=True, subset='test')
x_train = list(map(preprocessor, x_train))
x_test = list(map(preprocessor, x_test))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Fit the TF-IDF vocabulary on the training texts only and reuse it for the test texts.
tfidf = TfidfVectorizer()
xp_train = tfidf.fit_transform(x_train)
xp_test = tfidf.transform(x_test)

In [None]:
# Baseline: SVM with default settings on the TF-IDF features.
cls = SVC()
cls.fit(xp_train, y_train)

print(classification_report(y_test, cls.predict(xp_test)))

In [None]:
def avg_vector(x, model):
    """Represent each document by the mean of its known word vectors.

    Documents are tokenised with ``tokenizer_simple``; tokens missing from
    ``model`` are ignored.

    :param x: iterable of document strings.
    :param model: word -> vector lookup supporting ``in`` and ``[]``.
    :return: array with one row per document. A document without any known
        word gets an all-zero row (previously it produced a NaN scalar, which
        made the stacked result ragged).
    """
    res = []
    for s in tqdm(x):
        v = [model[w] for w in tokenizer_simple(s) if w in model]
        # None marks documents with no usable token; filled in below.
        res.append(np.mean(np.asarray(v), axis=0) if v else None)

    # Zero row matching the shape/dtype of a real document vector when one
    # exists; otherwise fall back to the model's declared vector size.
    template = next((r for r in res if r is not None), None)
    if template is not None:
        zero = np.zeros_like(template)
    else:
        zero = np.zeros(getattr(model, 'vector_size', 0))
    return np.asarray([zero if r is None else r for r in res])

# Document features: plain average of the pre-trained word vectors.
xv_train = avg_vector(x_train, model)
xv_test = avg_vector(x_test, model)

# Same default SVM as before, now on the averaged embeddings.
cls = SVC()
cls.fit(xv_train, y_train)

print(classification_report(y_test, cls.predict(xv_test)))

In [None]:
def tfidf_vector(x, x_tfidf, tfidf, model):
    """Represent each document by the TF-IDF-weighted mean of its word vectors.

    Each document is tokenised with the vectorizer's own analyzer. Every
    token present both in ``model`` and in the TF-IDF vocabulary contributes
    its vector weighted by the document's TF-IDF value for that token; the
    sum is divided by the total weight.

    :param x: sequence of document strings (``len(x)`` is used).
    :param x_tfidf: TF-IDF matrix whose row ``i`` belongs to ``x[i]``.
    :param tfidf: the fitted vectorizer that produced ``x_tfidf``.
    :param model: word -> vector lookup supporting ``in`` and ``[]``.
    :return: array with one row per document. A document with no usable
        token gets an all-zero row instead of a division by zero.
    """
    res = []
    tokenizer = tfidf.build_analyzer()
    # vocabulary_ is the fitted term -> column-index mapping; it replaces the
    # map rebuilt from get_feature_names(), which newer scikit-learn releases
    # no longer provide.
    tfidf_map = tfidf.vocabulary_
    for i, s in tqdm(enumerate(x), total=len(x)):
        v = []
        total = 0
        for w in tokenizer(s):
            if w in model and w in tfidf_map:
                weight = x_tfidf[i, tfidf_map[w]]
                total += weight
                v.append(weight * model[w])
        # None marks documents without any weight; filled in below.
        res.append(np.sum(np.asarray(v), axis=0) / total if total else None)

    # Zero row matching a real document vector when one exists; otherwise
    # fall back to the model's declared vector size.
    template = next((r for r in res if r is not None), None)
    if template is not None:
        zero = np.zeros_like(template)
    else:
        zero = np.zeros(getattr(model, 'vector_size', 0))
    return np.asarray([zero if r is None else r for r in res])

# Document features: TF-IDF-weighted average of the pre-trained word vectors.
xv_train = tfidf_vector(x_train, xp_train, tfidf, model)
xv_test = tfidf_vector(x_test, xp_test, tfidf, model)

# Same default SVM, now on the weighted embeddings.
cls = SVC()
cls.fit(xv_train, y_train)

print(classification_report(y_test, cls.predict(xv_test)))

## Recomendación

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

In [None]:
import pandas as pd
import numpy as np 

# Read the ratings table; the first three columns are used as user id,
# movie id and rating respectively.
df = pd.read_csv('ml-latest-small/ratings.csv')
users = df.values[:, 0].astype(np.int32)
movies = df.values[:, 1].astype(np.int32)
# Re-number the movie ids to consecutive integers starting at 1, in sorted order.
m_ids = list(set(movies))
m_ids.sort()
m_ids = {idx: i for i, idx in enumerate(m_ids, start=1)}
movies = np.asarray([m_ids[i] for i in movies], dtype=np.int32)
ratings = df.values[:, 2]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the (user, movie, rating) triples for testing, with a fixed seed.
u_train, u_test, m_train, m_test, r_train, r_test = train_test_split(users, movies, ratings, random_state=42, test_size=0.1)

In [None]:
# Min-max scale the ratings using the range seen in the training split; the
# same range is applied to the test split.
min_r = min(r_train)
max_r = max(r_train)

r_train = (r_train - min_r) / (max_r - min_r)
r_test = (r_test - min_r) / (max_r - min_r)

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# User branch: a 50-d embedding (with L2 regularisation) plus a scalar bias.
iu = Input((1,), name='user_i')
ue = Embedding(len(set(users))+1, 50, name='emb_user', embeddings_regularizer=l2(1e-6))(iu)
ub = Embedding(len(set(users))+1, 1, name='bias_user')(iu)

# Movie branch: same structure as the user branch.
im = Input((1,), name='movie_i')
me = Embedding(len(set(movies))+1,50, name='emb_movie', embeddings_regularizer=l2(1e-6))(im)
mb = Embedding(len(set(movies))+1, 1, name='bias_movie')(im)

# NOTE(review): this rebinds the name `dot`, which an earlier cell imports
# from tensorflow.keras.layers.
dot = Dot(axes=-1)([ue, me])

# Score = user/movie dot product + user bias + movie bias.
biases = Add()([dot, ub, mb])

# Squash the score with a sigmoid; the targets were scaled with min-max above.
out = Activation('sigmoid')(Flatten()(biases))

model = Model([iu, im], out)

model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['mae'])
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Render the model graph to 'model.png' and display it inline.
plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')
from IPython.display import Image
Image(retina=True, filename='model.png')

In [None]:
# Peek at the first ten training triples: user ids, movie ids, scaled ratings.
print(u_train[:10])
print(m_train[:10])
print(r_train[:10])

In [None]:
# Train for 10 epochs; every array gets a trailing axis of size 1, and the
# held-out split is used as validation data.
model.fit([np.expand_dims(u_train, axis=-1), np.expand_dims(m_train, axis=-1)], np.expand_dims(r_train, axis=-1),
          epochs=10, batch_size=512,
          validation_data=([np.expand_dims(u_test, axis=-1), np.expand_dims(m_test, axis=-1)], np.expand_dims(r_test, axis=-1)))

In [None]:
# Predict the held-out ratings.
r_pred = model.predict([np.expand_dims(u_test, axis=-1), np.expand_dims(m_test, axis=-1)])

# Show the first 100 true/predicted pairs, mapped back from the scaled
# values with the inverse of the min-max transform.
for t, p in zip(r_test[:100], r_pred[:100, 0]):
    print("Real: {} Predicho: {}".format(t * (max_r - min_r) + min_r, p* (max_r - min_r) + min_r))

In [None]:
from tensorflow.keras.layers import Dense, Concatenate, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# Second recommender: the user and movie embeddings are concatenated and fed
# to a small dense network instead of being combined with a dot product.
# Input and Embedding are not imported in this cell's import lines; they are
# expected to be in scope from an earlier cell.
iu = Input((1,), name='user_i')
ue = Embedding(len(set(users))+1, 50, name='emb_user')(iu)

im = Input((1,), name='movie_i')
me = Embedding(len(set(movies))+1,50, name='emb_movie')(im)

f = Concatenate(axis=-1)([Flatten()(ue), Flatten()(me)])
f = Dropout(0.5)(f)

# NOTE(review): Dense(50) is given no activation - confirm that a layer
# without a non-linearity is intended here.
d = Dense(50)(f)
d = Dropout(0.5)(d)
d = Dense(1, activation='sigmoid')(d)

model = Model([iu, im], d)

# Trained directly on mean absolute error of the scaled ratings.
model.compile(loss='mae', optimizer='nadam', metrics=['mae'])
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Render the model graph to 'model.png' and display it inline.
plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')
from IPython.display import Image
Image(retina=True, filename='model.png')

In [None]:
# Train for 10 epochs with batches of 128, validating on the held-out split.
model.fit([np.expand_dims(u_train, axis=-1), np.expand_dims(m_train, axis=-1)], np.expand_dims(r_train, axis=-1),
          epochs=10, batch_size=128,
          validation_data=([np.expand_dims(u_test, axis=-1), np.expand_dims(m_test, axis=-1)], np.expand_dims(r_test, axis=-1)))

In [None]:
# Predict the held-out ratings with the dense model.
r_pred = model.predict([np.expand_dims(u_test, axis=-1), np.expand_dims(m_test, axis=-1)])

# Show the first 100 true/predicted pairs, mapped back from the scaled
# values with the inverse of the min-max transform.
for t, p in zip(r_test[:100], r_pred[:100, 0]):
    print("Real: {} Predicho: {}".format(t * (max_r - min_r) + min_r, p* (max_r - min_r) + min_r))