# Training own embeddings using skip-gram algorithm

## Own embeddings can be created using CBOW as well as Skip-gram
## CBOW is generally used when we have to predict a word in a particular place. 

## Embeddings created using skip-gram are generally used to predict the context word from target word 

### Here I'll be using skip-gram to train my own embeddings.

In [1]:
import pandas as pd
import keras
from keras.preprocessing import text
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv("train.tsv" , delimiter = '\t')

In [3]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [5]:
target = df['Phrase']

### Cleanig the data

In [6]:
from nltk.stem import WordNetLemmatizer
import re

corpus = []
c = df['SentenceId'].unique()
n = 0
for i in df['SentenceId'] : 
    review = re.sub('[^a-zA-Z]',' ',df['Phrase'][n]) 
    review = review.lower()
    review = review.split()
    lemmatizer = WordNetLemmatizer()
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)
    n = n+1
print(len(corpus))
target = corpus

156060


In [7]:
tokenizer = text.Tokenizer()

In [8]:
c = tokenizer.fit_on_texts(target)

In [9]:
word2id = tokenizer.word_index

In [10]:
d = list(word2id)

In [11]:
# print(word2id)

In [12]:
id2word = {v:k for k, v in word2id.items()}

In [13]:
type(id2word)

dict

In [14]:
# print(id2word)

In [15]:
vocab_size = len(word2id) + 1 
embed_size = 100

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in target]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 13747
Vocabulary Sample: [('the', 1), ('a', 2), ('of', 3), ('and', 4), ('to', 5), ('it', 6), ('s', 7), ('in', 8), ('is', 9), ('that', 10)]


In [16]:
from keras.preprocessing.sequence import skipgrams

In [17]:
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]

In [18]:
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0], 
          id2word[pairs[i][1]], pairs[i][1], 
          labels[i]))

(some (65), adept (12716)) -> 0
(but (17), memorial (7308)) -> 0
(of (3), drowsy (6377)) -> 0
(is (9), many (109)) -> 0
(for (14), for (14)) -> 1
(a (2), good (47)) -> 1
(the (1), shakespeare (2149)) -> 0
(goose (2849), wollter (3763)) -> 0
(escapade (6418), the (1)) -> 1
(goose (2849), gibberish (11500)) -> 0


# Build skip gram model archi

In [19]:
import keras.backend as K

In [20]:
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential,Model
from keras.layers import Input, Dense,Dot

word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size, )))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                  embeddings_initializer="glorot_uniform",
                  input_length=1))
context_model.add(Reshape((embed_size,)))



fin = [word_model.output, context_model.output]
# model = Sequential()
# model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))(Dot(-1)(fin))
# model.compile(loss="mean_squared_error", optimizer="rmsprop")

dot = Dot(-1)(fin)
out = Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid")(dot)

model_in = [word_model.input, context_model.input]
model = Model(inputs=model_in, outputs=out)

In [21]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_1_input (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_2_input (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       1374700     embedding_1_input[0][0]          
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 100)       1374700     embedding_2_input[0][0]          
__________________________________________________________________________________________________
reshape_1 

In [22]:
model.compile(loss="mean_squared_error", optimizer="rmsprop")

In [26]:
for i, elem in enumerate(skip_grams):
    if (len(elem[0]) == 0):
        break
    first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32').reshape(-1,1)
    second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32').reshape(-1,1)
    labels = np.array(elem[1], dtype='int32')
#     X = np.array([first_elem, second_elem])
    X = [first_elem, second_elem]
    Y = labels
    model.fit(X,Y)

Epoch 1/1
Epoch 1/1
Epoch 1/1


In [61]:
word_embed_layer = word_model.layers[0]
weights = word_embed_layer.get_weights()[0][1:]

print(weights.shape)
# word_model = merge_layer.layers[0]
# word_embed_layer = word_model.layers[0]
# weights = word_embed_layer.get_weights()[0][1:]

(13746, 100)


In [45]:
model.layers[-2]

<keras.layers.merge.Dot at 0x17827a90a90>