In [1]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# define documents
docs = ['Well done!',
        'Good work'
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

# integer encode the documents
vocab_size = 10
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=None, padding='post')
print(padded_docs)

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 2, input_length=max_length))
# model.add(Flatten())
# model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summarize the model
print(model.summary())

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[[1, 7], [4, 8, 7], [3, 8], [6], [8], [7, 7], [6, 4], [7, 8], [4, 4, 7, 5]]
[[1 7 0 0]
 [4 8 7 0]
 [3 8 0 0]
 [6 0 0 0]
 [8 0 0 0]
 [7 7 0 0]
 [6 4 0 0]
 [7 8 0 0]
 [4 4 7 5]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 2)              20        
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________
None


In [2]:
import numpy as np

# model = Sequential()
# model.add(Embedding(1000, 64, input_length=10))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

# input_array = np.random.randint(1000, size=(32, 10))

# model.compile('rmsprop', 'mse')
output_array = model.predict(padded_docs)
# assert output_array.shape == (32, 10, 64)

In [3]:
model.get_weights()

[array([[-0.02619196,  0.03194037],
        [-0.00765017,  0.02035532],
        [-0.03479217, -0.01723608],
        [ 0.00864689,  0.02570535],
        [ 0.03251817, -0.01616557],
        [-0.04700295, -0.02093154],
        [-0.04162155,  0.0295226 ],
        [-0.0463048 , -0.01977211],
        [-0.03724138, -0.03286988],
        [-0.00580846, -0.02814339]], dtype=float32)]

In [4]:
# cos sim

model.get_config()

[{'class_name': 'Embedding',
  'config': {'activity_regularizer': None,
   'batch_input_shape': (None, 4),
   'dtype': 'float32',
   'embeddings_constraint': None,
   'embeddings_initializer': {'class_name': 'RandomUniform',
    'config': {'maxval': 0.05, 'minval': -0.05, 'seed': None}},
   'embeddings_regularizer': None,
   'input_dim': 10,
   'input_length': 4,
   'mask_zero': False,
   'name': 'embedding_1',
   'output_dim': 2,
   'trainable': True}}]

In [8]:
model = Sequential()
model.add(Embedding(25000, 100, input_length=100))
model.add(Flatten()) 
model.add(Dense(1))

In [9]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          2500000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 10001     
Total params: 2,510,001
Trainable params: 2,510,001
Non-trainable params: 0
_________________________________________________________________


### Train embedding on IMDB

In [1]:
import pickle

def save_pickle(path, X):
    with open(path, 'wb') as f:
        pickle.dump(X, f)

def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_train = open_pickle("../../data/imdb/imdb_original_preprocessed_xtrain.pickle")
X_test = open_pickle("../../data/imdb/imdb_original_preprocessed_xtest.pickle")
y_train = open_pickle("../../data/imdb/imdb_original_preprocessed_ytrain.pickle")
y_test = open_pickle("../../data/imdb/imdb_original_preprocessed_ytest.pickle")

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=False, min_df=100)
cv.fit(X_train)
len(cv.get_feature_names())

3641

In [8]:
cv.vocabulary_

{'silent': 2903,
 'night': 2176,
 'deadly': 792,
 'is': 1705,
 'the': 3238,
 'very': 3461,
 'last': 1818,
 'of': 2233,
 'series': 2837,
 'and': 166,
 'like': 1875,
 'part': 2320,
 'it': 1708,
 'to': 3298,
 'first': 1238,
 'three': 3271,
 'except': 1095,
 'by': 444,
 'title': 3296,
 'fact': 1148,
 'that': 3236,
 'christmas': 555,
 'horror': 1555,
 'flick': 1251,
 'there': 3248,
 'some': 2969,
 'obvious': 2223,
 'thing': 3255,
 'going': 1381,
 'on': 2253,
 'here': 1499,
 'plays': 2399,
 'named': 2135,
 'joe': 1739,
 'his': 1521,
 'creepy': 733,
 'son': 2978,
 'name': 2134,
 'ring': 2708,
 'bell': 338,
 'anyone': 191,
 'now': 2207,
 'little': 1894,
 'boy': 410,
 'heard': 1478,
 'knock': 1791,
 'at': 239,
 'door': 920,
 'one': 2255,
 'evening': 1076,
 'opened': 2259,
 'find': 1228,
 'present': 2458,
 'for': 1274,
 'him': 1515,
 'even': 1075,
 'though': 3265,
 'said': 2754,
 'do': 909,
 'not': 2195,
 'open': 2258,
 'till': 3288,
 'he': 1473,
 'begins': 327,
 'anyway': 193,
 'but': 439,
 'st

In [12]:
import numpy as np

vocab_size = len(cv.get_feature_names())
print('Generate token sequence...')
token = r"(?u)\b[\w\'/]+\b"
X_tr_token = generate_token_sequence(X_train, cv.vocabulary_, token)

Generate token sequence...


In [13]:
def generate_token_sequence(X_corpus, word_dict, token):
    import re

    token_pattern = re.compile(token)
    X = []
    i=0
    for sentence in X_corpus:
        split = token_pattern.findall(sentence)
        seq = []
        for word in split:
            try:
                seq.append(word_dict[word])
            except KeyError:
                continue
        X.append(seq)

    return np.asarray(X) 

In [14]:
X_tr_token.shape

(25000,)

In [22]:
from keras.preprocessing.sequence import pad_sequences
max_len=10
x_train = pad_sequences(X_tr_token, maxlen=max_len, padding='post', truncating='post', value=0)

In [23]:
x_train.shape

(25000, 10)

In [24]:
x_train[0]

array([2903, 2176,  792, 2176, 1705, 3238, 3461, 1818, 2233, 3238])

In [109]:
voc_list = []
voc_list.append(cv.vocabulary_.get('good'))
voc_list.append(cv.vocabulary_.get('nice'))
voc_list.append(cv.vocabulary_.get('awesome'))
voc_list.append(cv.vocabulary_.get('terrible'))
voc_list.append(cv.vocabulary_.get('bad'))
voc_list.append(cv.vocabulary_.get('horrible'))

In [110]:
voc_list

[1386, 2173, 270, 3223, 278, 1551]

In [219]:
whole_weights = []

In [241]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from sklearn.metrics.pairwise import cosine_similarity

model = Sequential()
model.add(Embedding(vocab_size, 5, input_length=max_len))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
output_array = model.predict(x_train)

weights = np.asarray(model.get_weights())[0]

cos_sim = cosine_similarity(weights)

neighbors = []
for vocab in voc_list:
    vec = np.argsort(cos_sim[vocab,:])
    neighbors.append(vec[:100])
    
neighbors = np.asarray(neighbors)

In [242]:
whole_weights.append(neighbors)

In [243]:
len(whole_weights)

8

In [212]:
whole_weights = np.asarray(whole_weights)

In [213]:
whole_weights.shape

(10, 6, 50)

In [216]:
from functools import reduce

for word in range(whole_weights.shape[1]):
    word_set = []
    for embedding in range(whole_weights.shape[0]):
        word_set.append(whole_weights[embedding, word, :])
    word_set = np.asarray(word_set)
    
    intersect = reduce(np.intersect1d, (word_set[0], word_set[1]))
    
    for embedding in range(whole_weights.shape[0]):
        
        embedding = embedding + 2
        if embedding == 10:
            break
        intersect = reduce(np.intersect1d, (word_set[embedding], word_set[embedding]))
    print(intersect)

[]
[]
[]
[]
[]
[]


In [217]:
word_set.shape

(10, 50)

In [245]:
reduce(np.intersect1d, (word_set[2], word_set[3]))

array([], dtype=int64)

In [218]:
word_set

array([[1479, 3441, 1032, 1865,  128, 1544, 1900,  737, 1582, 3028, 2136,
        3158,  523,  754, 1895, 3333, 1833, 1192, 1496, 1640, 1260,  350,
        3603,  484, 1034, 2174, 2717, 1295,  679,  145, 3135, 1409, 2199,
        2245,  304,  301, 1046, 1608, 3245,  152, 2956, 1896, 1821, 2938,
        2230, 2670, 2640,  813, 1892, 3224],
       [1663,  130, 2762, 3253, 1350, 1334, 2743, 2226, 3279, 2914, 2132,
        3273,  275,  357, 2497, 3405, 2992, 1915,  544, 1579, 2711, 3240,
        1102, 2674, 2654, 2129,  428,  509, 2510,  938, 3060, 2673, 1931,
        2184, 2189,  582, 2333,  810,  282,  426,  935, 1302, 2618,   82,
        2615, 3364, 1537, 3369, 2469, 1412],
       [1479, 1716, 3056,  355, 2115,  561,  108,  687, 1364,  708, 3485,
         808,  589,  545,  863, 1162, 2951, 3317,  512, 1658, 1305, 1648,
        3563,  502, 3406,  406, 2085,  232, 2138, 1009, 2880, 1574, 2316,
        1529, 1039,  617, 1992, 2157,   73, 1894, 2352, 1052, 2489,  376,
        1899, 1112,  5