In [7]:
import numpy as np
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
path = './glove.6B/'
EMBEDDING_FILE = f'{path}glove.6B.50d.txt'

In [5]:
embed_size = 50
max_words = 20000
maxlen = 100

In [3]:
!head -n 5 glove.6B.50d.txt

head: glove.6B.50d.txt: No such file or directory


In [8]:
# build embedding index
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [9]:
# get mean of embeddings
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [10]:
embeddings_index["o'clock"]

array([-0.48653  ,  0.1222   ,  0.25833  , -0.58008  ,  0.8847   ,
       -0.89723  , -0.022608 ,  0.55034  , -0.71687  , -0.88763  ,
       -0.76996  , -0.56762  , -0.0033389,  1.1139   ,  0.86669  ,
        0.063128 , -1.1099   ,  0.69595  , -1.162    , -0.0046839,
        1.0596   ,  0.52938  ,  1.7711   ,  0.85981  ,  0.53337  ,
        0.38279  , -0.16643  ,  0.66045  ,  0.6289   , -0.52657  ,
        1.3269   ,  0.80983  , -0.03682  ,  0.7566   ,  0.26205  ,
        0.1616   ,  1.1615   , -0.6152   ,  0.26574  ,  0.013689 ,
       -0.6277   , -0.70538  , -1.0315   , -0.19601  , -0.32116  ,
        0.20999  ,  0.77054  , -0.17916  ,  0.075172 ,  0.87148  ],
      dtype=float32)

In [11]:
# now we can build sentence embedding by just doing average of words embeddings

sentences = [
        'John is a good person and great husband',
        'ALice loves apples and oranges',
        'They hate bad breath, but dont do anything about it',
        'We are sitting at airport',
        'Boys are girls were playing cricker',
        'Football is the best sports in some parts and cricket in some part'
    ]

In [18]:
# get probability of each word
token_counts = {}
total_tokens = 0
for sentence in sentences:
  tokens = sentence.split()
  total_tokens += len(tokens)
  for token in tokens:
    token_count = token_counts.get(token, 0)
    token_counts[token] = token_count + 1

In [19]:
print(total_tokens)
token_counts

47


{'ALice': 1,
 'Boys': 1,
 'Football': 1,
 'John': 1,
 'They': 1,
 'We': 1,
 'a': 1,
 'about': 1,
 'airport': 1,
 'and': 3,
 'anything': 1,
 'apples': 1,
 'are': 2,
 'at': 1,
 'bad': 1,
 'best': 1,
 'breath,': 1,
 'but': 1,
 'cricker': 1,
 'cricket': 1,
 'do': 1,
 'dont': 1,
 'girls': 1,
 'good': 1,
 'great': 1,
 'hate': 1,
 'husband': 1,
 'in': 2,
 'is': 2,
 'it': 1,
 'loves': 1,
 'oranges': 1,
 'part': 1,
 'parts': 1,
 'person': 1,
 'playing': 1,
 'sitting': 1,
 'some': 2,
 'sports': 1,
 'the': 1,
 'were': 1}

In [24]:
token_probs = {}
for key, item in token_counts.items():
    token_probs[key] = item / total_tokens
token_probs


{'ALice': 0.02127659574468085,
 'Boys': 0.02127659574468085,
 'Football': 0.02127659574468085,
 'John': 0.02127659574468085,
 'They': 0.02127659574468085,
 'We': 0.02127659574468085,
 'a': 0.02127659574468085,
 'about': 0.02127659574468085,
 'airport': 0.02127659574468085,
 'and': 0.06382978723404255,
 'anything': 0.02127659574468085,
 'apples': 0.02127659574468085,
 'are': 0.0425531914893617,
 'at': 0.02127659574468085,
 'bad': 0.02127659574468085,
 'best': 0.02127659574468085,
 'breath,': 0.02127659574468085,
 'but': 0.02127659574468085,
 'cricker': 0.02127659574468085,
 'cricket': 0.02127659574468085,
 'do': 0.02127659574468085,
 'dont': 0.02127659574468085,
 'girls': 0.02127659574468085,
 'good': 0.02127659574468085,
 'great': 0.02127659574468085,
 'hate': 0.02127659574468085,
 'husband': 0.02127659574468085,
 'in': 0.0425531914893617,
 'is': 0.0425531914893617,
 'it': 0.02127659574468085,
 'loves': 0.02127659574468085,
 'oranges': 0.02127659574468085,
 'part': 0.02127659574468085,

In [25]:
np.zeros(4)

array([0., 0., 0., 0.])

In [31]:
# def sentence_embeddings(sentence):
alpha = 1.0
sentence = sentences[0]
tokens = sentence.split()

token_embeddings = np.random.normal(emb_mean, emb_std, (len(tokens), embed_size))
token_coef = np.zeros(len(tokens))
sentence_embedding = np.zeros(embed_size)
for i, token in enumerate(tokens):
    token_coef = alpha / (alpha + token_probs[token])
    embedding_vector = embeddings_index.get(token, np.random.normal(emb_mean, emb_std, (embed_size)))
    sentence_embedding += token_coef * embedding_vector                         

# sentence_embedding = np.mean(token_embeddings, axis=0))                            
sentence_embedding /= len(tokens)

# get first principal compnonent and do vs ← vs − u(u.t)vs

sentence_embedding




array([ 0.16101464,  0.51786323, -0.38713278, -0.17883638,  0.86980614,
        0.39941246, -0.27889833, -0.00501197, -0.00567496,  0.03640601,
       -0.03825773,  0.21203098, -0.18873311, -0.02988734,  0.2721746 ,
        0.05865019, -0.03713701,  0.32011013, -0.31438488, -0.18834033,
       -0.16944376,  0.44106726, -0.09808517, -0.05459551,  0.46569656,
       -1.52304722, -0.3404218 , -0.02537926,  0.44242231,  0.02128208,
        2.94470741,  0.05870609, -0.11448792, -0.09854385,  0.19172827,
        0.03644891,  0.01984152,  0.19727621,  0.38701765, -0.17322937,
       -0.01512535,  0.22884634, -0.19574301,  0.13093162, -0.05403942,
       -0.00365665, -0.15386787, -0.51251356,  0.04755753,  0.23650279])

In [26]:
 np.random.normal(0, 1, (2))

array([-1.01505255, -0.92710008])

In [26]:
a = {'a': 21, 'b': 32}
a.get('a1', 32)

32

In [30]:
a = [[1.3, 2, 3], [1.3, 2, 3], [1.3, 2, 31]]
a = np.asarray(a)
type(a)

numpy.ndarray

In [31]:
a.mean(axis=0)

array([ 1.3       ,  2.        , 12.33333333])

In [2]:
import numpy as np
a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
U, s, V = np.linalg.svd(a, full_matrices=True)

In [4]:
U.shape

(9, 9)

In [6]:
np.var(U, axis=1)

array([0.10322088, 0.10600236, 0.10393291, 0.11087369, 0.08096979,
       0.10535454, 0.07988012, 0.09054558, 0.10810903])

In [22]:
a = np.array([1, 2, 3])
print(np.matmul(np.expand_dims(a, axis=-1), a.reshape([1, 3])))
print(a)
np.expand_dims(a, axis=-1)

[[1 2 3]
 [2 4 6]
 [3 6 9]]
[1 2 3]


array([[1],
       [2],
       [3]])

In [20]:
a = a.reshape([1, a.shape[0]])
a

array([[1, 2, 3]])