In [2]:
import io
import os
import string

import gensim
import nltk
import numpy as np

In [3]:
## transform files from directory 

class MySentences(object):
    """Memory-friendly iterator over the tokenised lines of every file in a directory.

    Each iteration re-reads the files from disk and yields one list of
    lower-cased, whitespace-separated tokens per line with the English stop
    words removed, so the corpus never has to be held in memory and the
    object can be iterated several times.

    Parameters
    ----------
    dirname : str
        Directory holding the UTF-8 encoded text files (one entity per line).
    strip_punctuation : bool, optional
        When True, leading/trailing ASCII punctuation is stripped from every
        token before the stop-word check, and tokens that end up empty are
        dropped. Defaults to False, which keeps the plain whitespace
        tokenisation (tokens such as ``love.`` or ``"hi`` stay distinct
        vocabulary entries).
    """

    def __init__(self, dirname, strip_punctuation=False):
        self.dirname = dirname
        self.strip_punctuation = strip_punctuation
        self.stop = set(nltk.corpus.stopwords.words('english'))

    def __iter__(self):
        # Sorted so the sentence order does not depend on the arbitrary
        # order in which os.listdir returns the directory entries.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            if not os.path.isfile(path):
                continue  # sub-directories cannot be read line by line
            # io.open decodes UTF-8 on both Python 2 and Python 3, and the
            # with-block closes the handle once the file has been consumed.
            with io.open(path, encoding='utf-8') as handle:
                for line in handle:
                    tokens = line.lower().split()
                    if self.strip_punctuation:
                        tokens = [t.strip(string.punctuation) for t in tokens]
                    yield [t for t in tokens if t and t not in self.stop]
 
## Every file in CORPUS_DIR needs UTF-8 encoding, and each entity (question) must be on its own line.
# The corpus location can be overridden with the CORPUS_DIR environment
# variable; the fallback is the path that used to be hard-coded here.
CORPUS_DIR = os.environ.get('CORPUS_DIR', 'C:/Users/bbauer/Desktop/Neuer Ordner')
sentences = MySentences(CORPUS_DIR)  # a memory-friendly iterator
# No model is trained in this cell: the following cell trains it with
# explicit hyper-parameters, so a default-parameter run here would only be
# thrown away.

In [4]:
## Train the word2vec model on the corpus iterator.
model = gensim.models.word2vec.Word2Vec(
    sentences,
    iter=10,      # training passes over the corpus
    min_count=4,  # words seen fewer times than this are ignored
    window=5,     # context window size
    size=25,      # dimensionality of the word vectors
    workers=4,    # worker threads
)

In [77]:
# Print the ten most common words. This relies on gensim ordering
# index2word by descending frequency (its default vocabulary sorting).
# Slicing instead of indexing 0..9 avoids an IndexError when the
# vocabulary holds fewer than ten words.
for word in model.wv.index2word[:10]:
    print(word)

know
see
like
want
love
feel
im
"hi
get
going


In [6]:
# Similarity score between the vectors of two vocabulary words.
married_partner_similarity = model.wv.similarity('married', 'partner')
print(married_partner_similarity)

0.5286053245101119


In [7]:
# Ten nearest neighbours of "love" in the embedding space (topn=10 is the default).
model.wv.most_similar(positive=["love"], topn=10)

[(u'true', 0.8646132946014404),
 (u'truly', 0.8112808465957642),
 (u'love.', 0.7119335532188416),
 (u'desire', 0.703091561794281),
 (u'forever', 0.691092848777771),
 (u'happiness', 0.6856699585914612),
 (u'real', 0.6750628352165222),
 (u'truely', 0.673657238483429),
 (u'soulmate', 0.6717501878738403),
 (u'partner,', 0.6608835458755493)]

In [8]:
# Ten nearest neighbours of "money" in the embedding space (topn=10 is the default).
model.wv.most_similar(positive=["money"], topn=10)

[(u'bills', 0.8869832754135132),
 (u'money.', 0.845816969871521),
 (u'debt', 0.8036943674087524),
 (u'property', 0.7937832474708557),
 (u'bills.', 0.7893637418746948),
 (u'$', 0.7865709066390991),
 (u'income', 0.7794181704521179),
 (u'cover', 0.7759976387023926),
 (u'rent', 0.7684590816497803),
 (u'debts', 0.7665688395500183)]

In [9]:
# Ten nearest neighbours of "baby" in the embedding space (topn=10 is the default).
model.wv.most_similar(positive=["baby"], topn=10)

[(u'child', 0.8727279305458069),
 (u'father', 0.8460017442703247),
 (u'son', 0.8367814421653748),
 (u'mom', 0.821607768535614),
 (u'mother', 0.8206684589385986),
 (u'youngest', 0.8034934401512146),
 (u'dad', 0.802269697189331),
 (u'daughter', 0.7974161505699158),
 (u'kid', 0.7878149151802063),
 (u'kids', 0.7786372303962708)]

In [10]:
# Ten nearest neighbours of "chakras" in the embedding space (topn=10 is the default).
model.wv.most_similar(positive=["chakras"], topn=10)

[(u'chakra', 0.8600382208824158),
 (u'chakras.', 0.8585511445999146),
 (u'guided', 0.7996877431869507),
 (u'angelic', 0.7888866066932678),
 (u'cleansing', 0.7875677347183228),
 (u'cord', 0.7859991192817688),
 (u'aura', 0.7838760614395142),
 (u'recovery?"', 0.7739477157592773),
 (u'cleanse', 0.7633254528045654),
 (u'blockages', 0.7625334858894348)]

In [11]:
# Ten nearest neighbours of "carrier" in the embedding space (topn=10 is the default).
# NOTE(review): the neighbours shown below ('carrer', 'education', ...) suggest the
# corpus mostly uses "carrier" as a spelling of "career" - confirm against the data.
model.wv.most_similar(positive=["carrier"], topn=10)

[(u'carrer', 0.9075772166252136),
 (u'education', 0.8900696039199829),
 (u'finance', 0.8731331825256348),
 (u'studies', 0.8555986285209656),
 (u'sucessful', 0.8517412543296814),
 (u'academic', 0.8509454727172852),
 (u'accounting', 0.8422917127609253),
 (u'sector', 0.8409655094146729),
 (u'successful?"', 0.8398696780204773),
 (u'cosmetology', 0.8387865424156189)]

In [12]:
# Ten nearest neighbours of "work" in the embedding space (topn=10 is the default).
model.wv.most_similar(positive=["work"], topn=10)

[(u'work.', 0.8619329929351807),
 (u'working', 0.8003853559494019),
 (u'shift', 0.7973617315292358),
 (u'work,', 0.7732657194137573),
 (u'business', 0.768303632736206),
 (u'function', 0.7581888437271118),
 (u'part', 0.7326571941375732),
 (u'sort', 0.7282631993293762),
 (u'school', 0.7149479985237122),
 (u'discuss', 0.7033078670501709)]

In [13]:
# Ten nearest neighbours of "christmas" in the embedding space (topn=10 is the default).
model.wv.most_similar(positive=["christmas"], topn=10)

[(u'xmas', 0.8376498818397522),
 (u'christmas.', 0.8252521753311157),
 (u'holidays', 0.8088715672492981),
 (u'evening', 0.7942546010017395),
 (u'saturday', 0.7884491086006165),
 (u'meal', 0.7874487042427063),
 (u'nye', 0.7728686928749084),
 (u'weekend', 0.7668398022651672),
 (u'sunday', 0.7583177089691162),
 (u'dinner', 0.747183620929718)]

In [14]:
# Which word of the sentence fits least with the others?
candidate_words = "i work for money and joy".split()
print(model.wv.doesnt_match(candidate_words))

joy


In [60]:
# Map a list of words to their integer positions in the word-vector vocabulary.

def convert_data_to_index(string_data, wv):
    """Return the vocabulary index of every word in `string_data` that `wv` knows.

    Words missing from `wv` are skipped, so the result can be shorter than
    the input and its positions do not line up one-to-one with it.
    """
    return [wv.vocab[token].index for token in string_data if token in wv]

# Small demo: only the words present in the vocabulary receive an index.
str_data = [
    'this', 'is', 'a',
    'hello', 'test', 'love',
]
index_data = convert_data_to_index(string_data=str_data, wv=model.wv)
print(str_data, index_data)

(['this', 'is', 'a', 'hello', 'test', 'love'], [2164, 1535, 4])


In [72]:
# Look up the vocabulary index of 'love' directly.
model.wv.vocab['love'].index

4

In [64]:
# Copy every word vector into a dense matrix whose row i belongs to the
# word at vocabulary index i (model.wv.index2word[i]).
# The width is read from the model instead of repeating the size=25 used at
# training time, so the cell keeps working when that setting changes.
vocab_size = len(model.wv.vocab)
embedding_dim = model.wv.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for i, word in enumerate(model.wv.index2word):
    # Every word in index2word has a vector, so no missing-vector check is needed.
    embedding_matrix[i] = model.wv[word]

In [65]:
# (number of vocabulary words, word-vector size)
embedding_matrix.shape

(55069L, 25L)

In [68]:
# Matrix row belonging to 'love'. The index is looked up instead of being
# hard-coded as 4, because vocabulary positions change when the model is retrained.
embedding_matrix[model.wv.vocab['love'].index]

array([ 1.69550574,  0.09775967,  1.43685794,  1.31089389, -0.90071911,
        0.43822721,  2.78542066, -3.53648949,  1.4707433 ,  1.79783607,
       -1.0285027 , -2.02234173, -2.15695143,  0.40224507, -0.2398791 ,
        1.23964202, -0.06145307,  1.47738457, -3.73910785, -0.49204144,
        4.01834393,  2.59854293, -0.88686156,  1.72344446,  1.45810735])

In [70]:
# The model's own vector for 'love', for comparison with the embedding_matrix row above.
model.wv['love']

array([ 1.6955057 ,  0.09775967,  1.4368579 ,  1.3108939 , -0.9007191 ,
        0.4382272 ,  2.7854207 , -3.5364895 ,  1.4707433 ,  1.7978361 ,
       -1.0285027 , -2.0223417 , -2.1569514 ,  0.40224507, -0.2398791 ,
        1.239642  , -0.06145307,  1.4773846 , -3.7391078 , -0.49204144,
        4.018344  ,  2.598543  , -0.88686156,  1.7234445 ,  1.4581074 ],
      dtype=float32)

# Text classification in Keras using pre-trained embeddings

In [None]:
# source: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#
# NOTE: `texts`, `labels`, `labels_index` and `GLOVE_DIR` are used further down
# but are not defined anywhere in this notebook as shown; they have to be
# provided before this cell can run.

from keras.layers import Conv1D, Dense, Flatten, Input, MaxPooling1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Settings referenced by the rest of this cell. They used to be commented
# out (and the vocabulary limit was spelled MAX_NUM_WORDS, while the code
# below reads MAX_NB_WORDS), which made the cell fail with a NameError.
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100  # matches the 100-dimensional GloVe file loaded below
VALIDATION_SPLIT = 0.2

## Only the MAX_NB_WORDS most commonly occurring words in the dataset are considered,
### and the sequences are padded/truncated to a length of MAX_SEQUENCE_LENGTH words.

# `num_words` is the Keras 2 spelling of this argument (`nb_words` was the
# Keras 1 name); the rest of this cell already uses the Keras 2 API.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
# %-formatting prints the same text on Python 2 and 3; print('...', x) would
# print a tuple repr under the Python 2 print statement.
print('Shape of data tensor: %s' % (data.shape,))
print('Shape of label tensor: %s' % (labels.shape,))

# Split the data into a training set and a validation set.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at a non-negative boundary. With negative slices a validation size
# of 0 (tiny dataset or VALIDATION_SPLIT == 0) turns data[:-0] into an empty
# training set and data[-0:] into a validation set holding every sample.
nb_training_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_training_samples]
y_train = labels[:nb_training_samples]
x_val = data[nb_training_samples:]
y_val = labels[nb_training_samples:]

# Parse the GloVe text file into a {word: vector} dictionary.
# Each non-empty line holds a word followed by its coefficients.
embeddings_index = {}
# Explicit UTF-8 decoding avoids platform-default codec errors (e.g. cp1252
# on Windows); the with-block closes the file even if parsing fails.
with io.open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of raising IndexError
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Build the weight matrix for the Embedding layer: row `row` holds the
# pre-trained vector of the word the tokenizer mapped to index `row`.
vocabulary_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocabulary_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # Words without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix[row] = glove_vector
        
from keras.layers import Embedding

# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

from keras.layers import GlobalMaxPooling1D

# 1D convnet over the embedded token sequence, ending in a softmax over the label classes.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# Global max pooling over whatever sequence length is left. The previous
# MaxPooling1D(35) + Flatten() was only a global pool when
# MAX_SEQUENCE_LENGTH was 1000 (1000 -> 996 -> 199 -> 195 -> 39 -> 35).
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)

preds = Dense(len(labels_index), activation='softmax')(x)

# NOTE: this rebinds `model`, which the earlier cells use for the word2vec model.
model = Model(sequence_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Happy learning: train for two epochs, validating on the held-out split.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=128,
)
## For comparison, the Embedding layer can also be initialised from scratch
# (no pre-trained weights) with its weights learned during training.
# Only the layer definition changes; as written, this statement merely
# creates the layer - the model has to be rebuilt with it for it to take effect.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
)