In [189]:
import numpy as np
import scipy as sp
import pandas as pd
import scipy.spatial

In [190]:
with open("/data/glove.6B.100d.txt") as f:
    lines = f.readlines()
lines[:1], len(lines)

(['the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062\n'],
 400000)

In [191]:
size = 100
words = []
vocab_by_word = {}
embeddings = np.zeros([len(lines), size])
i = 0 
for line in lines:
    parts = line.split()
    if len(parts) == size + 1:
        words.append(parts[0])
        vocab_by_word[parts[0]] = i
        embeddings[i] = np.asarray(parts[1:], dtype=np.float32)
        i = i + 1
    else:
        print("Malformed", line)
words = np.array(words)

In [192]:
vocab_by_word["unk"]

201534

In [193]:
vocab_by_word["frog"], vocab_by_word["rana"]

(12260, 18711)

In [195]:
sp.spatial.distance.cosine(embeddings[12260], embeddings[18711])

0.7052680176083754

In [196]:
def find_similar_words(word, count = 5):
    target = vocab_by_word[word]
    similiarities = pd.Series(np.zeros([len(embeddings)]))
    for i in range(len(embeddings)):
        similiarities[i] = sp.spatial.distance.cosine(embeddings[target], embeddings[i])
    return words[similiarities.sort_values()[:count+1].index][1:]

In [197]:
find_similar_words("frog")

array(['toad', 'snake', 'frogs', 'monkey', 'turtle'], dtype='<U68')

In [198]:
find_similar_words("snake")

array(['snakes', 'lizard', 'spider', 'rat', 'frog'], dtype='<U68')

In [52]:
find_similar_words("apple")

array(['apple', 'microsoft', 'ibm', 'intel', 'software'], dtype='<U68')

In [54]:
find_similar_words("boy")

array(['boy', 'girl', 'man', 'kid', 'woman'], dtype='<U68')

In [55]:
find_similar_words("husband")

array(['husband', 'wife', 'mother', 'daughter', 'father'], dtype='<U68')

In [56]:
find_similar_words("queen")

array(['queen', 'princess', 'king', 'elizabeth', 'royal'], dtype='<U68')

In [58]:
find_similar_words("kiss")

array(['goodbye', 'hug', 'kisses', 'love', 'cry'], dtype='<U68')

In [199]:
find_similar_words("good")

array(['better', 'sure', 'really', 'kind', 'very'], dtype='<U68')

In [200]:
from bs4 import BeautifulSoup
import re

def preprocess(text):
    text = BeautifulSoup(text.lower(), "html5lib").text #removed html tags
    text = re.sub(r"[\W]+", " ", text)
    return text

def load_imdb(path): 
    import json
    data = []
    with open(path, "r", encoding="utf8") as f:
        for l in f.readlines():
            data.append(json.loads(l))
        comments = pd.DataFrame.from_dict(data)
        comments["content"] = comments["content"].apply(preprocess)
        return comments
        
comments = load_imdb("/data/imdb-comments.json")

In [201]:
comments.head()

Unnamed: 0,content,label,name,sentiment
0,i went and saw this movie last night after bei...,test,0_10.txt,pos
1,actor turned director bill paxton follows up h...,test,10000_7.txt,pos
2,as a recreational golfer with some knowledge o...,test,10001_9.txt,pos
3,i saw this film in a sneak preview and it is d...,test,10002_8.txt,pos
4,bill paxton has taken the true story of the 19...,test,10003_8.txt,pos


In [202]:
word_counts = comments.content.apply(lambda t: len(t.split()))
word_counts.sort_values(ascending = False)[:10]

26954    2498
10995    2303
5890     2178
5612     2152
25049    1854
31258    1829
30917    1762
9776     1746
29346    1618
41033    1550
Name: content, dtype: int64

In [203]:
len(comments.iloc[26954].content.split())

2498

In [184]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras import Sequential
from keras.layers import Embedding, Dense, Flatten, Dropout

In [204]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(comments["content"])
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(comments["content"])
type(encoded_docs), vocab_size

(list, 103891)

In [205]:
str(encoded_docs[:1])[:100]

'[[9, 416, 2, 210, 10, 15, 238, 311, 100, 109, 28203, 5, 33, 3, 173, 352, 4, 1758, 9, 232, 975, 11, 9'

In [170]:
max_length = 2000
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_docs.shape

(50000, 2000)

In [168]:
str(tokenizer.word_index)[:100]

"{'the': 1, 'and': 2, 'a': 3, 'of': 4, 'to': 5, 'is': 6, 'it': 7, 'in': 8, 'i': 9, 'this': 10, 'that'"

In [177]:
num_words = min(len(words), len(tokenizer.word_index) + 1)
num_words

103891

In [178]:
embedding_matrix = np.zeros((num_words, embeddings.shape[1]))
for word, i in tokenizer.word_index.items():
    if i < max_length:
        word_idx_in_embeddings = vocab_by_word.get(word)
        if word_idx_in_embeddings is not None:
            embedding_matrix[i] = embeddings[word_idx_in_embeddings]
        else:
            print("%s is not in word embedding" % word)
embedding_matrix.shape

hadn is not in word embedding


(103891, 100)

In [164]:
comments.sentiment.unique()
labels = np.where(comments.sentiment == "pos", 1, 0)

In [None]:

def build_model(max_length, embeddings):    
    e = Embedding(embedding_matrix.shape[0]
                  , embedding_matrix.shape[1]
                  , weights=[embedding_matrix]
                  , input_length=max_length
                  , trainable=False)
    
    model = Sequential()
    model.add(e)
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.8))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
    return model


model = build_model(max_length, embeddings)
model.summary()

validation_data = padded_docs[comments.label == "test"], labels[comments.label == "test"]

model.fit(padded_docs[comments.label == "train"]
          , labels[comments.label == "train"]
          , validation_data= validation_data
          , batch_size=32
          , epochs=5)

loss, accuracy = model.evaluate(padded_docs[comments.label == "test"], labels[comments.label == "test"])
print('Accuracy: %f' % (accuracy*100))

Max len: 2000
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, 2000, 100)         10389100  
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 1996, 128)         64128     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 399, 128)          0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 395, 128)          82048     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 79, 128)           0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 75, 128)           82048     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0      