In [1]:
from keras.preprocessing.text import Tokenizer
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 9 unique tokens.


In [8]:
from keras.datasets import imdb
from keras import preprocessing
max_features = 10000
maxlen = 20
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [10]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))   # This layer returns a 3D floating-point tensor of shape (samples,sequence_
                                                      # length, embedding_dimensionality).
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(x_train, y_train,
epochs=10,
batch_size=32,
validation_split=0.2)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# IMDB Raw dataset

In [40]:
import warnings
warnings.filterwarnings("ignore")

embed_size = 100 # how big is each word vector
max_features = 10000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use
SEED = 2018

In [12]:
import pandas as pd
data = pd.read_csv("imdb_master.csv",encoding = "ISO-8859-1")

In [14]:
data["Flag"] =[1 if x =="pos" else 0 for x in  data["label"]]
train = data[data["type"]=="train"]
test = data[data["type"]=="test"]

In [45]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
from sklearn.model_selection import train_test_split
import numpy as np

In [25]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [48]:
def load_and_prec(train_df,test_df):
    
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    # lower
    train_df["review"] = train_df["review"].apply(lambda x: x.lower())
    test_df["review"] = test_df["review"].apply(lambda x: x.lower())
    
    # Clean the text
    train_df["review"] = train_df["review"].apply(lambda x: clean_text(x))
    test_df["review"] = test_df["review"].apply(lambda x: clean_text(x))
    
    # Clean numbers
    train_df["review"] = train_df["review"].apply(lambda x: clean_numbers(x))
    test_df["review"] = test_df["review"].apply(lambda x: clean_numbers(x))
    
    # Clean speelings
    train_df["review"] = train_df["review"].apply(lambda x: replace_typical_misspell(x))
    test_df["review"] = test_df["review"].apply(lambda x: replace_typical_misspell(x))
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=2018)
    ## fill up the missing values
    train_X = train_df["review"].fillna("_##_").values
    val_X = val_df["review"].fillna("_##_").values
    test_X = test_df["review"].fillna("_##_").values
    
    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['Flag'].values
    val_y = val_df['Flag'].values 
    test_y = test_df['Flag'].values
    #shuffling the data
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))
    
    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]    
    
    return train_X, val_X, test_X, train_y, val_y,test_y, tokenizer.word_index

In [49]:
train_X, val_X, test_X, train_y, val_y, test_y,word_index = load_and_prec(train,test)

Train shape :  (75000, 6)
Test shape :  (25000, 6)


In [51]:
#def load_glove(word_index):
#    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
#    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
#    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
#
#    all_embs = np.stack(embeddings_index.values())
#    emb_mean,emb_std = all_embs.mean(), all_embs.std()
#    embed_size = all_embs.shape[1]
#
#    # word_index = tokenizer.word_index
#    nb_words = min(max_features, len(word_index))
#    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
#    for word, i in word_index.items():
#        if i >= max_features: continue
#        embedding_vector = embeddings_index.get(word)
#        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
#            
#    return embedding_matrix 
#    
#def load_fasttext(word_index):    
#    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
#    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
#    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)
#
#    all_embs = np.stack(embeddings_index.values())
#    emb_mean,emb_std = all_embs.mean(), all_embs.std()
#    embed_size = all_embs.shape[1]
#
#    # word_index = tokenizer.word_index
#    nb_words = min(max_features, len(word_index))
#    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
#    for word, i in word_index.items():
#        if i >= max_features: continue
#        embedding_vector = embeddings_index.get(word)
#        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
#
#    return embedding_matrix
#
#def load_para(word_index):
#    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
#    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
#    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)
#
#    all_embs = np.stack(embeddings_index.values())
#    emb_mean,emb_std = all_embs.mean(), all_embs.std()
#    embed_size = all_embs.shape[1]
#
#    # word_index = tokenizer.word_index
#    nb_words = min(max_features, len(word_index))
#    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
#    for word, i in word_index.items():
#        if i >= max_features: continue
#        embedding_vector = embeddings_index.get(word)
#        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
#    
#    return embedding_matrix

#from gensim.models import KeyedVectors
#
#EMBEDDING_FILE = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
#embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
#
#word_index = tokenizer.word_index
#nb_words = min(max_features, len(word_index))
#embedding_matrix_4 = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
#for word, i in word_index.items():
#    if i >= max_features: continue
#    if word in embeddings_index:
#        embedding_vector = embeddings_index.get_vector(word)
#        embedding_matrix_4[i] = embedding_vector

In [54]:
embeddings_index = {}
f = open('glove.6B.100d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [57]:
embedding_dim = 100
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [59]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=maxlen,))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [63]:
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['acc'])

history = model.fit(train_X, train_y,
epochs=10,
batch_size=32,
validation_data=(val_X, val_y))
model.save_weights('pre_trained_glove_model.h5')

Instructions for updating:
Use tf.cast instead.
Train on 60000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [64]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(test_X, test_y)



[2.473443458850384, 0.54748]

In [None]:
# Improve by using RNN, LSTM, GRU, 1DConvnet, BiDirectional LSTM and GRU