https://www.kaggle.com/c/movie-review-sentiment-analysis-kernels-only/kernels

In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',400)
train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


0 - negative <br>
1 - somewhat neutral <br>
2 - neutral <br>
3 - somewhat positive <br>
4 - positive <br>
Each phrase has a phrase ID. Each sentence has a sentenceID

In [3]:
print(len(train.PhraseId.unique()))
print(len(train.SentenceId.unique()))

156060
8529


In [4]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

#designed to be a more adaptable tokenizer for things like tweets and writing

In [5]:
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(train['Phrase'].values) + list(test['Phrase'].values)

#done a bit of reading on this and it seems sometimes this is achieving better results in the field.
#best advice is to try both to be honest
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(train['Phrase'])
test_vectorized = vectorizer.transform(test['Phrase'])

In [6]:
y = train['Sentiment']

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)
ovr.fit(train_vectorized, y)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None)

In [38]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ovr, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
np.mean(scores)

0.5655324688463249

Now to deep learning

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping
tokenizer = Tokenizer(lower = True)
tokenizer.fit_on_texts(full_text)

In [8]:
#replaces words with numeric indices
train_tokenized = tokenizer.texts_to_sequences(train['Phrase'])
test_tokenized = tokenizer.texts_to_sequences(test['Phrase'])

In [9]:
#need to pad them all to be the same length
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen = max_len)
X_test = pad_sequences(test_tokenized, maxlen = max_len)

In [10]:
embedding_path = "crawl-300d-2M.vec"

In [11]:
embed_size = 300
max_features = 30000

In [12]:
#extract the word and the embedding vectors for each row in the file
embeddings_index = {}
f = open(embedding_path, encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 2000000 word vectors.


In [13]:
tokenizer.word_index

{'the': 1,
 'a': 2,
 'of': 3,
 'and': 4,
 'to': 5,
 "'s": 6,
 'in': 7,
 'is': 8,
 'that': 9,
 'it': 10,
 'as': 11,
 'with': 12,
 'for': 13,
 'its': 14,
 'film': 15,
 'an': 16,
 'movie': 17,
 'this': 18,
 'but': 19,
 'be': 20,
 'on': 21,
 'you': 22,
 'by': 23,
 "n't": 24,
 'more': 25,
 'his': 26,
 'not': 27,
 'one': 28,
 'than': 29,
 'about': 30,
 'at': 31,
 'from': 32,
 'or': 33,
 'all': 34,
 'like': 35,
 'are': 36,
 'have': 37,
 'has': 38,
 'so': 39,
 "'": 40,
 'out': 41,
 'story': 42,
 'who': 43,
 'rrb': 44,
 'up': 45,
 'too': 46,
 'good': 47,
 'most': 48,
 'into': 49,
 'lrb': 50,
 'time': 51,
 'much': 52,
 'what': 53,
 'if': 54,
 'characters': 55,
 'no': 56,
 'comedy': 57,
 'their': 58,
 'just': 59,
 'i': 60,
 'some': 61,
 'can': 62,
 'even': 63,
 'life': 64,
 'your': 65,
 'little': 66,
 'does': 67,
 "''": 68,
 'way': 69,
 'well': 70,
 'will': 71,
 'make': 72,
 'been': 73,
 'funny': 74,
 'only': 75,
 'very': 76,
 'he': 77,
 'do': 78,
 'director': 79,
 'any': 80,
 'enough': 81,
 'us'

In [14]:
embeddings_index.get("world")

array([ 0.0754,  0.0087,  0.0769, -0.3932, -0.0417, -0.108 , -0.3165,
        0.1129,  0.403 , -0.0546, -0.0775,  0.1394, -0.3437,  0.0056,
       -0.2149, -0.0792,  0.1391,  0.2636, -0.0282,  0.1147,  0.1472,
       -0.1183, -0.1531, -0.1508,  0.017 ,  0.1189, -0.0835,  0.0755,
        0.2867, -0.4668,  0.2741, -0.0912, -0.1706, -0.2259,  0.0597,
       -0.0526, -0.077 , -0.0305, -0.2019,  0.2152, -0.019 , -0.2326,
       -0.0863, -0.0484, -0.1292, -0.0007,  0.1289,  0.2168,  0.2151,
        0.0812,  0.1065, -0.0938,  0.425 , -0.0092,  0.1133,  0.0941,
       -0.0996,  0.0506,  0.0734, -0.0613,  0.1535, -0.1017,  0.0306,
        0.0815, -0.0224,  0.0576,  0.0482, -0.1198, -0.3098,  0.2956,
       -0.1275,  0.1479,  0.2167,  0.054 ,  0.0729, -0.154 , -0.2815,
       -0.1891, -0.1874,  0.2209,  0.2044,  0.0526, -0.1605, -0.0271,
        0.1313, -0.0396,  0.0286,  0.0649,  0.0636,  0.0853,  0.0248,
        0.3101, -0.0535, -0.0268, -0.0407, -0.2798,  0.1898, -0.0054,
       -0.0443,  0.1

In [15]:
#make the embedding matrix for all our words 
vocab_size = min(max_features, len(tokenizer.word_index))
embedding_matrix = np.zeros((vocab_size + 1, embed_size))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i,:] = embedding_vector

In [16]:
embedding_matrix.shape

(17781, 300)

In [17]:
#make the y variable one hot encoded
from keras.utils import to_categorical
y_ohe = to_categorical(y, num_classes=len(y.unique()))

In [20]:
#build a model
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)
    
    inp = Input(shape = (max_len,))
    x = Embedding(len(tokenizer.word_index) + 1, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    #this is a special kind of dropout that drops or keeps each row or column together depending on if you are dropping
    #rows or columns
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(GRU(units, return_sequences = True))(x1)
    
    #Conv1D generally works well on text data
    """
    Conv1D takes care of neighboring words. A filter length of 5 would imply a context window of 5 words, 
    i.e, the word embeddings of 5 words, not 5 elements within a single embedding. 
    Images have height and width, so we use conv2d, sentences are linear lists of words, so conv1d. 
    The "2d" or "3d" specifies how we loop through the matrix; its not the rank of the convolution kernel itself.
    """
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x1)
    max_pool1_gru = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool3_gru = GlobalAveragePooling1D()(x3)
    max_pool3_gru = GlobalMaxPooling1D()(x3)
    
    
    
    x_lstm = Bidirectional(LSTM(units, return_sequences = True))(x1)
    
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x1)
    max_pool1_lstm = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(x3)
    max_pool3_lstm = GlobalMaxPooling1D()(x3)
    
    #Concatenates all the layers we have just made
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    #Batch Normalization increases the speed for training
    #Often it is used between the linear and non-linear layers in your network because it normalizes
    #the input to your activation function but can be used anywhere you desire
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    
    #Final prediction layer
    x = Dense(5, activation = "sigmoid")(x)
    
    
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 128, epochs = 1, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])
    model = load_model(file_path)
    return model

In [21]:
model1 = build_model1(lr = 1e-3, lr_d = 1e-10, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=2, dense_units=32, dr=0.1, conv_size=32)

Train on 140454 samples, validate on 15606 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 0.31962, saving model to best_model.hdf5
