# UM Sentiment analysis (Kaggle)
Will use word embeddings from Glove

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import time
import nltk
import glove
SENTENCE_LENGTH_MAX = 32
EMBEDDING_DIM=50
TARGET_SIZE = 2 # Positive or Negative

In [35]:
corpus_dir = './data/'
data_file = os.path.join(corpus_dir, 'training.txt')
testing_file = os.path.join(corpus_dir, 'testdata.txt')

sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This is Mr. Smith's tokenized test.")

['This', 'is', 'Mr.', 'Smith', "'s", 'tokenized', 'test', '.']

In [39]:
import random
fin = open(data_file,'rb')
training_file_name = "./data/training_data.txt"
val_file_name = "./data/validation_data.txt"
training = open(training_file_name, 'wb')
validation = open(val_file_name, 'wb')
for line in fin:
    r = random.random()
    if r < 0.75:
        training.write(line)
    else:
        validation.write(line)
fin.close()
training.close()
validation.close()

In [43]:
data = pd.read_table(data_file, names = ['target','sentence'])
training = pd.read_table(training_file_name, names = ['target','sentence'])
testing = pd.read_table(val_file_name, names = ['target','sentence'])


## Do word embeddings

In [44]:
def corpus_sentence_tokens(corpus_text_file = training_file_name):
    while True:
        with open(corpus_text_file, encoding='utf-8') as f:
            for line in f.readlines():
                sentiment,l = line.split('\t')   # Strip of the initial numbers
                for s in sentence_splitter.tokenize(l):  # Split the lines into sentences (~1 each)
                    tree_banked = tokenizer.tokenize(s)
                    if len(tree_banked) < SENTENCE_LENGTH_MAX:
                        yield (tree_banked, sentiment)
        print("Corpus : Looping")
corpus_sentence_tokens_gen = corpus_sentence_tokens()

In [45]:
next(corpus_sentence_tokens_gen)

(['The', 'Da', 'Vinci', 'Code', 'book', 'is', 'just', 'awesome', '.'], '1')

In [46]:
import glove
glove_corpus = glove.Corpus()

corpus_sentences = [ 
        [ w.lower() for w in next(corpus_sentence_tokens_gen)[0]] # All lower-case
        for _ in range(0,len(training)) 
    ]

# Fit the co-occurrence matrix using a sliding window of 10 words.
t0 = time.time()
glove_corpus.fit(corpus_sentences, window=10)
print("Dictionary length=%d" % (len(glove_corpus.dictionary),))
print("Co-occurrence calculated in %5.1fsec" % (time.time()-t0, ))

Dictionary length=1760
Co-occurrence calculated in   0.0sec


In [47]:
glove_corpus.dictionary['books']

13

In [48]:
word_embedding = glove.Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)

t0 = time.time()
glove_epochs, glove_threads = 20, 4 

word_embedding.fit(glove_corpus.matrix, epochs=glove_epochs, no_threads=glove_threads, verbose=True)

print("%d-d word-embedding created in %5.1fsec = %5.1fsec per epoch" % (
        EMBEDDING_DIM, (time.time()-t0), (time.time()-t0)/glove_epochs*glove_threads, ))

# Add the word -> id dictionary to the model to allow similarity queries.
word_embedding.add_dictionary(glove_corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
50-d word-embedding created in   0.2sec =   0.0sec per epoch


In [49]:
word_embedding.most_similar('vinci')

[('code', 0.98955089805199314),
 ('da', 0.98891475283066421),
 ('sucked..', 0.94839774445000757),
 ('sucked', 0.90613512548569974)]

In [50]:
# word-analogy test
def get_embedding_vec(word):
    idx = word_embedding.dictionary.get(word.lower(), -1)
    if idx<0:
        #print("Missing word : '%s'" % (word,))
        return np.zeros(  (EMBEDDING_DIM, ), dtype='float32')  # UNK
    return word_embedding.word_vectors[idx]

def get_closest_word(vec, number=5):
    dst = (np.dot(word_embedding.word_vectors, vec)
                   / np.linalg.norm(word_embedding.word_vectors, axis=1)
                   / np.linalg.norm(vec))
    word_ids = np.argsort(-dst)
    return [(word_embedding.inverse_dictionary[x], dst[x]) for x in word_ids[:number]
            if x in word_embedding.inverse_dictionary]

In [51]:
analogy_vec = get_embedding_vec('da') + get_embedding_vec('vinci') - get_embedding_vec('movie')
get_closest_word(analogy_vec)

[('da', 0.95469931059057989),
 ('vinci', 0.92913150272767508),
 ('code', 0.87984801284888314),
 ('the', 0.87069663768735828),
 ('sucked..', 0.85681709044768961)]

In [52]:
def test_analogy(s='one two three four'):
    (a,b,c,d) = s.split(' ')
    analogy_vec = get_embedding_vec(b) - get_embedding_vec(a) + get_embedding_vec(c)
    words = [ w for (w,p) in get_closest_word(analogy_vec) if w not in (a,b,c)]
    print("'%s' is to '%s' as '%s' is to {%s}" % (a,b,c,', '.join(words)))

In [53]:
test_analogy('vinci movie harry amazing')

'vinci' is to 'movie' as 'harry' is to {hate, potter, because, really}


In [54]:
glove_embeddings_dir = '../../repos/deep-learning-workshop/notebooks/5-RNN/data/RNN/'
glove_file = 'glove.first-100k.6B.50d.txt'
glove_file_path = os.path.join(glove_embeddings_dir, glove_file)
if os.path.isfile(glove_file_path):
    print('Glove embeddings available locally')

glove_embedding = glove.Glove.load_stanford( glove_file_path )
glove_embedding.word_vectors.shape

Glove embeddings available locally


(100000, 50)

In [55]:
word_embedding.most_similar('harry')

[('potter', 0.98377129783880202),
 ('these', 0.94847748986254254),
 ('hate', 0.93135211004922358),
 ('does', 0.86871969383023206)]

In [56]:
glove_embedding.dictionary.get('plausible',-1)

18736

## Build a RNN model for sentiment

In [57]:
BATCH_SIZE = 64
RNN_HIDDEN_SIZE = EMBEDDING_DIM # ?+1 for capitalisation flag

In [58]:

def get_rnn_embeddings(embedding_vector):
    word_embedding_rnn = np.vstack([ 
        np.zeros( (1, EMBEDDING_DIM,), dtype='float32'),   # This is the 'zero' value (used as a mask in Keras)
        np.zeros( (1, EMBEDDING_DIM,), dtype='float32'),   # This is for 'UNK'  (word == 1)
        embedding_vector.word_vectors,
    ])
    print(word_embedding_rnn.shape)
    return word_embedding_rnn

def get_word_embeddings(word, embedding_vector):
    idx = embedding_vector.dictionary.get(word.lower(), -1)  # since UNK=1 = (-1+2)
    return idx+2

def sentence_inputs_and_outputs(sentences, embedding_vector, one_hot_targets = False):
    from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
    len_of_list = len(sentences)
    
    input_values = np.zeros((len_of_list, SENTENCE_LENGTH_MAX), dtype='int32')
    if one_hot_targets:
        # Add extra dimension here to suit Keras' TimeDistributed(Dense(softmax))
        #   as discussed : https://github.com/fchollet/keras/issues/6363
        target_values  = np.zeros((len_of_list, SENTENCE_LENGTH_MAX, TARGET_SIZE), dtype='int32')
    else:
        target_values  = np.zeros((len_of_list, SENTENCE_LENGTH_MAX), dtype='int32')
        
    for i, (sent, sentiment_tag) in enumerate(sentences):
        for j, word in enumerate(sent):
            input_values[i,j] = get_word_embeddings(word, embedding_vector)
        if one_hot_targets:
            target_values[i, j] = to_categorical(int(sentiment_tag), num_classes=TARGET_SIZE)
        else:
            target_values[i,j] = int(sentiment_tag)
    
    return (input_values, target_values)
    
def batch_for_network_generator(embedding_vector):
    while True:
        batch_of_sentences = [ next(corpus_sentence_tokens_gen) for i in range(BATCH_SIZE) ]    
        yield sentence_inputs_and_outputs(batch_of_sentences, embedding_vector, one_hot_targets=True)


In [59]:
single_batch_input, single_batch_targets = next(batch_for_network_generator(glove_embedding))
single_batch_input.shape, single_batch_targets.shape
#single_batch_input[0]
#single_batch_targets[0]

((64, 32), (64, 32, 2))

In [60]:
#from tensorflow.contrib.keras.api.keras.preprocessing import sequence
from tensorflow.contrib.keras.api.keras.layers import Input, Embedding, GRU, Dense #, Activation
from tensorflow.contrib.keras.api.keras.models import Model

# Hmm : The following is not in the API...
from tensorflow.contrib.keras.python.keras.layers import Bidirectional, TimeDistributed

In [61]:
tokens_input = Input(shape=(SENTENCE_LENGTH_MAX,), dtype='int32', name="SentencesTokens")
embedding_vector = glove_embedding
my_embedding_vector = get_rnn_embeddings(embedding_vector)
# load pre-trained word embeddings into an Embedding layer
#   note that we set trainable = False so as to keep the embeddings fixed
embedded_sequences = Embedding(my_embedding_vector.shape[0],
                                EMBEDDING_DIM,
                                weights=[ my_embedding_vector ],
                                input_length=SENTENCE_LENGTH_MAX,
                                trainable=False, 
                                mask_zero=True,
                                name="SentencesEmbedded") (tokens_input)

#extra_input = ...
aggregate_vectors = embedded_sequences # concat...

rnn_outputs = Bidirectional( GRU(RNN_HIDDEN_SIZE, return_sequences=True),  merge_mode='concat' )(aggregate_vectors)

is_ner_outputs  = TimeDistributed( Dense(TARGET_SIZE, activation='softmax'), 
                                   input_shape=(BATCH_SIZE, SENTENCE_LENGTH_MAX, RNN_HIDDEN_SIZE*2),
                                   name='POS-class')(rnn_outputs)

(100002, 50)


In [62]:
model = Model(inputs=[tokens_input], outputs=[is_ner_outputs])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
SentencesTokens (InputLayer) (None, 32)                0         
_________________________________________________________________
SentencesEmbedded (Embedding (None, 32, 50)            5000100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 100)         30300     
_________________________________________________________________
POS-class (TimeDistributed)  (None, None, 2)           202       
Total params: 5,030,602
Trainable params: 30,502
Non-trainable params: 5,000,100
_________________________________________________________________


In [63]:
model.compile(loss='categorical_crossentropy', optimizer="adam")  # , metrics=['accuracy']

In [64]:
#model.fit(x, y_one_hot)
model.fit_generator(batch_for_network_generator(embedding_vector), 1000, epochs=1, verbose=1)

Epoch 1/1
Corpus : Looping
  84/1000 [=>............................] - ETA: 113s - loss: 0.0965Corpus : Looping
 171/1000 [====>.........................] - ETA: 95s - loss: 0.0914Corpus : Looping


<tensorflow.contrib.keras.python.keras.callbacks.History at 0x12c174d30>

In [29]:
weights_file = './data/sentiment_analysis_rnn_trained_keras.h5'
model.save_weights(weights_file)

In [30]:
if os.path.isfile( weights_file ):
    model.load_weights(weights_file)

In [None]:
test_sentences = [('1\tThis is a good sentence. This is another good one.'), ('0\tThis is a very bad awful sentence. Movie was terrible.')]
test_cases = []
for test in test_sentences:
    sentiment,l = test.split('\t')   # Strip of the initial numbers
    for s in sentence_splitter.tokenize(l):  # Split the lines into sentences (~1 each)
        tree_banked = tokenizer.tokenize(s)
        test_cases.append((tree_banked, sentiment))

input_values, target_values_int = sentence_inputs_and_outputs(test_cases, glove_embedding)
rnn_output = model.predict_on_batch(input_values)


In [66]:
corpus_sentence_tokens_gen_test = corpus_sentence_tokens(val_file_name)
next(corpus_sentence_tokens_gen_test)

(['i', 'liked', 'the', 'Da', 'Vinci', 'Code', 'a', 'lot', '.'], '1')

In [68]:
while True:
    test_sentences = [ next(corpus_sentence_tokens_gen_test) for i in range(len(testing)) ]  
    

Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Looping
Corpus : Loopi

KeyboardInterrupt: 

In [69]:
test_sentences

[(['My',
   'dad',
   "'s",
   'being',
   'stupid',
   'about',
   'brokeback',
   'mountain',
   '...'],
  '0'),
 (['Oh',
   ',',
   'and',
   'Brokeback',
   'Mountain',
   'was',
   'a',
   'terrible',
   'movie',
   '.'],
  '0'),
 (['Brokeback', 'Mountain', 'is', 'fucking', 'horrible..'], '0'),
 (['Ok', 'brokeback', 'mountain', 'is', 'such', 'a', 'horrible', 'movie', '.'],
  '0'),
 (['Then',
   'snuck',
   'into',
   'Brokeback',
   'Mountain',
   ',',
   'which',
   'is',
   'the',
   'most',
   'depressing',
   'movie',
   'I',
   'have',
   'ever',
   'seen..'],
  '0'),
 ([',',
   'she',
   'helped',
   'me',
   'bobbypin',
   'my',
   'insanely',
   'cool',
   'hat',
   'to',
   'my',
   'head',
   ',',
   'and',
   'she',
   'laughed',
   'at',
   'my',
   'stupid',
   'brokeback',
   'mountain',
   'cowboy',
   'jokes..'],
  '0'),
 (['Oh',
   ',',
   'and',
   'Brokeback',
   'Mountain',
   'is',
   'a',
   'TERRIBLE',
   'movie',
   '...'],
  '0'),
 (['Ok', 'brokeback', 'mo

In [70]:
input_values, target_values_int = sentence_inputs_and_outputs(test_sentences, glove_embedding)
rnn_output = model.predict_on_batch(input_values)


In [78]:
rnn_output[0][9]

array([ 0.99064094,  0.00935903], dtype=float32)

In [84]:
cutoff = 0.5
y_preds = []
y_truths = []
for i, case in enumerate(rnn_output):
    sent_length = len(test_sentences[i][0]) + 1
    y_true = int(test_sentences[i][1])
    try: 
        y_pred = case[sent_length+1]
    except:
        y_pred = case[SENTENCE_LENGTH_MAX-1]
    y_truths.append(y_true)
    y_preds.append(y_pred)

In [102]:
from sklearn.metrics import roc_auc_score

In [123]:
pd.DataFrame(y_preds).values.argmax(axis=1)

1142

In [126]:
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(pd.Series(y_truths).values, pd.DataFrame(y_preds).values.argmax(axis=1)))

             precision    recall  f1-score   support

          0       0.97      0.94      0.96       603
          1       0.97      0.99      0.98      1122

avg / total       0.97      0.97      0.97      1725



(1725,)