In [46]:
import pandas as pd
import numpy as np
from os import path
#import tensorboard
from keras.callbacks import TensorBoard
#for plotting
import matplotlib.pyplot as plt

In [2]:
#Constants for running the network
MAX_NUM_WORDS = 20000 #only consider top 200 
EMBEDDING_DIM = 200
TWEET_LENGTH = 140 #max length of sequence
VALIDATION_SPLIT = 0.20 #validation to split, 80% testing 20% validation
MODEL_TYPE_ID = ''
MASTER_DIR = '../../'
MODELS_DIR = path.join(MASTER_DIR,'Models/')

In [3]:
#loading dataset for the analysis
dataset = pd.read_csv('../../Sentiment Analysis Data/input/Processed_Sentiment Analysis Dataset 2.csv',encoding='utf-8')

In [4]:
#loading tweets and creating target labels
tweets = dataset.SentimentText.tolist() # tweets
labels = dataset.Sentiment.tolist() # target labels
labels_dict = {1:"Positive",0:"Negative"} #mapping to labels to meaning

In [5]:
print('found %s number of tweets' %len(tweets))
print('found %s number of labels' %len(labels))
print('first 5 tweets from the dataset\n','\n'.join(tweets[:5]))

found 1578614 number of tweets
found 1578614 number of labels
first 5 tweets from the dataset
 is so sad for my APL friend.............
I missed the New Moon trailer...
omg its already 7:30 :O
.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
i think mi bf is cheating on me!!! T_T


In [6]:
#import utilities for pre-processing the tweets to make them ready for RNN
from keras.preprocessing.text import Tokenizer # required for tokenizing the tweets i.e., breaking it into word array
from keras.preprocessing.sequence import pad_sequences # required for padding the tweets to be of a particular length

In [None]:
for index,tweet in enumerate(tweets):
    if not isinstance(tweet,str):
        print(index,tweet)

In [7]:
#only consider top 200 words in the dataset(by frequency)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(tweets)

In [8]:
#convert each tweet to a sequences and therefore converting the dataset of tweets to a list of sequences 
sequences = tokenizer.texts_to_sequences(tweets)
#creating padded sequences of length_tweet
data = pad_sequences(sequences, maxlen=TWEET_LENGTH)

In [9]:
#retrieve the word index i.e., {'<word>':'<index>'} of the current dataset
word_index = tokenizer.word_index
print('total number of unique words found %s' %len(word_index))

total number of unique words found 635095


In [10]:
#Converts a class vector (integers) labels to binary class matrix.
from keras.utils import to_categorical
labels = to_categorical(np.asarray(labels))

print('the shape of data tensor:',data.shape)
print('the shape of label tensor:',labels.shape)

the shape of data tensor: (1578614, 140)
the shape of label tensor: (1578614, 2)


In [11]:
#splitting the data into training and testing dataset
indices = np.arange(data.shape[0])
np.random.shuffle(indices) #randomly move the indices
data = data[indices] #sync with random shuffle in the previous step
labels = labels[indices] #sync with the shuffle

#determine the spliting point
split_point = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-split_point]
y_train = labels[:-split_point]
x_test = data[-split_point:]
y_test = labels[-split_point:]

<h1> Preparing the embedded layer </h1>

In [12]:
#create dictionary of words and their vectors using the GloVe embeddings
embedding_index = {}
with open(path.join(MASTER_DIR,'Word Embedding/glove.twitter.27B.200d.txt'),encoding='utf-8') as glove_embedding:
    for line in glove_embedding:
        emb_data = line.split()
        word = emb_data[0]
        coefs = np.asarray(emb_data[1:],dtype='float32')
        embedding_index[word] = coefs
        
print('total vocab found',len(embedding_index))

total vocab found 1193514


In [13]:
#show top five words from word_index
i = 0 
for key,value in word_index.items():
    if i < 5:
        print('word:',key,'rank:',value)
        i+=1
    else:
        break

word: i rank: 1
word: to rank: 2
word: the rank: 3
word: a rank: 4
word: my rank: 5


In [24]:
nb_words = min(MAX_NUM_WORDS,len(word_index))
#use embeddding_index and word_index to compute embedding_matrix
embedding_matrix = np.zeros((nb_words,EMBEDDING_DIM))
word_not_in_embedding = []
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        #words not found in embedding_index will be all zeros
        embedding_matrix[i] = embedding_vector
    else:
        #create a list of words not in embedding to statistical analysis
        word_not_in_embedding.append((word,i))

In [25]:
nb_missed_words = len(word_not_in_embedding)
nb_found_words = len(word_index) - nb_missed_words
print('%s number of words found and %s number of words missed'%(nb_found_words,nb_missed_words))

632515 number of words found and 2580 number of words missed


<h1> Create Embedding Layer for the Network</h1>

In [26]:
from keras.layers import Embedding
#set trainable to false as the embedding layer should not be trained during back propogation i.e., training
embedding_layer = Embedding(nb_words,EMBEDDING_DIM,trainable=False,
                            weights=[embedding_matrix],input_length = TWEET_LENGTH,name='embedding_layer_1')

<h2>Training a 1D Convolution Net a.k.a 1D ConvNet</h2>

In [34]:
from keras import Input, Sequential, regularizers, optimizers
from keras.layers import Conv1D, MaxPooling1D,Dense,GlobalMaxPooling1D,Dropout

#training params
default_batch_size = 256 
num_epochs = 8 

#model parameters
num_filters = 64
embed_dim = 300 
weight_decay = 1e-4

MODEL_TYPE_ID = '1D_Conv_Net'
embedding_layer_map = {'embedding_layer_1':path.join(MASTER_DIR,'Word Embedding/glove.twitter.27B.200d.txt')}

model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(len(labels_dict), activation='softmax'))  #

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer_1 (Embedding (None, 140, 200)          4000000   
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 140, 64)           89664     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 70, 64)            0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 70, 64)            28736     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
__________

<h5> Start Training </h5>

In [43]:
tb = TensorBoard(log_dir=path.join(MODELS_DIR,MODEL_TYPE_ID), histogram_freq=num_epochs,
                 write_grads=True, write_images=True, batch_size = default_batch_size, embeddings_freq=default_batch_size, 
                 embeddings_metadata=None)
hist = model.fit(x_train, y_train, validation_data=(x_test, y_test),
          epochs=num_epochs, batch_size=default_batch_size,callbacks=[tb])

Train on 1262892 samples, validate on 315722 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x200264545f8>

In [44]:
model.save('./1d_ConvNet.h5')

In [50]:
%matplotlib inline

In [None]:
plt.figure()
plt.plot(hist.history['acc'], lw=2.0, color='b', label='train')
plt.plot(hist.history['val_acc'], lw=2.0, color='r', label='val')
plt.title('CNN sentiment')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()

In [60]:
trail = ['please die!']
sequences = tokenizer.texts_to_sequences(trail)
dummy = pad_sequences(sequences, maxlen=TWEET_LENGTH)
dummy

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0, 176, 863]])

In [66]:
result = model.predict(dummy,verbose=1)[0]
print(result,np.argmax(result))
print(trail[0],'prediction->',labels_dict[np.argmax(result)])

[0.8296     0.17040004] 0
please die! prediction-> Negative
