In [None]:
import numpy as np
import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import numpy as np
import sys
import os

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping,ReduceLROnPlateau, ModelCheckpoint, TensorBoard

from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPool1D, merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, Dense, Input, Flatten, CuDNNGRU,CuDNNLSTM, concatenate, Lambda 

from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

from Attention import AttentionLayer
import nltk
#nltk.download('punkt')

import matplotlib.pyplot as plt

from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score


### Model parameters 

In [None]:
# Maximum number of word ids kept per sentence (last axis of the data tensor).
MAX_SENT_LENGTH = 100

# Maximum number of sentences kept per review (middle axis of the data tensor).
MAX_SENTS = 100

# Vocabulary cap handed to the Keras Tokenizer; word ids at or above it are skipped.
MAX_NB_WORDS = 200000

# Number of columns in the embedding matrix.
EMBEDDING_DIM = 100

# Fractions of the samples held out for validation and for the final test set.
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.05

# Rate passed to every Dropout layer in the model.
DROP_RATE = 0.45

# Forwarded to AttentionLayer as attention_type; one of {'self', 'global', 'local'}.
ATTENTION_TYPE = 'local'

# True builds the recurrent layers with CuDNNGRU, False with the plain GRU.
GPU = False

### Cleaning the text, tokenization and creating training, validation and test tensors

In [None]:
def clean_text(string):
    """Normalize a raw review string.

    Removes every backslash, single quote and double quote, then trims
    surrounding whitespace and lower-cases the result.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    cleaned = string
    # Applied in order: backslashes, single quotes, double quotes.
    for pattern in (r"\\", r"\'", r"\""):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip().lower()


data_train = pd.read_csv('labeledTrainData.tsv', sep='\t')

from nltk import tokenize

reviews = []   # one list of sentence strings per review
labels = []    # sentiment label per review
texts = []     # cleaned full text per review (used to fit the tokenizer)

for raw_review, sentiment in zip(data_train.review, data_train.sentiment):
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), so results could differ between machines.
    text = clean_text(BeautifulSoup(raw_review, 'html.parser').get_text())
    texts.append(text)
    reviews.append(tokenize.sent_tokenize(text))
    labels.append(sentiment)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index


# Create and fill the 3D tensor (review, sentence, word position) with word_index numbers.
# Unused positions stay 0.

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    # Sentences beyond MAX_SENTS are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break  # sentence slot is full; remaining words are dropped
            word_id = word_index.get(word)
            # Skip unknown words and ids outside the vocabulary cap; k only
            # advances for words that are actually stored.
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i, j, k] = word_id
                k += 1

labels = to_categorical(np.asarray(labels))
print('X tensor shape:', data.shape)
print('Y tensor shape:', labels.shape)


# Shuffle the data and split it into train, validation and test sets.

SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)  # fixed seed so the split is reproducible across runs

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

nb_val = int(VALIDATION_SPLIT * data.shape[0])
nb_test = int(TEST_SPLIT * data.shape[0])
# Slice with explicit forward boundaries: negative slicing breaks when a split
# size is 0 (data[-0:] is the whole array, data[:-0] is empty).
nb_train = data.shape[0] - nb_val - nb_test
val_end = nb_train + nb_val

x_train = data[:nb_train]
y_train = labels[:nb_train]
x_val = data[nb_train:val_end]
y_val = labels[nb_train:val_end]
x_test = data[val_end:]
y_test = labels[val_end:]


### Embedding matrix 

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line is split on whitespace: the first field is the word, the rest are
# its coefficients.

embeddings_index = {}
# Context manager so the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


# Use embeddings_index and word_index to build embedding_matrix, one row per word id.

# The matrix starts with random values, so words missing from the GloVe file
# keep a random vector (they are NOT all-zeros). Rows that no word id maps to
# also stay random.
# NOTE(review): row 0 is presumably the padding id and is therefore random too - confirm this is intended.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


### Creating hidden layers, applying attention to sentence and document levels

In [None]:
# Recurrent layer class shared by both levels: CuDNNGRU when GPU is set,
# otherwise the plain GRU.
rnn_cell = CuDNNGRU if GPU else GRU

# SENTENCE LEVEL
# Sub-model that turns one sentence (a sequence of word ids) into a single
# vector: embedding -> dropout -> bidirectional GRU -> attention -> dropout.
sent_ints = Input(shape=(None,))
embedding_layer = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    trainable=False,  # embedding weights are not updated during training
)
sent_wv = embedding_layer(sent_ints)
sent_wv_dr = Dropout(DROP_RATE)(sent_wv)

sent_rnn = Bidirectional(
    rnn_cell(units=50, return_sequences=True),
    merge_mode='concat',
    weights=None,
)
sent_wa = sent_rnn(sent_wv_dr)

# Attention vector for the sentence (the coefficients are returned as well).
sent_attention = AttentionLayer(return_coefficients=True, attention_type=ATTENTION_TYPE)
sent_att_vec, sent_att_coeffs = sent_attention(sent_wa)
sent_att_vec_dr = Dropout(DROP_RATE)(sent_att_vec)
sent_encoder = Model(sent_ints, sent_att_vec_dr)

# NOTE(review): np.shape() is applied to a Model object here, not a tensor -
# the printed value is probably not a meaningful shape; confirm.
print(np.shape(sent_encoder))
print(sent_wa.shape)
print(sent_att_vec_dr.shape)


# DOCUMENT LEVEL
# The sentence encoder is applied to every sentence of a document, then the
# same GRU + attention + dropout pattern runs over the sentence vectors.
doc_ints = Input(shape=(None, None,))
sent_att_vecs_dr = TimeDistributed(sent_encoder)(doc_ints)

doc_rnn = Bidirectional(
    rnn_cell(units=50, return_sequences=True),
    merge_mode='concat',
    weights=None,
)
doc_sa = doc_rnn(sent_att_vecs_dr)

# Attention vector for the document.
doc_attention = AttentionLayer(return_coefficients=True, attention_type=ATTENTION_TYPE)
doc_att_vec, doc_att_coeffs = doc_attention(doc_sa)
doc_att_vec_dr = Dropout(DROP_RATE)(doc_att_vec)


### Creating the output layer and assembling the NN model 

In [None]:
# One output unit per label column of the one-hot training targets.
n_cats = np.shape(y_train)[1]

# Softmax head on top of the dropped-out document attention vector.
output_layer = Dense(units=n_cats, activation='softmax')
preds = output_layer(doc_att_vec_dr)

# Full model: document word-id tensor in, class probabilities out.
Classifier = Model(doc_ints, preds)

Classifier.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)


### Training the model

In [None]:
# Fit on the training split and evaluate on the validation split after each
# epoch; the returned History object is kept for the plots.
history = Classifier.fit(
    x=x_train,
    y=y_train,
    batch_size=50,
    epochs=20,
    validation_data=(x_val, y_val),
)

### Plots and metrics 

In [None]:
# Per-epoch metrics recorded by Classifier.fit.
hist_log = history.history

# Keras releases before 2.3 log the metric as 'acc'/'val_acc'; later releases
# use the name given to compile() ('accuracy'). Accept either so the cell does
# not raise KeyError after an upgrade.
acc_key = 'acc' if 'acc' in hist_log else 'accuracy'

train_acc = hist_log[acc_key]
val_acc = hist_log['val_' + acc_key]
train_loss = hist_log['loss']
val_loss = hist_log['val_loss']

# Epoch numbers for the x axis, starting at 1.
epcs = range(1, len(train_acc) + 1)


fig, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: accuracy.
ax1.plot(epcs, train_acc, color='b', label="Training")
ax1.plot(epcs, val_acc, color='r', label="Validation")

ax1.set_facecolor('lightyellow')
ax1.grid(True)
ax1.set_xlabel('Epoch')
ax1.set_title('Accuracy')
ax1.legend()

# Right panel: loss.
ax2.set_facecolor('lightcyan')
ax2.plot(epcs, train_loss, color='b', label="Training")  # label typo "Ttraining" fixed
ax2.plot(epcs, val_loss, color='r', label="Validation")
ax2.grid(True)
ax2.set_xlabel('Epoch')
ax2.set_title('Loss')
ax2.legend()

plt.show()

In [None]:
# Run the trained classifier on the held-out test tensor.
prediction = Classifier.predict(x=x_test)

def transfer2cat (m):
    """Convert rows of scores into 1-based category numbers.

    Parameters
    ----------
    m : object with a ``tolist()`` method yielding a list of rows
        (for example a 2D numpy array of probabilities or one-hot labels).

    Returns
    -------
    list of int
        For each row, the position of its largest value plus one. When a row
        contains the maximum more than once, the first position wins.
    """
    return [row.index(max(row)) + 1 for row in m.tolist()]

# Convert the 2D score / one-hot arrays into lists of 1-based category numbers.
# Both names are rebound to the converted lists, so only convert while they are
# still 2D: re-running this step would otherwise feed a flat list back into
# transfer2cat and fail.
if np.ndim(prediction) == 2:
    prediction = transfer2cat(prediction)
if np.ndim(y_test) == 2:
    y_test = transfer2cat(np.array(y_test))


# Multi-class scores are averaged with weights proportional to class support.
print ('Accuracy:', accuracy_score(y_test, prediction))
print ('F1 score:', f1_score(y_test, prediction, average='weighted'))
print ('Recall:', recall_score(y_test, prediction, average='weighted'))
print ('Precision:', precision_score(y_test, prediction, average='weighted'))
print ('\n classification report:\n', classification_report(y_test,prediction))
print ('\n confusion matrix:\n',confusion_matrix(y_test, prediction))
