In [None]:
import string
import pandas as pd
import gzip
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D, MaxPooling1D, TimeDistributed
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, SpatialDropout1D, Layer, Embedding, Bidirectional, GRU, SpatialDropout2D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.layers import concatenate,dot,add,subtract,multiply
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import re
from keras.models import Sequential
from keras.metrics import top_k_categorical_accuracy
import nltk
nltk.download('stopwords')
from keras.callbacks import EarlyStopping
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from keras import backend as K
from keras import initializers
from keras import regularizers, constraints
from keras.utils import plot_model

# Accessing Google Drive and making sure the GPU is active

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): later cells read files through relative paths starting with
# "drive/My Drive/", which presumably relies on the working directory being
# /content — confirm when running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Ask TensorFlow for the name of a GPU device. Being the last expression of the
# cell, the returned string is shown as the cell output.
# NOTE(review): per the TensorFlow docs an empty string means no GPU was found —
# nothing here stops the notebook in that case.
import tensorflow as tf
tf.test.gpu_device_name()

# Global variables

In [None]:
# NOTE(review): the next three constants are not read by any code in the visible
# notebook (MAX_FEATURES is only mentioned in a descriptive string) — presumably
# leftovers; confirm before removing.
MAX_SENTENCE_NUM = 100
MAX_WORD_NUM = 100
MAX_FEATURES = 200000 

MAX_SENT_LENGTH = 200    # word slots per sentence in the padded index tensors and encoder inputs
MAX_SENTS_body = 30      # sentence slots kept per article body
MAX_SENTS_header = 2     # sentence slots kept per headline
MAX_NB_WORDS = 200000    # passed to the Tokenizer as num_words; word ids >= this are not written to the tensors
EMBEDDING_DIM = 300      # output dimension given to the Embedding layers
VALIDATION_SPLIT = 0.2   # validation fraction; matches the validation_split value given to model.fit

# Loading Data

In [None]:
# Project folder on the mounted Drive (relative path).
path = "drive/My Drive/Stance_detection/"

# --- Training split: article bodies and headline/stance rows ---
train_data_body = pd.read_csv(path + 'data/train_bodies.csv')
train_data_stance = pd.read_csv(path + 'data/train_stances.csv')

train_article_id, train_article_body = (
    train_data_body[column] for column in ('Body ID', 'articleBody'))
train_stance_id, train_labels, train_headlines = (
    train_data_stance[column] for column in ('Body ID', 'Stance', 'Headline'))

# --- Test split: same layout as the training files ---
test_data_body = pd.read_csv(path + 'data/test_bodies.csv')
test_data_stance = pd.read_csv(path + 'data/test_stances.csv')

test_article_id, test_article_body = (
    test_data_body[column] for column in ('Body ID', 'articleBody'))
test_stance_id, test_labels, test_headlines = (
    test_data_stance[column] for column in ('Body ID', 'Stance', 'Headline'))

In [None]:
# Attach the article text to every headline/stance row through the shared
# 'Body ID' column (inner join: rows without a match on either side are dropped).
train_df = train_data_stance.merge(train_data_body, on='Body ID', how='inner')
test_df = test_data_stance.merge(test_data_body, on='Body ID', how='inner')

# Text Preprocessing

In [None]:
# Clean String
def cleanString(review,stopWords):
    """
    Clean a text sentence by sentence and record which tokens survived.

    Cleaning rules: the text is split into sentences with ``sent_tokenize`` and
    every sentence into tokens with ``word_tokenize``. A token is kept only if
    its lowercased form is not in ``stopWords`` and the token itself is
    alphanumeric (``str.isalnum``). Kept tokens are lowercased and lemmatized.
    Every sentence contributes its kept words joined by single spaces followed
    by ' . ' — also when no token of the sentence survived.

    Input:   review       - string (in sentence structure)
             stopWords    - collection of lowercase strings to remove from review
                            (a set/frozenset is used as is, anything else is
                            converted to a set once)
    Output:  returnString - cleaned input string
             idx_list     - list of lists, one list per sentence, holding the
                            position of every kept token inside the tokenized,
                            uncleaned sentence
                            e.g. with stopWords = {"this", "is", "a"}:
                            "This is a test." -> returnString = "test . ",
                            idx_list = [[3]]
    """
    # Init the Wordnet Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Membership is tested once per token; a set keeps that O(1) even when the
    # caller hands in a list such as stopwords.words("english").
    if isinstance(stopWords, (set, frozenset)):
        stop_set = stopWords
    else:
        stop_set = set(stopWords)

    cleaned_sentences = []
    idx_list = []
    for sentence in sent_tokenize(review):
        kept = [(idx, lemmatizer.lemmatize(token.lower()))
                for idx, token in enumerate(word_tokenize(sentence))
                if token.lower() not in stop_set and token.isalnum()]
        idx_list.append([idx for idx, _ in kept])
        cleaned_sentences.append(' '.join(word for _, word in kept) + ' . ')

    # Single join instead of repeated string concatenation inside the loop.
    return ''.join(cleaned_sentences), idx_list

# Processing data

In [None]:
# Preprocess every (headline, body) pair of both splits and build the word index.

# The English stop-word list is loaded once and kept as a set. Fetching it from
# the NLTK corpus for every body and every headline is loop-invariant work.
english_stopwords = set(stopwords.words("english"))


def _clean_and_split(raw_text, stop_words):
    """Strip edge punctuation from each whitespace token, clean, and sentence-split.

    Returns a tuple (cleaned_text, sentences): the string produced by
    cleanString and the result of sent_tokenize on that string.
    """
    stripped_tokens = (token.strip(string.punctuation) for token in raw_text.split())
    # Compare by value: `is not ""` tests object identity and only worked
    # through an interpreter implementation detail.
    unpunctuated = ' '.join(token for token in stripped_tokens if token != "")
    cleaned, _ = cleanString(unpunctuated, stop_words)
    return cleaned, sent_tokenize(cleaned)


def _process_split(frame, stop_words, corpus):
    """Preprocess one merged data frame (columns articleBody, Headline, Stance).

    For every row the cleaned body and then the cleaned headline are appended
    to `corpus` (in that order, so the tokenizer sees the texts in the same
    sequence as before). Returns three row-aligned lists:
    (body_sentences, header_sentences, stance_labels).
    """
    split_bodys, split_headers, split_labels = [], [], []
    rows = zip(frame['articleBody'], frame['Headline'], frame['Stance'])
    for raw_body, raw_headline, stance in rows:
        body_text, body_sentences = _clean_and_split(raw_body, stop_words)
        corpus.append(body_text)
        split_bodys.append(body_sentences)

        header_text, header_sentences = _clean_and_split(raw_headline, stop_words)
        corpus.append(header_text)
        split_headers.append(header_sentences)

        split_labels.append(stance)
    return split_bodys, split_headers, split_labels


# Every cleaned text (train first, then test) that the tokenizer is fitted on.
texts = []

# training
bodys, headers, labels = _process_split(train_df, english_stopwords, texts)

# Testing
test_bodys, test_headers, test_labels = _process_split(test_df, english_stopwords, texts)

# Tokenization / word index
# Using the keras Tokenizer class a word index is built; the resulting
# word -> integer dictionary is stored in word_index.
# NOTE(review): the vocabulary is fitted on training AND test texts. Later cells
# look test words up in tokenizer.word_index, so this is kept as is.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS,lower=True, oov_token=None)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
print('Number of tokens: ' + str(len(word_index)))

# Loading and processing word embedding

In [None]:
# Word Embedding
# The embedding matrix is handed to Embedding(..., EMBEDDING_DIM, weights=[...]),
# so its width has to equal EMBEDDING_DIM; tie the two names together.
EMBED_SIZE = EMBEDDING_DIM

# Read "<word> <v1> ... <vN>" lines into a word -> vector dictionary.
embeddings_index = dict()
embedding_path = "drive/My Drive/Google_colab/ADBI_Text_Analysis/src/GloVe/pre-trained/GoogleNews-vectors-negative300.txt"
with open(embedding_path) as f:  # the context manager closes the file even if parsing fails
    for line in f:
        values = line.split()
        # Keep only well-formed rows. This skips e.g. a "<vocab_size> <dim>"
        # header line, which would otherwise be stored as a one-element vector
        # and silently broadcast across a whole matrix row further down.
        if len(values) != EMBED_SIZE + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
print('Loaded %s word vectors.' % len(embeddings_index))


# Build the matrix used to initialise the Embedding layer: row i holds the
# pre-trained vector of the word with index i. Row 0 is never assigned (word
# indices come from word_index), and rows of rare or unknown words stay all-zero.
min_wordCount = 2
absent_words = 0
small_words = 0
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
word_counts = tokenizer.word_counts
for word, i in word_index.items():
    if word_counts[word] <= min_wordCount:
        # Too rare in the corpus: leave the row at zero.
        small_words += 1
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Frequent enough but without a pre-trained vector: row stays all-zeros.
        absent_words += 1
        continue
    embedding_matrix[i] = embedding_vector

# Denominator for the percentages; guards against an empty vocabulary.
vocab_total = max(len(word_index), 1)
print('Total absent words are', absent_words, 'which is', "%0.2f" % (absent_words * 100 / vocab_total),
      '% of total words')
print('Words with '+str(min_wordCount)+' or less mentions', small_words, 'which is', "%0.2f" % (small_words * 100 / vocab_total),
      '% of total words')
print(str(len(word_index)-small_words-absent_words) + ' words to proceed.')

# Converting the data into the format expected by the hierarchical attention network

In [None]:
# preparing data
# Stance label -> class index; the same mapping must be used for every split
# that is scored against the model output.
classes = {'unrelated': 0 , 'agree':1, 'disagree':2, 'discuss':3}
_labels = [classes[t] for t in labels]

# Zero-initialised index tensors of shape (documents, sentence slots, word slots).
# Unused slots keep the value 0.
data_body = np.zeros((len(bodys), MAX_SENTS_body, MAX_SENT_LENGTH), dtype='int32')
data_header = np.zeros((len(headers), MAX_SENTS_header, MAX_SENT_LENGTH), dtype='int32')

# Bodies and headlines are filled by the same procedure; only the target tensor
# (and with it the number of sentence slots) differs.
for documents, tensor in ((bodys, data_body), (headers, data_header)):
    max_sents = tensor.shape[1]
    for i, sentences in enumerate(documents):
        # Sentences beyond the available slots are dropped.
        for j, sent in enumerate(sentences[:max_sents]):
            k = 0
            for word in text_to_word_sequence(sent):
                if k >= MAX_SENT_LENGTH:
                    break  # sentence is full, remaining words are dropped
                # .get() skips tokens missing from the vocabulary instead of
                # aborting the whole cell with a KeyError.
                word_id = tokenizer.word_index.get(word)
                if word_id is not None and word_id < MAX_NB_WORDS:
                    tensor[i, j, k] = word_id
                    k = k + 1

# num_classes pins the one-hot width to the number of known classes, so it does
# not depend on which labels happen to occur in the data.
_labels = to_categorical(np.asarray(_labels), num_classes=len(classes))
print('Shape of header tensor:', data_header.shape)
print('Shape of bofy tensor:', data_body.shape)
print('Shape of label tensor:', _labels.shape)

In [None]:
# preparing test data
# Stance label -> class index. This MUST be the mapping used for the training
# labels ('unrelated': 0, 'agree': 1, 'disagree': 2, 'discuss': 3). A different
# ordering here makes the one-hot columns of the test labels refer to other
# classes than the model outputs, which invalidates the evaluation.
classes = {'unrelated': 0, 'agree': 1, 'disagree': 2, 'discuss': 3}
_test_labels = [classes[t] for t in test_labels]

# Zero-initialised index tensors of shape (documents, sentence slots, word slots).
# NOTE(review): test_data_body previously named the DataFrame read from
# test_bodies.csv; after this cell it is a numpy array, so the merge cell cannot
# be re-run without reloading the CSV files first.
test_data_body = np.zeros((len(test_bodys), MAX_SENTS_body, MAX_SENT_LENGTH), dtype='int32')
test_data_header = np.zeros((len(test_headers), MAX_SENTS_header, MAX_SENT_LENGTH), dtype='int32')

# Bodies and headlines are filled by the same procedure; only the target tensor
# (and with it the number of sentence slots) differs.
for documents, tensor in ((test_bodys, test_data_body), (test_headers, test_data_header)):
    max_sents = tensor.shape[1]
    for i, sentences in enumerate(documents):
        # Sentences beyond the available slots are dropped.
        for j, sent in enumerate(sentences[:max_sents]):
            k = 0
            for word in text_to_word_sequence(sent):
                if k >= MAX_SENT_LENGTH:
                    break  # sentence is full, remaining words are dropped
                # .get() skips tokens missing from the vocabulary instead of
                # aborting the whole cell with a KeyError.
                word_id = tokenizer.word_index.get(word)
                if word_id is not None and word_id < MAX_NB_WORDS:
                    tensor[i, j, k] = word_id
                    k = k + 1

# num_classes pins the one-hot width to the number of known classes, so it does
# not depend on which labels happen to occur in the test data.
_test_labels = to_categorical(np.asarray(_test_labels), num_classes=len(classes))
print('Shape of header tensor:', test_data_header.shape)
print('Shape of bofy tensor:', test_data_body.shape)
print('Shape of label tensor:', _test_labels.shape)

# Custom attention layer

In [None]:
def dot_product(x, kernel):
    """
    Backend-aware dot product of an input tensor with a weight tensor.

    On the TensorFlow backend the kernel gets a trailing axis added before
    ``K.dot`` and that axis is squeezed out of the result again; on any other
    backend ``K.dot`` is applied directly.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The resulting backend tensor.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)
    

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Arguments
        attention_dim: size of the hidden attention representation (first axis
            of W, length of b and of the context vector u).
        W_regularizer, u_regularizer, b_regularizer: regularizers for W, u, b
            (anything accepted by `regularizers.get`).
        W_constraint, u_constraint, b_constraint: constraints for W, u, b
            (anything accepted by `constraints.get`).
        bias: whether the bias b is created and added before the tanh.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self, attention_dim=100,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

        self.attention_dim = attention_dim

        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (attention_dim, features), optional b and the context vector u."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword: on Keras versions whose add_weight takes
        # `name` as first parameter, a positional shape would collide with the
        # explicit name= argument.
        self.W = self.add_weight(shape=(self.attention_dim, input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(self.attention_dim,),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(self.attention_dim,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the steps axis (axis 1)."""
        # Hidden representation of every step: tanh(x . W (+ b)).
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)

        # Unnormalised score of every step against the context vector u.
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis and sum the steps.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer.

        Without this, attention_dim, the regularizers/constraints and the bias
        flag are not part of the serialized configuration.
        """
        config = {
            'attention_dim': self.attention_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

# Model 1

In [None]:
# Model 1: two hierarchical attention encoders (headline and body) built from
# bidirectional GRUs, compared through a normalized dot product.
# NOTE(review): the Model 2 cell assigns `model` again, so when both cells are
# run in order the fit cell trains Model 2, not this one.

# Sentence-level inputs: one sentence = MAX_SENT_LENGTH word indices.
head_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32', name='head_input')
body_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32', name='body_input')
# One embedding layer shared by headline and body, initialised from
# embedding_matrix and fine-tuned during training (trainable=True).
# mask_zero=True makes index 0 a padding value.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)
head_embed = embedding_layer(head_input)
body_embed = embedding_layer(body_input)

# Header encoding
# Word level: bidirectional GRU -> spatial dropout -> attention, wrapped in a
# sub-model that encodes a single sentence. (The *_lstm names are historical;
# the recurrent layers in this cell are GRUs.)
head_l_lstm = Bidirectional(GRU(100, return_sequences=True))(head_embed)
head_l_lstm = SpatialDropout1D(0.5)(head_l_lstm)
head_l_att = AttentionWithContext(100)(head_l_lstm)
head_sentEncoder = Model(head_input, head_l_att)

# Sentence level: apply the sentence encoder to each of the MAX_SENTS_header
# sentences, then run the same GRU -> dropout -> attention stack over them.
head_review_input = Input(shape=(MAX_SENTS_header, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(head_sentEncoder)(head_review_input)
head_l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
head_l_lstm_sent = SpatialDropout1D(0.5)(head_l_lstm_sent)
head_l_att_sent = AttentionWithContext(100)(head_l_lstm_sent)
head_dense = Dense(100,activation='relu')(head_l_att_sent)

#Body encoding
# Same two-level structure as the header, with separate layer instances and
# MAX_SENTS_body sentence slots. `review_encoder` is reused as a temporary name.

body_l_lstm = Bidirectional(GRU(100, return_sequences=True))(body_embed)
body_l_lstm = SpatialDropout1D(0.5)(body_l_lstm)
body_l_att = AttentionWithContext(100)(body_l_lstm)
body_sentEncoder = Model(body_input, body_l_att)

body_review_input = Input(shape=(MAX_SENTS_body, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(body_sentEncoder)(body_review_input)
body_l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
body_l_lstm_sent = SpatialDropout1D(0.5)(body_l_lstm_sent)
body_l_att_sent = AttentionWithContext(100)(body_l_lstm_sent)
body_dense = Dense(100,activation='relu')(body_l_att_sent)

# Dot layer
# normalize=True L2-normalizes both vectors first, i.e. the dot product is their
# cosine similarity; it is concatenated with both document vectors.
dot_layer = dot([head_dense,body_dense],axes = 1, normalize=True)
conc = concatenate([head_dense,body_dense,dot_layer])
dense = Dense(100,activation='relu')(conc)
dense = Dropout(0.3)(dense)
# Four output units, one per stance class.
dense = Dense(4,activation='softmax')(dense)
model = Model(inputs=[head_review_input,body_review_input], outputs=[dense])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Model 2

In [None]:
# Model 2: same hierarchical layout as Model 1, but with bidirectional LSTMs,
# a frozen embedding layer, dropout rate 0.3 and no per-branch Dense layer.
# Running this cell replaces the `model`, encoder and input variables created by
# the Model 1 cell.

# Sentence-level inputs: one sentence = MAX_SENT_LENGTH word indices.
head_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32', name='head_input')
body_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32', name='body_input')
# Shared embedding layer initialised from embedding_matrix and kept fixed
# (trainable=False). mask_zero=True makes index 0 a padding value.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=False,
                            mask_zero=True)
head_embed = embedding_layer(head_input)
body_embed = embedding_layer(body_input)

# Header encoding
# Word level: bidirectional LSTM -> spatial dropout -> attention, wrapped in a
# sub-model that encodes a single sentence.
head_l_lstm = Bidirectional(LSTM(100, return_sequences=True))(head_embed)
head_l_lstm = SpatialDropout1D(0.3)(head_l_lstm)
head_l_att = AttentionWithContext(100)(head_l_lstm)
head_sentEncoder = Model(head_input, head_l_att)

# Sentence level: apply the sentence encoder to each of the MAX_SENTS_header
# sentences, then run the same LSTM -> dropout -> attention stack over them.
head_review_input = Input(shape=(MAX_SENTS_header, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(head_sentEncoder)(head_review_input)
head_l_lstm_sent = Bidirectional(LSTM(100, return_sequences=True))(review_encoder)
head_l_lstm_sent = SpatialDropout1D(0.3)(head_l_lstm_sent)
head_l_att_sent = AttentionWithContext(100)(head_l_lstm_sent)
# Disabled in this variant: the attention output is used directly below.
#head_dense = Dense(100,activation='relu')(head_l_att_sent)

#Body encoding
# Same two-level structure as the header, with separate layer instances and
# MAX_SENTS_body sentence slots. `review_encoder` is reused as a temporary name.

body_l_lstm = Bidirectional(LSTM(100, return_sequences=True))(body_embed)
body_l_lstm = SpatialDropout1D(0.3)(body_l_lstm)
body_l_att = AttentionWithContext(100)(body_l_lstm)
body_sentEncoder = Model(body_input, body_l_att)

body_review_input = Input(shape=(MAX_SENTS_body, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(body_sentEncoder)(body_review_input)
body_l_lstm_sent = Bidirectional(LSTM(100, return_sequences=True))(review_encoder)
body_l_lstm_sent = SpatialDropout1D(0.3)(body_l_lstm_sent)
body_l_att_sent = AttentionWithContext(100)(body_l_lstm_sent)
# Disabled in this variant: the attention output is used directly below.
#body_dense = Dense(100,activation='relu')(body_l_att_sent)

# Dot layer
# normalize=True L2-normalizes both vectors first, i.e. the dot product is their
# cosine similarity; it is concatenated with both document vectors.
dot_layer = dot([head_l_att_sent,body_l_att_sent],axes = 1, normalize=True)
conc = concatenate([head_l_att_sent,body_l_att_sent,dot_layer])
dense = Dense(100,activation='relu')(conc)
dense = Dropout(0.3)(dense)
# Four output units, one per stance class.
dense = Dense(4,activation='softmax')(dense)
model = Model(inputs=[head_review_input,body_review_input], outputs=[dense])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Model fitting

In [None]:
# Train whichever `model` was built last on the headline/body tensors, holding
# out VALIDATION_SPLIT of the samples for validation, and keep the per-epoch
# metrics in `history`.
# `epochs` is the Keras 2 name of this argument; `nb_epoch` is the Keras 1
# spelling, which Keras 2 only tolerates as a deprecated alias (and newer
# releases reject).
# NOTE(review): per the Keras docs validation_split takes the LAST fraction of
# the arrays, before shuffling — confirm this is an acceptable hold-out for
# data ordered like train_df.
history = model.fit([data_header,data_body], _labels,
                    validation_split=VALIDATION_SPLIT, epochs=10, batch_size=128)
# Persist architecture and weights to the project folder on Drive.
model.save('drive/My Drive/Stance_detection/model/HAN_simple_lstm.h5')

# Evaluate Model

In [None]:
# Score the trained model on the test tensors. The model is compiled with
# categorical cross-entropy and metrics=['accuracy'], so the displayed cell
# output is expected to be [loss, accuracy].
model.evaluate([test_data_header,test_data_body],[_test_labels])