# Stance Detection using GloVe Word Embeddings

## Load Data

In [1]:
# When True, the already-preprocessed dataset is loaded from `preload_path`;
# when False, it is rebuilt from the raw CSV files (see the PRELOAD checks below).
PRELOAD = True
# Pickle file that holds the preprocessed train/test data.
preload_path = "data_dump_glove.data"

In [2]:
if PRELOAD:
    import pickle

    # NOTE(security): unpickling can execute arbitrary code. Only load a dump
    # that was produced by this notebook or by another trusted source.
    # The context manager closes the file handle once the data is read.
    with open(preload_path, "rb") as preload_file:
        data = pickle.load(preload_file)

    # Expected layout (as written by the preprocessing cell): "X_train" and
    # "X_test" hold [headline, article] pairs, "Y_train" the training labels.
    data_train = data["X_train"]
    labels_train = data["Y_train"]
    data_test = data["X_test"]

In [3]:
import re


def _clean_text(text, stop_words):
    """Strip punctuation, lower-case and remove stop words from one text.

    Args:
        text: raw headline or article string.
        stop_words: collection of lower-case words to drop.

    Returns:
        The remaining words joined by single spaces.
    """
    # Remove punctuation only. Line breaks must survive this step so the
    # whitespace split below still separates the words on either side of them;
    # deleting "\n" / "\r" outright would glue the last word of one line to the
    # first word of the next.
    text = re.sub(r'[^\w\s]', '', text)
    return " ".join(word for word in text.lower().split() if word not in stop_words)


def _clean_pairs(rows, stop_words):
    """Clean every (headline, article) row; returns a list of [headline, article] lists."""
    return [[_clean_text(headline, stop_words), _clean_text(article, stop_words)]
            for headline, article in rows]


if not PRELOAD:

    import pickle

    import numpy as np
    import pandas as pd
    from nltk.corpus import stopwords

    dtypes_train = {"id":np.int64, "text":str, "author":str, "title":str, "label":np.int64}
    dtypes_test = {"id":np.int64, "text":str, "author":str, "title":str}

    # Kept for backward compatibility; nothing in this cell consumes it.
    SEED = 1234

    # load data
    # Rows with any missing field are dropped from both files.
    train_df = pd.read_csv("data/train.csv", dtype=dtypes_train)
    train_df = train_df.dropna()
    # Select the two text columns by name so the (headline, article) order does
    # not depend on the column order of the CSV file.
    X_train = train_df[["title", "text"]].values
    Y_train = train_df['label'].values
    print("Train DIMS \nX dims: {} Y dims: {}".format(X_train.shape, Y_train.shape))

    # Apply the declared dtypes so the text columns are always parsed as strings.
    test_df = pd.read_csv("data/test.csv", dtype=dtypes_test)
    # NOTE(review): dropping rows here means X_test no longer lines up one-to-one
    # with the ids in test.csv.
    test_df = test_df.dropna()
    X_test = test_df[["title", "text"]].values
    print("Test DIMS \nX dims: {}".format(X_test.shape))
    print("Num Labels: ", np.unique(Y_train))

    # preprocessing
    stop_words = set(stopwords.words("english"))

    train_data = _clean_pairs(X_train, stop_words)
    test_data = _clean_pairs(X_test, stop_words)

    data_save = {
            "X_train": train_data,
            "Y_train": Y_train,
            "X_test": test_data,
    }
    # Write to the same path the PRELOAD branch reads from, and close the file
    # handle deterministically.
    with open(preload_path, "wb") as dump_file:
        pickle.dump(data_save, dump_file)
    print("Saved pre-preocessed data as : {}".format(preload_path))

    # Expose the same names the PRELOAD branch defines, so the rest of the
    # notebook runs regardless of which branch produced the data.
    data_train = train_data
    labels_train = Y_train
    data_test = test_data

## Prepare Data

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

Using TensorFlow backend.


path :  c:\users\joavi\anaconda3\envs\py36\lib\site-packages\tensorflow\python\_pywrap_tensorflow_internal.pyd 
name :  _pywrap_tensorflow_internal


In [5]:
# Display one preprocessed training example together with its label.
print("100th sample\n")
sample_pair = data_train[100]
for template, value in (
    ("headline : {}\n", sample_pair[0]),
    ("article : {}\n", sample_pair[1]),
    ("label : {}\n", labels_train[100]),
):
    print(template.format(value))

100th sample

headline : snap shares leap 44 debut investors doubt value vanish  new york times

article : snapchat business built large part disappearing messages adding animated dog ears flower crowns users selfies thursday business worth 34 billion     market value   media company cbs three times size another social media company twitter snapchat made paper billionaires   founders five times making stock market debut spectacular fashion     shares rising 44 percent first day trading     snapchats parent snap inc blazed trail technology darlings like uber spotify remain privately held elated wall street institutions eager prominent initial public offering surfaced months company entranced investors despite litany red flags like enormous losses expected persist years slowdown   user growth rates ownership structure gives snapchats founders control decades come shadows onetime tech highfliers since crashed earth twitter valued nearly 32 billion end first day trading wall street values 

In [6]:
# Hold out 20% of the labelled training data for evaluation; the fixed
# random_state keeps the split repeatable. Note that X_test / y_test are this
# held-out slice of the labelled data, not the separate `data_test` set.
X_train, X_test, y_train, y_test = train_test_split(data_train, labels_train, test_size=0.2, random_state=101)

In [7]:
def _column(pairs, position):
    """Return the element at `position` of every [headline, article] pair, as a list."""
    return [pair[position] for pair in pairs]


# Position 0 is the headline, position 1 the article body.
X_train_headlines, X_train_articles = _column(X_train, 0), _column(X_train, 1)
X_test_headlines, X_test_articles = _column(X_test, 0), _column(X_test, 1)

In [8]:
# Vocabulary cap passed to the tokenizer as `num_words`.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
# Fit the vocabulary on the training headlines and training articles together;
# the held-out X_test_* texts are not used for fitting.
tokenizer.fit_on_texts(X_train_headlines + X_train_articles)

In [9]:
# Map every text to its list of vocabulary indices using the fitted tokenizer.
X_train_headlines_sequence, X_train_articles_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_headlines, X_train_articles)
)

X_test_headlines_sequence, X_test_articles_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_test_headlines, X_test_articles)
)

In [10]:
# Bring every index sequence to a fixed length so it can be batched:
# headlines to 70 positions, articles to 1000 positions.
max_words_headline, max_words_article = 70, 1000

# Headlines (train, then held-out).
X_train_headlines_sequence, X_test_headlines_sequence = (
    sequence.pad_sequences(seqs, maxlen=max_words_headline)
    for seqs in (X_train_headlines_sequence, X_test_headlines_sequence)
)

# Articles (train, then held-out).
X_train_articles_sequence, X_test_articles_sequence = (
    sequence.pad_sequences(seqs, maxlen=max_words_article)
    for seqs in (X_train_articles_sequence, X_test_articles_sequence)
)

## Load the Embeddings

In [11]:
import numpy as np
from tqdm import tqdm

In [12]:
embeddings_path = 'glove.twitter.27B.100d.txt'

# Parse the GloVe text file into {word: float32 vector}. Each line holds a
# token followed by its space-separated coefficients.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse.
with open(embeddings_path, encoding="utf8") as f:
    for line in tqdm(f):
        # Split on a single space only: a bare split() would also break on any
        # other whitespace character that happens to be part of a token.
        values = line.rstrip('\n').split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

1193514it [00:43, 27161.24it/s]


In [13]:
# Number of rows in the embedding matrix; word indices at or above this value
# are skipped when the matrix is filled.
# NOTE(review): this is larger than the tokenizer's max_features (20000) — confirm
# the extra rows are intentional.
max_word_vectors = 25000
# Dimensionality of each word vector; must match the 100d GloVe file loaded above.
embed_dim = 100

In [14]:
# Stack all GloVe vectors to estimate their global mean and standard deviation.
# np.stack requires a real sequence (list/tuple); a dict view is rejected by
# current NumPy releases, so it is materialized as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Start every row from noise with the same statistics as the pretrained vectors,
# so words without a GloVe entry still get a plausible vector.
# NOTE: no random seed is set here, so these rows differ between runs.
embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(),
                                    (max_word_vectors, embed_dim))

In [15]:
# Replace the random row of every vocabulary word that fits in the matrix and
# has a pretrained GloVe vector; all other rows keep their random initialization.
for token, row in tokenizer.word_index.items():
    if row < max_word_vectors:
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[row] = pretrained

## Build Model

In [16]:
from keras.models import Input, Model
from keras.layers import Embedding, LSTM, concatenate, Dense
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

In [17]:
def _encode_text_branch(sequence_length):
    """Build one text branch: Input -> frozen GloVe Embedding -> LSTM -> Dense.

    Args:
        sequence_length: number of token positions the branch accepts.

    Returns:
        Tuple (input, embedded, lstm, dense) with the tensor produced at each stage.
    """
    branch_input = Input(shape=(sequence_length,))
    branch_embedded = Embedding(max_word_vectors, embed_dim, input_length=sequence_length,
                                weights=[embedding_matrix], trainable=False)(branch_input)
    branch_lstm = LSTM(100, dropout=0.2, recurrent_dropout=0.2)(branch_embedded)
    branch_dense = Dense(100, activation='relu')(branch_lstm)
    return branch_input, branch_embedded, branch_lstm, branch_dense


# The article branch is created first, then the headline branch.
article_input, article_emedd, article_lstm, article_dense = _encode_text_branch(max_words_article)
headline_input, headline_emedd, headline_lstm, headline_dense = _encode_text_branch(max_words_headline)

# Merge both encodings and score them with a single sigmoid unit.
concat_dense = concatenate(inputs=[headline_dense, article_dense])
output = Dense(1, activation='sigmoid')(concat_dense)

model = Model(inputs=[headline_input, article_input], outputs=[output])

In [18]:
# The model ends in a single sigmoid unit, hence the binary cross-entropy loss.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the layer-by-layer architecture and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 70)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 70, 100)      2500000     input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 100)    2500000     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [25]:
# Save the model to glove100d.hdf5 only when the validation loss improves.
best_checkpoint = ModelCheckpoint(filepath="glove100d.hdf5", monitor='val_loss', save_best_only=True)
# Reduce the learning rate on a plateau, with a patience of one epoch.
lr_on_plateau = ReduceLROnPlateau(patience=1)

# Inputs are passed in the same order as the model's inputs: headlines, then articles.
model.fit(
    x=[X_train_headlines_sequence, X_train_articles_sequence],
    y=y_train,
    batch_size=512,
    epochs=20,
    shuffle=True,
    validation_split=0.2,
    callbacks=[best_checkpoint, lr_on_plateau],
    verbose=1)

Train on 11702 samples, validate on 2926 samples
Epoch 1/1


<keras.callbacks.History at 0x195644ebd30>