In [16]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

from tensorflow.keras.utils import to_categorical
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation, Flatten, LSTM, Bidirectional
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [None]:
# Tunable hyperparameters, gathered in one place so experiments only touch this cell.
MAX_SENT_LEN = 200        # every sequence is padded/truncated to this many tokens
MAX_VOCAB_SIZE = 100_000  # handed to the Tokenizer as num_words
EMBEDDING_DIM = 100       # width of one embedding row
BATCH_SIZE = 1_000        # used for both training and evaluation
N_EPOCHS = 20             # number of training epochs
DROPOUT = 1e-4            # rate given to the Dropout layer
L2 = 1e-4                 # only referenced by a commented-out regularizer

In [None]:
# Load the pre-built train / test tables from the FNC-1 baseline data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../fnc-1-baseline-master/data/train.csv',
        '../fnc-1-baseline-master/data/test.csv',
    )
)

In [80]:
# One-hot encode the stance labels for the softmax output layer.
# num_classes is pinned so both splits always get the same number of columns,
# even if one of them happens not to contain the highest-numbered class
# (to_categorical otherwise infers the width from the largest label it sees).
# Assumes 'Stance' holds integer class ids in 0..3 -- TODO confirm against the CSVs.
N_CLASSES = 4  # matches the 4-unit softmax output layer of the model
train_labels = to_categorical(np.array(train['Stance']), num_classes=N_CLASSES)
test_labels = to_categorical(np.array(test['Stance']), num_classes=N_CLASSES)

In [81]:
# Build the vocabulary from the training text only, then map both splits
# to integer sequences using that same vocabulary.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train['combinedText'])
train_sequence, test_sequence = (
    tokenizer.texts_to_sequences(frame['combinedText'])
    for frame in (train, test)
)

In [82]:
# Bring every sequence to exactly MAX_SENT_LEN tokens: short ones are padded
# at the end, long ones are cut at the end.
pad_options = dict(maxlen=MAX_SENT_LEN, padding='post', truncating='post')
train_pad = pad_sequences(train_sequence, **pad_options)
test_pad = pad_sequences(test_sequence, **pad_options)

# pad sequences manually
# def preprocessing(headlines, bodies, stances, tokenizer):
#     # Convert the sequence of words to a sequence of indices
#     X = tokenizer.texts_to_sequences([' '.join((headline + "<>" + body)[:MAX_SENT_LEN]) for headline in headlines for body in bodies])
#     X = pad_sequences(X, maxlen=MAX_SENT_LEN, padding='post', truncating='post')
#     return X, y

# X_train, y_train = preprocessing(headlines, bodies, stance, tokenizer)

In [61]:
# Convert the GloVe text file to word2vec text format, then load the vectors.
in_file = '../fnc-1-baseline-master/data/glove_wiki/glove.6B.100d.txt'
out_file = '../fnc-1-baseline-master/data/glove_wiki/glove.6B.100d.word2vec.txt'

# The conversion rewrites the whole embedding file, so skip it when an earlier
# run already produced the output. Delete out_file to force a re-conversion.
if not os.path.exists(out_file):
    glove2word2vec(in_file, out_file)
w2v = KeyedVectors.load_word2vec_format(out_file, binary=False)


  after removing the cwd from sys.path.


In [34]:
# inp = '../fnc-1-baseline-master/data/glove_twitter/glove.27B.200d.txt'
# outp = '../fnc-1-baseline-master/data/glove_twitter/glove.27B.200d.word2vec.txt'

# glove2word2vec(inp, outp)
# w2v_twitter = KeyedVectors.load_word2vec_format(outp, binary=False)


In [83]:
# Build the (vocab_size + 1, EMBEDDING_DIM) matrix of pre-trained vectors,
# one row per tokenizer index.
vocab = tokenizer.word_index.keys()

# Fixed seed so the vectors of out-of-vocabulary words are identical on every run.
oov_rng = np.random.default_rng(42)

# Add one because index 0 is reserved and isn't assigned to any word
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
# Every row (including the reserved row 0) starts as a uniform [0, 1) draw;
# rows of words present in the pre-trained vectors are overwritten below.
embedding_matrix = oov_rng.random((len(vocab) + 1, EMBEDDING_DIM))

# Use the index stored in word_index rather than the enumeration position, so the
# row always matches the id that texts_to_sequences emits for the word.
for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = w2v[word]
    except KeyError:
        # Word has no pre-trained vector: keep its random initialisation.
        pass
        
# from a3
# embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, EMBEDDING_DIM))   
# for word, i in tokenizer.word_index.items(): # i=0 is the embedding for the zero padding
#     try:
#         embeddings_vector = word_embeddings[word]
#     except KeyError:
#         embeddings_vector = None
#     if embeddings_vector is not None:
#         embeddings_matrix[i] = embeddings_vector


In [84]:
# Hold out 15% of the training examples for validation (fixed seed for a repeatable split).
# NOTE: this rebinds train_pad / train_labels, so re-running the cell on its own
# splits the already-reduced training set again.
train_pad, val_pad, train_labels, val_labels = train_test_split(
    train_pad,
    train_labels,
    test_size=0.15,
    random_state=42,
)


In [45]:
# Sanity check: number of examples held out for validation by the split above.
len(val_labels)

6372

In [85]:
# Reset Keras' global state before (re)building the model.
keras.backend.clear_session()

# Bidirectional LSTM classifier on top of frozen pre-trained word embeddings.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,  # +1 for the reserved index 0
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SENT_LEN,
    weights=[embedding_matrix],
    trainable=False,
    name='word_embedding_layer',
    mask_zero=True,
)

model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(120, return_sequences=False)),
    # The LSTM already returns one vector per example, so this Flatten leaves the shape unchanged.
    Flatten(),
    Dropout(DROPOUT),
    Dense(4, activation='softmax', name='softmax_output_layer'),
])

# Earlier variant of the output layer, kept for reference:
# model.add(Dense(2, activation = 'softmax', name='softmax_output_layer', activity_regularizer=l2(L2)))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding_layer (Embed  (None, 30, 100)          2787600   
 ding)                                                           
                                                                 
 bidirectional (Bidirectiona  (None, 240)              212160    
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 240)               0         
                                                                 
 dropout (Dropout)           (None, 240)               0         
                                                                 
 softmax_output_layer (Dense  (None, 4)                964       
 )                                                               
                                                        

In [86]:
# Train the model, scoring it on the validation split after every epoch.
fit = model.fit(
    x=train_pad,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(val_pad, val_labels),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [87]:
 _, accuracy = model.evaluate(test_pad, test_labels, batch_size=BATCH_SIZE)
print("Test Set Accuracy = {:.4f}".format(accuracy))

Test Set Accuracy = 0.7074


In [None]:
### Glove 200: 65%
### Twitter 200: 62%

### Max sentence length = 100
### Twitter 200: 62%, VAL ACCURACY ~87%
### Glove 200: 70%, val accuracy 82-85%

### Max vocab size
### Glove 200: 69.38%, val accuracy 82-88%
### Glove 100: 70%

### Bidirectional LSTM
### Glove 100: 72.8%, val accuracy 89-94%

### Max sentence length = 50
### Glove 100: 72.8%

### Batch Size = 2000 (previously 250)
### Glove 100: 73.46%

### Batch size = 500
### Glove 100: 73.9%

### Batch size = 100
### Glove 100: 73.5%

### Epochs = 20
### Glove 100: 73.9%

### Max sentence length = 30
### Glove 100: 70.7%
