# Set up

In [1]:
import pandas as pd
import numpy as np
import json
import pickle
import os

METADATA_FILEPATH = '../dataset/metadata.json'

Source: https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

# Load training set

In [2]:
with open(METADATA_FILEPATH, 'r') as f:
    claims = json.load(f)

In [3]:
df = pd.DataFrame(claims)

In [4]:
df.head(10)

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2014-02-11,2,"[8284, 3768, 20091, 82368, 73148, 4493]",7
6,The poorest counties in the U.S. are in Appala...,Jim Webb,2014-11-19,1,"[70709, 70708]",8
7,Koch Industries paid the legal fees of George ...,,2013-07-18,0,"[120591, 120592, 127866, 129483]",9
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,2013-08-22,1,"[69547, 80095, 7994, 81116, 77621]",11
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,2017-10-17,1,"[72012, 26005, 43481, 55671]",12


In [5]:
len(df)

15555

In [6]:
# X_train = df.loc[:12444, 'claim'].values.tolist()
# y_train = df.loc[:12444, 'label'].values.tolist()
# X_test = df.loc[12445:, 'claim'].values.tolist()
# y_test = df.loc[12445:, 'label'].values.tolist()

In [7]:
# X_train

In [8]:
# y_train

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [10]:
# tokenizer_obj = Tokenizer()

In [11]:
# total_claims = np.concatenate((X_train, X_test))

In [12]:
# tokenizer_obj.fit_on_texts(total_claims)

In [13]:
# max_length = max([len(s.split()) for s in total_claims])

In [14]:
# max_length

In [15]:
# vocab_size = len(tokenizer_obj.word_index) + 1

In [16]:
# vocab_size

In [17]:
import string
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

EMBEDDING_DIM = 100
max_length = 150

In [18]:
claim_lines = list()
lines = df['claim'].values.tolist()

In [19]:
for line in lines:
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    claim_lines.append(tokens)

In [20]:
len(claim_lines)

15555

In [21]:
# train word2vec model
model = gensim.models.Word2Vec(sentences=claim_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)

In [22]:
# Vocab size
words = list(model.wv.vocab)
print("Vocabularly size: %d" % len (words))

Vocabularly size: 24086


In [24]:
model.wv.most_similar('trump')

[('insulting', 0.8759750723838806),
 ('obama', 0.8573005795478821),
 ('notation', 0.8439675569534302),
 ('analytics', 0.8419066071510315),
 ('vice', 0.8057959079742432),
 ('assad', 0.7952134609222412),
 ('emmy', 0.7938637733459473),
 ('limousine', 0.7933038473129272),
 ('boycotting', 0.782264232635498),
 ('zuma', 0.7754181027412415)]

In [25]:
model.wv.most_similar('president')

[('barack', 0.9336286187171936),
 ('donald', 0.928165078163147),
 ('administration', 0.9179438352584839),
 ('michelle', 0.9096911549568176),
 ('melania', 0.8953635096549988),
 ('wiretapping', 0.8885258436203003),
 ('presidency', 0.8832364082336426),
 ('ivanka', 0.8801191449165344),
 ('cleared', 0.8706070184707642),
 ('blurted', 0.8679291605949402)]

In [26]:
model.wv.most_similar('election')

[('midterm', 0.9479297399520874),
 ('recount', 0.9437222480773926),
 ('2012', 0.9411381483078003),
 ('cycle', 0.9408465623855591),
 ('2008', 0.9364829063415527),
 ('rotc', 0.9358514547348022),
 ('collapsed', 0.9320218563079834),
 ('monday', 0.9263314604759216),
 ('norfolk', 0.9243888854980469),
 ('canals', 0.9207191467285156)]

In [28]:
# Save model
filename = 'claims_embedding_word2vec_clean.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [29]:
embeddings_index = {}
with open("claims_embedding_word2vec_clean.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:])
        embeddings_index[word] = coefs

In [30]:
# Vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(claim_lines)

In [31]:
sequences = tokenizer_obj.texts_to_sequences(claim_lines)

In [32]:
# Pad sequences
word_index = tokenizer_obj.word_index
print("Found %s unique tokens." % len(word_index))

Found 24086 unique tokens.


In [33]:
claim_pad = pad_sequences(sequences, maxlen=max_length)

In [34]:
label = df['label'].values

In [35]:
print("Shape of claim tensor:", claim_pad.shape)

Shape of claim tensor: (15555, 150)


In [36]:
print("Shape of label tensor:", label.shape)

Shape of label tensor: (15555,)


In [37]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

In [38]:
for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector

In [39]:
print(num_words)

24087


In [40]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

In [49]:
# Define model
model = Sequential()
embedding_layer = Embedding(num_words,
                           EMBEDDING_DIM,
                           embeddings_initializer=Constant(embedding_matrix),
                           input_length=max_length,
                           trainable=False)

In [50]:
model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(1, activation='relu'))

In [51]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 100)          2408700   
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,421,501
Trainable params: 12,801
Non-trainable params: 2,408,700
_________________________________________________________________


In [53]:
# Split the data into a training set and a validation set
VALIDATION_SPLIT = 0.2

indices = np.arange(claim_pad.shape[0])
np.random.shuffle(indices)
claim_pad = claim_pad[indices]
label = label[indices]
num_validation_samples = int(VALIDATION_SPLIT * claim_pad.shape[0])

In [54]:
X_train_pad = claim_pad[:-num_validation_samples]
y_train = label[:-num_validation_samples]
X_test_pad = claim_pad[-num_validation_samples:]
y_test = label[-num_validation_samples:]

In [55]:
print("Shape of X_train_pad tensor:", X_train_pad.shape)
print("Shape of y_train tensor:", y_train.shape)

print("Shape of X_test_pad tensor:", X_test_pad.shape)
print("Shape of y_test tensor:", y_test.shape)

Shape of X_train_pad tensor: (12444, 150)
Shape of y_train tensor: (12444,)
Shape of X_test_pad tensor: (3111, 150)
Shape of y_test tensor: (3111,)


In [56]:
model.fit(X_train_pad, y_train, batch_size=50, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Training...
Train on 12444 samples, validate on 3111 samples
Epoch 1/25
 - 43s - loss: 2.1674 - accuracy: 0.5029 - val_loss: 1.0397 - val_accuracy: 0.4738
Epoch 2/25
 - 39s - loss: 1.5069 - accuracy: 0.4782 - val_loss: 1.0231 - val_accuracy: 0.4796
Epoch 3/25
 - 40s - loss: 0.9891 - accuracy: 0.4819 - val_loss: 0.6510 - val_accuracy: 0.4941
Epoch 4/25
 - 40s - loss: 0.7797 - accuracy: 0.4781 - val_loss: 0.6208 - val_accuracy: 0.4632
Epoch 5/25
 - 38s - loss: 0.9777 - accuracy: 0.5033 - val_loss: 1.0217 - val_accuracy: 0.4754
Epoch 6/25
 - 38s - loss: 1.3523 - accuracy: 0.4928 - val_loss: 2.4611 - val_accuracy: 0.4458
Epoch 7/25
 - 38s - loss: 0.9181 - accuracy: 0.4823 - val_loss: 0.7748 - val_accuracy: 0.4616
Epoch 8/25
 - 37s - loss: 0.8325 - accuracy: 0.4743 - val_loss: 0.7741 - val_accuracy: 0.4648
Epoch 9/25
 - 38s - loss: 0.7597 - accuracy: 0.4869 - val_loss: 0.6383 - val_accuracy: 0.4834
Epoch 10/25
 - 37s - loss: 0.6509 - accuracy: 0.4803 - val_loss: 0.6193 - val_accuracy: 0.480

<keras.callbacks.callbacks.History at 0x7f5bca138eb8>