# Set up

In [1]:
import pandas as pd
import numpy as np
import json
import pickle
import os

# Location of the claims metadata JSON, relative to the notebook's working directory.
METADATA_FILEPATH = '../dataset/metadata.json'

Sources: 

https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

# Load training set

In [2]:
# Read the claims metadata into memory. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default text encoding.
with open(METADATA_FILEPATH, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

In [3]:
# Build a DataFrame from the parsed JSON records (one row per claim).
df = pd.DataFrame(metadata)

In [4]:
# Preview the first 10 rows to check that the columns parsed as expected.
df.head(10)

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6
5,"Rhode Island is ""almost dead last"" among North...",Leonidas Raptakis,2014-02-11,2,"[8284, 3768, 20091, 82368, 73148, 4493]",7
6,The poorest counties in the U.S. are in Appala...,Jim Webb,2014-11-19,1,"[70709, 70708]",8
7,Koch Industries paid the legal fees of George ...,,2013-07-18,0,"[120591, 120592, 127866, 129483]",9
8,"""Minnesota, Michigan, Iowa already have 70 mph...",Robin Vos,2013-08-22,1,"[69547, 80095, 7994, 81116, 77621]",11
9,"""FBI Uniform Crime Report for 2016 shows more ...",Nick Schroer,2017-10-17,1,"[72012, 26005, 43481, 55671]",12


In [5]:
# Total number of rows (claims) loaded.
len(df)

15555

# Preparing the text data

In [6]:
import string
import gensim
from nltk.tokenize import word_tokenize

# Width of each word vector; must match the GloVe file loaded later
# (glove.6B.300d.txt).
EMBEDDING_DIM = 300
# Every encoded claim is padded/truncated to this many word indices.
MAX_SEQUENCE_LENGTH = 1000

In [7]:
# Raw claim strings pulled from the DataFrame, plus the empty container that
# the tokenization loop fills with one cleaned token list per claim.
lines = df['claim'].tolist()
claims = []

In [8]:
# Translation table that deletes every ASCII punctuation character. It does
# not depend on the claim, so it is built once instead of once per iteration.
punctuation_table = str.maketrans('', '', string.punctuation)

# Start from an empty list so that re-running this cell rebuilds `claims`
# instead of appending a second copy of every claim.
claims = []
for line in lines:
    # Tokenize, then normalize to lowercase.
    tokens = [token.lower() for token in word_tokenize(line)]
    # Strip punctuation characters from inside each token.
    stripped = [token.translate(punctuation_table) for token in tokens]
    # Keep purely alphabetic tokens only (drops numbers and emptied tokens).
    claims.append([word for word in stripped if word.isalpha()])

In [9]:
# Sanity check: should equal len(df), i.e. one token list per claim.
len(claims)

15555

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [11]:
# Fit the vocabulary on the cleaned claims, then encode each claim as a list
# of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(claims)
sequences = tokenizer.texts_to_sequences(claims)

In [12]:
# Word -> integer index mapping built by the tokenizer; reused later to lay
# out the rows of the embedding matrix.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 21392 unique tokens.


In [13]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries so they stack
# into a single 2-D array (one row per claim).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
# One-hot encode the integer class labels (one column per class).
labels = to_categorical(np.asarray(df['label'].values))

In [15]:
# Expect (number of claims, MAX_SEQUENCE_LENGTH).
print("Shape of data tensor:", data.shape)

Shape of data tensor: (15555, 1000)


In [16]:
# Expect (number of claims, number of label classes).
print("Shape of label tensor:", labels.shape)

Shape of label tensor: (15555, 3)


In [17]:
# Split the data into a training set and a validation set.
VALIDATION_SPLIT = 0.2  # fraction of samples held out for validation
SPLIT_SEED = 42  # fixed seed so the shuffle, and therefore the split, is reproducible

# Shuffle samples and labels with the same permutation so every row keeps its
# own label. A dedicated RandomState is used so the result does not depend on
# whatever else has consumed NumPy's global random stream.
# NOTE: this overwrites `data` and `labels`; re-run the cells that build them
# before re-running this one if the original order is needed.
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [18]:
# Hold out the last nb_validation_samples shuffled rows for validation and
# train on everything before them. Slicing at an explicit index (rather than
# with a negative offset) stays correct when nb_validation_samples is 0:
# data[:-0] would be empty and data[-0:] would be the whole array.
split_at = data.shape[0] - nb_validation_samples
X_train, X_val = data[:split_at], data[split_at:]
y_train, y_val = labels[:split_at], labels[split_at:]

In [19]:
# Report the shape of each split; the row counts of the training and
# validation parts should add up to the full dataset.
for caption, tensor in (
    ("Shape of X_train tensor:", X_train),
    ("Shape of y_train tensor:", y_train),
    ("Shape of X_val tensor:", X_val),
    ("Shape of y_val tensor:", y_val),
):
    print(caption, tensor.shape)

Shape of X_train tensor: (12444, 1000)
Shape of y_train tensor: (12444, 3)
Shape of X_val tensor: (3111, 1000)
Shape of y_val tensor: (3111, 3)


# Preparing the embedding layer

In [20]:
# Parse the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ...", separated by whitespace.
# The encoding is pinned to UTF-8 because the vocabulary contains non-ASCII
# words; relying on the platform default can fail or mis-decode them.
embeddings_index = {}
with open("../reference/glove.6B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [21]:
# One row of EMBEDDING_DIM zeros per vocabulary word, plus one extra row.
# NOTE(review): the extra row is presumably index 0, which Keras' Tokenizer
# reserves (word indices start at 1) - confirm against the Tokenizer docs.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

In [22]:
# Copy the pre-trained vector of every vocabulary word into its row of the
# embedding matrix. Words missing from the embedding index are skipped, so
# their rows keep the all-zeros initialization.
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [23]:
from keras.layers.embeddings import Embedding

In [24]:
# Embedding layer initialized with the pre-trained matrix and frozen
# (trainable=False), so the word vectors are not updated during training.
vocabulary_rows = len(word_index) + 1
embedding_layer = Embedding(
    input_dim=vocabulary_rows,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Training a 1D convnet

In [25]:
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, Flatten, LSTM, GRU
from keras.layers import Conv1D, MaxPooling1D

In [26]:
# Model input: MAX_SEQUENCE_LENGTH integer word indices per claim, which the
# embedding layer turns into a sequence of word vectors.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

In [47]:
# Convolutional feature extractor over the embedded claim.
# Stages 1 and 2: Conv1D (128 filters, window 5) -> Dropout -> MaxPooling1D(5) -> Dropout.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = Dropout(0.4)(x)
x = MaxPooling1D(5)(x)
x = Dropout(0.4)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.4)(x)
x = MaxPooling1D(5)(x)
x = Dropout(0.4)(x)
# Stage 3: a final convolution followed by a pool over every remaining step.
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.4)(x)
# The pool size is read from the tensor instead of being hard-coded, so the
# pooling stays global if MAX_SEQUENCE_LENGTH changes (it is 35 when the
# sequence length is 1000).
x = MaxPooling1D(int(x.shape[1]))(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)

In [48]:
# Output layer: softmax over 3 units, matching the 3 columns of the one-hot
# encoded labels.
preds = Dense(3, activation='softmax')(x)

In [49]:
# Assemble the network from its input and output tensors and configure it for
# multi-class classification against one-hot targets.
model = Model(inputs=sequence_input, outputs=preds)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [50]:
# Train for 10 epochs in mini-batches of 32, evaluating on the held-out
# validation split after each epoch. The returned History object is kept so
# the per-epoch loss/accuracy (history.history) can be inspected or plotted
# after training instead of being discarded.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=10, batch_size=32)
history

Train on 12444 samples, validate on 3111 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f4632f5b668>