In [1]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Softmax, Dropout
from keras import backend as K
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers

import numpy as np
import numpy.random as random
from nltk.tokenize import word_tokenize

from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
import pandas as pd
import numpy as np 
import math

Using TensorFlow backend.


In [2]:
# Placeholder that replaces every token not kept in the vocabulary.
UNK_TOKEN = '__unk__'
# Length every index sequence is padded or truncated to.
MAX_SEQUENCE_LENGTH = 50
# Number of most frequent training tokens kept in the vocabulary
# (the unknown and padding entries are added on top of this).
VOCAB_SIZE = 1500




In [3]:
# Load fake.csv (from the Kaggle dataset); the path is relative to the
# working directory.
data_read = pd.read_csv('fake.csv')

# Shuffle all rows once. A fixed seed keeps the train/val/test membership
# identical across kernel restarts, so reported numbers can be reproduced.
SHUFFLE_SEED = 42
data = data_read.sample(frac=1, random_state=SHUFFLE_SEED)

# Positional boundaries for a 60% / 20% / 20% train / val / test split.
split1 = int(math.ceil(len(data)*0.6))
split2 = int(math.ceil(len(data)*0.8))
train = data[:split1]
# NOTE(review): the `+1` offsets leave out the rows at positions split1 and
# split2. They are kept unchanged here because other cells may slice labels
# with the same boundaries; verify those before removing the offsets.
val = data[split1+1:split2]
test = data[split2+1:]

print(data['type'].unique())
print(len(val))
print(len(train))
print(len(test))

['bias' 'bs' 'conspiracy' 'satire' 'hate' 'fake' 'junksci' 'state']
2599
7800
2598


# Preprocessing

In [4]:
# Pull the label strings and the headline titles out of every split.
train_label_str, train_data = train['type'].values.tolist(), train['title'].values.tolist()
val_label_str, val_data = val['type'].values.tolist(), val['title'].values.tolist()
test_label_str, test_data = test['type'].values.tolist(), test['title'].values.tolist()
type_d = data['type'].unique()

# Fixed mapping from article type to integer class id.
emotions = {
    "bias":0,
    "conspiracy":1,
    "fake":2,
    "bs":3,
    "satire":4,
    "hate":5,
    "junksci":6,
    "state":7
}

# Fail loudly on a type that has no class id. Skipping such rows silently
# would make the label list shorter than the data and shift every following
# label against its row.
unknown_types = [t for t in type_d if t not in emotions]
if unknown_types:
    raise ValueError("No class id defined for type(s): %r" % (unknown_types,))

# Class ids for the whole (shuffled) frame, in row order.
myclass = [emotions[t] for t in data['type'].values.tolist()]
# Reverse lookup (class id -> type name) for the types present in the data.
mylabels = {emotions[t]: t for t in type_d}

# Encode each split from its own label strings so the labels are aligned
# with the split's rows by construction.
train_labels = [emotions[t] for t in train_label_str]
val_labels = [emotions[t] for t in val_label_str]
test_labels = [emotions[t] for t in test_label_str]
print(len(train_labels))

7800


In [5]:

# Preprocess data
def preprocess(data):
    """Tokenize and lowercase every entry of `data`.

    Args:
        data: iterable of titles. Entries may be byte strings, unicode
            strings, or any other object (converted with `str`, so a
            missing value such as NaN becomes the token "nan").

    Returns:
        A list with one list of tokens per entry, produced by
        `word_tokenize` on the stripped, lowercased text.
    """
    text_type = type(u'')

    def _to_text(value):
        # Byte strings (the Python 2 `str` type) are decoded as UTF-8;
        # text is passed through untouched. Calling `.decode` on every
        # value only works on Python 2 byte strings.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        if isinstance(value, text_type):
            return value
        text = str(value)
        return text.decode('utf-8') if isinstance(text, bytes) else text

    # Tokenize (separate by space, punctuation etc) and lowercase all data
    return [word_tokenize(_to_text(t).strip().lower()) for t in data]

# Tokenize the three splits in train, val, test order.
train_data, val_data, test_data = [
    preprocess(titles) for titles in (train_data, val_data, test_data)
]
# Show the first tokenized training title as a sanity check.
print(train_data[:1])

[[u'kenyan', u'refugee', u'kills', u'co-worker', u',', u'self']]


In [6]:
# Prepare vocabulary: count how often each token occurs in the training split
full_vocab = dict()
for instance in train_data:
    for token in instance:
        full_vocab[token] = 1 + full_vocab.get(token, 0)

# Sort vocabulary by occurrence, most frequent first
sorted_vocab = sorted(full_vocab.keys(), key=lambda word: -full_vocab[word])

# Print some samples. Slices are used instead of ten fixed positions so the
# report also works when fewer than 10 distinct tokens exist.
print("Vocabulary size: %d"%(len(sorted_vocab)))
print("Most frequent tokens")
for word in sorted_vocab[:10]:
    print("\t%s: %d"%(word, full_vocab[word]))
print("Least frequent tokens")
# Walk backwards from the end: last entry first, at most 10 entries.
for word in sorted_vocab[:-11:-1]:
    print("\t%s: %d"%(word, full_vocab[word]))

# We can choose to limit the vocab_size here to only a portion of the original vocab,
# i.e. ignore infrequent tokens to save on memory
vocab_size = VOCAB_SIZE

# Create final vocab: token -> index and the reverse lookup
word2idx = {w: idx for idx, w in enumerate(sorted_vocab[:vocab_size])}
idx2word = {idx: w for idx, w in enumerate(sorted_vocab[:vocab_size])}

# The unknown-token placeholder gets the index right after the kept tokens,
# and the vocabulary size grows by one to account for it.
word2idx[UNK_TOKEN] = vocab_size
idx2word[vocab_size] = UNK_TOKEN
vocab_size = vocab_size + 1

Vocabulary size: 14331
Most frequent tokens
	the: 2546
	:: 1928
	to: 1924
	’: 1843
	,: 1448
	of: 1412
	in: 1201
	trump: 1052
	s: 952
	a: 926
Least frequent tokens
	corey: 1
	irony: 1
	yes…: 1
	11/4/16: 1
	zenit: 1
	latakia: 1
	الكويتي: 1
	google/amazon: 1
	lui-même: 1
	cliff: 1


In [7]:
def _mask_unknown(instances):
    """Return `instances` with every token missing from word2idx replaced by UNK_TOKEN."""
    masked = []
    for tokens in instances:
        masked.append([tok if tok in word2idx else UNK_TOKEN for tok in tokens])
    return masked


def _unknown_counts(instances):
    """Return (number of UNK_TOKEN tokens, total number of tokens) for `instances`."""
    n_unknown = sum(1 for tokens in instances for tok in tokens if tok == UNK_TOKEN)
    n_total = sum(len(tokens) for tokens in instances)
    return (n_unknown, n_total)


train_data = _mask_unknown(train_data)
val_data = _mask_unknown(val_data)
test_data = _mask_unknown(test_data)

# Report how much of each split fell outside the vocabulary.
print("Number of tokens filtered out as unknown:")
print("Train: %d/%d"%_unknown_counts(train_data))
print("val: %d/%d"%_unknown_counts(val_data))
print("Test: %d/%d"%_unknown_counts(test_data))

Number of tokens filtered out as unknown:
Train: 23892/92733
val: 8563/31434
Test: 8620/31172


In [8]:
## data_to_tensor
# Given a list of instances, where each instance is a list of tokens,
# this function does the following:
# 1: Replace each token with its corresponding index (tokens missing from
#       word2idx fall back to the UNK_TOKEN index)
# 2: Pad sequences to MAX_SEQUENCE_LENGTH (or truncate them if longer),
#       both at the front of the sequence.
#       Padding is done with PAD_IDX, the index right after the UNK_TOKEN
#       index, which no vocabulary entry occupies.
# 3: Package everything nicely as a NUM_INSTANCES x MAX_SEQUENCE_LENGTH matrix

# The padding index is derived from the UNK index instead of being read from
# the mutable `vocab_size`, so re-running this cell or calling data_to_tensor
# later always pads with the same value.
PAD_IDX = word2idx[UNK_TOKEN] + 1

def data_to_tensor(data):
    """Convert tokenized instances into a padded int32 index matrix.

    Args:
        data: list of instances, each a list of tokens.

    Returns:
        The result of `pad_sequences`: one row per instance, padded or
        truncated at the front to MAX_SEQUENCE_LENGTH, filled with PAD_IDX.
    """
    unk_idx = word2idx[UNK_TOKEN]
    # First convert from words to indices
    idx_data = [[word2idx.get(t, unk_idx) for t in instance] for instance in data]

    # Create numpy representation
    return pad_sequences([np.array(d) for d in idx_data], maxlen=MAX_SEQUENCE_LENGTH, dtype='int32', padding='pre', truncating='pre', value=PAD_IDX)

# num_classes is passed explicitly so every split gets the same number of
# one-hot columns even when a split does not contain the highest class id.
X_train = data_to_tensor(train_data)
y_train = to_categorical(np.array(train_labels, dtype=int), num_classes=len(emotions))
print(np.unique(train_labels))

X_val = data_to_tensor(val_data)
y_val = to_categorical(np.array(val_labels, dtype=int), num_classes=len(emotions))
print(np.unique(val_labels))
X_test = data_to_tensor(test_data)
y_test = to_categorical(np.array(test_labels, dtype=int), num_classes=len(emotions))
print(np.unique(test_labels))

# Total number of indices: kept tokens + UNK + padding. Assigned from PAD_IDX
# (not incremented) so running the cell again gives the same value.
vocab_size = PAD_IDX + 1
print(vocab_size)

[0 1 2 3 4 5 6 7]
[0 1 2 3 4 5 6 7]
[0 1 3 4 5 6 7]
1502


In [9]:
def idx_to_bow(data, num_features=None):
    """Convert a matrix of token indices into binary bag-of-words rows.

    Args:
        data: 2-D integer array; row i holds the token indices of instance i.
        num_features: width of the output. Defaults to the notebook-level
            `vocab_size`, looked up when the function is called.

    Returns:
        A float array of shape (data.shape[0], num_features) in which entry
        [i, k] is 1 if index k occurs anywhere in row i of `data`, else 0.
    """
    if num_features is None:
        num_features = vocab_size
    data_bow = np.zeros((data.shape[0], num_features))
    # A column of row numbers broadcasts against `data`, so every
    # (row, token index) pair is marked in a single assignment.
    row_ids = np.arange(data.shape[0])[:, np.newaxis]
    data_bow[row_ids, data] = 1
    return data_bow

# Bag-of-words matrices for the three splits, built in train, val, test order.
X_train_bow, X_val_bow, X_test_bow = [
    idx_to_bow(index_matrix) for index_matrix in (X_train, X_val, X_test)
]
# Number of target classes.
print(len(emotions))

8


In [10]:
# Bag-of-words classifier: one L2-regularized hidden layer with dropout,
# followed by a linear layer and a softmax over the classes.
model = Sequential([
    Dense(25, input_shape=(vocab_size,), activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.25),
    Dense(len(emotions)),
    Softmax(),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 25)                37575     
_________________________________________________________________
dropout_1 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 208       
_________________________________________________________________
softmax_1 (Softmax)          (None, 8)                 0         
Total params: 37,783
Trainable params: 37,783
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train on the bag-of-words features, evaluating on the validation split
# after every epoch.
model.fit(
    x=X_train_bow,
    y=y_train,
    epochs=100,
    validation_data=(X_val_bow, y_val),
)

Train on 7800 samples, validate on 2599 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x103ccc8d0>