**Note: this notebook was executed on Google Colab**

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
import time

In [0]:
from nltk import word_tokenize
from spacy.lang.en import English

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dropout, Bidirectional, BatchNormalization, Flatten, Dense, CuDNNLSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

Using TensorFlow backend.


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [0]:
import pickle

In [0]:
RANDOM_STATE = 5

## Load embeddings

Glove embeddings source: https://www.kaggle.com/watts2/glove6b50dtxt

In [0]:
EMBEDDINGS_DIM=50

In [0]:
#generate from glove text file
word_to_idx = {}
idx_to_word = {}
idx_to_vector = {}
f = open('/content/drive/My Drive/hackthenews/glove.6B.50d.txt',encoding='utf8')
for i,line in enumerate(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_to_idx[word] = i
    idx_to_word[i] = word
    idx_to_vector[i] = coefs
f.close()

In [0]:
n = len(word_to_idx)
word_to_idx['UNK'] = n
idx_to_word[n] = 'UNK'
idx_to_vector[n] = np.zeros(EMBEDDINGS_DIM)

In [0]:
n = len(word_to_idx)
word_to_idx['URL'] = n
idx_to_word[n] = 'URL'
idx_to_vector[n] = np.zeros(EMBEDDINGS_DIM)

In [0]:
VOCAB_SIZE = len(word_to_idx)
VOCAB_SIZE

400002

In [0]:
with open('/content/drive/My Drive/hackthenews/word_to_idx.pickle', 'wb') as handle:
    pickle.dump(word_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/content/drive/My Drive/hackthenews/idx_to_word.pickle', 'wb') as handle:
    pickle.dump(idx_to_word, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/content/drive/My Drive/hackthenews/idx_to_vector.pickle', 'wb') as handle:
    pickle.dump(idx_to_vector, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
#or load from saved pickles
with open('/content/drive/My Drive/hackthenews/word_to_idx.pickle', 'rb') as handle:
    word_to_idx = pickle.load(handle)
    
with open('/content/drive/My Drive/hackthenews/idx_to_word.pickle', 'rb') as handle:
    idx_to_word = pickle.load(handle)
    
with open('/content/drive/My Drive/hackthenews/idx_to_vector.pickle', 'rb') as handle:
    idx_to_vector = pickle.load(handle)
    
VOCAB_SIZE = len(word_to_idx)

In [0]:
VOCAB_SIZE

400002

In [0]:
#explore embeddings
for i,j in enumerate(word_to_idx.items()):
    print(j[0])
    if i>100:
        print(len(j[1]))
        break

the
,
.
of
to
and
in
a
"
's
for
-
that
on
is
was
said
with
he
as
it
by
at
(
)
from
his
''
``
an
be
has
are
have
but
were
not
this
who
they
had
i
which
will
their
:
or
its
one
after
new
been
also
we
would
two
more
'
first
about
up
when
year
there
all
--
out
she
other
people
n't
her
percent
than
over
into
last
some
government
time
$
you
years
if
no
world
can
three
do
;
president
only
state
million
could
us
most
_
against
u.s.
so
them


TypeError: ignored

## Loading and Processing Training Data

In [0]:
train_set_file_name = '/content/drive/My Drive/hackthenews/datasets/task1/task1.train.txt'

In [0]:
articles_id, articles_content, gold_labels = ([], [], [])
with open(train_set_file_name, "r") as f:
        for line in f.readlines():
            article_content, article_id, gold_label = line.rstrip().split("\t")
            articles_id.append(article_id)
            articles_content.append(article_content)
            gold_labels.append(gold_label)
print("Number of documents in the training set: %d"%(len(articles_content)))

Number of documents in the training set: 35993


In [0]:
train = pd.DataFrame({'id':articles_id, 'text': articles_content, 'target': gold_labels})

In [0]:
train.shape

(35993, 3)

In [0]:
train.head()

Unnamed: 0,id,target,text
0,727600136,non-propaganda,"Et tu, Rhody? A recent editorial in the Provi..."
1,731714618,non-propaganda,A recent post in The Farmington Mirror — our t...
2,731714635,non-propaganda,"President Donald Trump, as he often does while..."
3,728627182,non-propaganda,"February is Black History Month, and nothing l..."
4,728627443,non-propaganda,"The snow was so heavy, whipped up by gusting w..."


In [0]:
def process_text(text):
    parser = English()
    tokens = []
    for token in parser(text):
        if token.orth_.isspace():
            continue
        elif token.like_url:
            tokens.append('URL')
        else:
            tokens.append(token.lower_)
    #tokens = np.vectorize(lambda x: word_to_idx.get(x, word_to_idx['UNK']))(tokens) #returns np array
    tokens = list(map(lambda x: word_to_idx.get(x, word_to_idx['UNK']), tokens)) #returns list
    return tokens

In [0]:
def process_df(df):
    df['target'] = df['target'].map({'propaganda':1,'non-propaganda':0})
    df['text']=df['text'].apply(process_text)
    df['len']= df['text'].apply(lambda x: len(x))
    return df

In [0]:
train = process_df(train)

In [0]:
with open('/content/drive/My Drive/hackthenews/task1_train_spacy.pickle', 'wb') as handle:
    pickle.dump(train.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# or reload preprocessed dict from saved pickle
with open('/content/drive/My Drive/hackthenews/task1_train_spacy.pickle', 'rb') as handle:
    train = pd.DataFrame(pickle.load(handle))

In [0]:
train.shape

(35993, 4)

In [0]:
train.tail(15)

Unnamed: 0,id,target,text,len
35978,784555056,1,"[438, 82, 49, 0, 1084, 3, 0, 136, 98, 2035, 1,...",1003
35979,784555422,1,"[6, 396, 709, 1, 7, 742, 11, 321, 358, 3, 868,...",861
35980,784556485,1,"[109, 3, 95, 6, 0, 139, 33, 3128, 162, 2714, 6...",881
35981,784855236,1,"[6, 29, 962, 17, 6431, 172, 1744, 11, 6, 11, 2...",308
35982,784855886,1,"[2855, 64, 1145, 21, 924, 20487, 76, 147, 1, 4...",1179
35983,785266460,1,"[400001, 3126, 799, 23024, 31, 29, 2048, 4522,...",218
35984,785149287,1,"[1773, 827, 1, 3832, 346, 45, 41, 913, 521, 89...",877
35985,785429194,1,"[0, 341, 14, 36, 767, 89, 20, 14, 69, 38, 32, ...",3624
35986,785429583,1,"[0, 50, 196, 246, 1827, 4, 50150, 1003, 6, 0, ...",1468
35987,785430430,1,"[1408, 20, 784, 191, 19320, 2, 81, 303, 4, 146...",671


In [0]:
print(train['len'].sum(),"training tokens")

24359033 training tokens


In [0]:
print(train['target'].sum()/len(train))

0.11171616703247854


In [0]:
train['len'].describe()

count    35993.000000
mean       676.771400
std        590.370924
min          9.000000
25%        319.000000
50%        533.000000
75%        879.000000
max      20125.000000
Name: len, dtype: float64

## MODEL

### Get X,y

In [0]:
SEQUENCE_LENGTH = 1000

In [0]:
def get_X(df, seq_len, unk_idx):
    X = pad_sequences(df['text'].values, maxlen=seq_len, padding='post', truncating='post', value=unk_idx)
    return X

In [0]:
def get_Xy(df, seq_len, unk_idx):
    X = get_X(df, seq_len, unk_idx)
    y = df['target'].values
    return X,y

In [0]:
X, y = get_Xy(train, SEQUENCE_LENGTH, word_to_idx['UNK'])

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

In [0]:
print(y_train.sum())
print(y_valid.sum())

3206
815


### Model

In [0]:
embedding_matrix = np.zeros((VOCAB_SIZE,EMBEDDINGS_DIM))
for i,vector in idx_to_vector.items():
    embedding_matrix[i]=vector

In [0]:
embedding_layer = Embedding(VOCAB_SIZE,
                            EMBEDDINGS_DIM,
                            weights=[embedding_matrix],
                            input_length=SEQUENCE_LENGTH,
                            trainable=True)

In [0]:
#'64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 3 epochs'
inp = Input(shape=(SEQUENCE_LENGTH,))
x = embedded_sequences = embedding_layer(inp)
x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x) 
x = BatchNormalization()(x)
x = Dropout(0.2)(x) 
x = Bidirectional(CuDNNLSTM(2, return_sequences=True))(x) 
x = BatchNormalization()(x)
x = Flatten()(x)
#x = Dropout(0.2)(x) 
#x = Bidirectional(CuDNNLSTM(32))(x) 
#x = BatchNormalization()(x)
#x = Dropout(0.2)(x) 
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 50)          20000100  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 1000, 128)         59392     
_________________________________________________________________
batch_normalization_3 (Batch (None, 1000, 128)         512       
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000, 128)         0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 1000, 4)           2112      
_________________________________________________________________
batch_normalization_4 (Batch (None, 1000, 4)           16        
__________

In [0]:
curr_time = str(int(time.time()))
check_point = ModelCheckpoint('/content/drive/My Drive/hackthenews/checkpoints/model'+curr_time+'.hdf5',verbose=True, save_best_only=True)
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=True, restore_best_weights=True)
tensorboard_cb = TensorBoard(log_dir='/Graph', histogram_freq=0, write_graph=True, write_images=True)
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_valid,y_valid), verbose=1, callbacks=[early_stop, tensorboard_cb, check_point])

Train on 28794 samples, validate on 7199 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.53584, saving model to /content/drive/My Drive/hackthenews/checkpoints/model1553766134.hdf5
Epoch 2/5

Epoch 00002: val_loss improved from 0.53584 to 0.33829, saving model to /content/drive/My Drive/hackthenews/checkpoints/model1553766134.hdf5
Epoch 3/5

Epoch 00003: val_loss improved from 0.33829 to 0.13936, saving model to /content/drive/My Drive/hackthenews/checkpoints/model1553766134.hdf5
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.13936
Epoch 5/5
Restoring model weights from the end of the best epoch

Epoch 00005: val_loss did not improve from 0.13936
Epoch 00005: early stopping


<keras.callbacks.History at 0x7fa406f0d668>

In [0]:
!tensorboard --logdir path_to_current_dir/Graph 

In [0]:
#model.load_weights('/content/drive/My Drive/hackthenews/checkpoints/model1548528103.hdf5')

In [0]:
yhat_train = model.predict(X_train)
yhat_valid = model.predict(X_valid)

In [0]:
yhat_train_b = (yhat_train >= 0.5).astype(int)
print(yhat_train_b.sum())
yhat_valid_b = (yhat_valid >= 0.5).astype(int)
print(yhat_valid_b.sum())

3258
816


In [0]:
def model_scorer(y, yhat, scores=None, prefix="", metrics_dict=None):
    if metrics_dict is None:
        metrics_dict = {'acc': accuracy_score,
                     'precision': precision_score,
                     'recall': recall_score,
                     'f1': f1_score}
    scores_dict = {}

    for metric, fn in metrics_dict.items():
        scores_dict[prefix+metric] = fn(y, yhat)
        print(metric,': ',scores_dict[prefix+metric])

    if scores is None:
        scores = scores_dict
    else:
        scores.update(scores_dict)

    return scores

In [0]:
scores = {'desc': '(model 10 with spacy tokens) 64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 5 epochs'}
scores['model_checkpoint'] = curr_time
print('Training..')
scores = model_scorer(y_train, yhat_train_b, scores, prefix='train_')
print()
print('Validation..')
scores = model_scorer(y_valid, yhat_valid_b, scores, prefix='valid_')

Training..
acc :  0.9938876154754462
precision :  0.9650092081031307
recall :  0.9806612601372426
f1 :  0.9727722772277227

Validation..
acc :  0.9615224336713433
precision :  0.8296568627450981
recall :  0.8306748466257668
f1 :  0.8301655426118946


In [0]:
scores

{'desc': '(model 10 with spacy tokens) 64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 5 epochs',
 'model_checkpoint': '1553766134',
 'train_acc': 0.9938876154754462,
 'train_f1': 0.9727722772277227,
 'train_precision': 0.9650092081031307,
 'train_recall': 0.9806612601372426,
 'valid_acc': 0.9615224336713433,
 'valid_f1': 0.8301655426118946,
 'valid_precision': 0.8296568627450981,
 'valid_recall': 0.8306748466257668}

In [0]:
scores_df = pd.read_csv('/content/drive/My Drive/hackthenews/new_scores.csv', index_col=0)
scores_df = scores_df.append(pd.Series(scores).rename(len(scores_df)))
scores_df.to_csv('/content/drive/My Drive/hackthenews/new_scores.csv')

In [0]:
scores_df = pd.read_csv('/content/drive/My Drive/hackthenews/new_scores.csv', index_col=0)
scores_df

Unnamed: 0,desc,model_checkpoint,train_acc,train_f1,train_precision,train_recall,valid_acc,valid_f1,valid_precision,valid_recall
0,"(model 10 with spacy tokens) 64u BLSTM layer, ...",1553766134,0.993888,0.972772,0.965009,0.980661,0.961522,0.830166,0.829657,0.830675


In [0]:
for i in range(len(scores_df)):
  print(scores_df['desc'].iloc[i])

(model 10 with spacy tokens) 64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 5 epochs


## Dev Set

In [0]:
dev_set_file_name = '/content/drive/My Drive/hackthenews/datasets/dev/task1/task1.dev.txt'

In [0]:
dev_articles_content, dev_articles_id = ([], [])
with open(dev_set_file_name) as f:
    for line in f.readlines():
            article_content, article_id, gold_label = line.rstrip().split("\t")
            dev_articles_content.append(article_content)
            dev_articles_id.append(article_id)

In [0]:
dev = pd.DataFrame({'id': dev_articles_id, 'text':dev_articles_content})

In [0]:
dev['target'] = [np.NaN]*len(dev)

In [0]:
dev.shape

(5116, 3)

In [0]:
dev.head()

Unnamed: 0,id,text,target
0,200017,"Building a quick beat, the North Chicago poet ...",
1,200036,If you needed a further reminder of how Metro ...,
2,200038,Taylor Swift famously withheld her back catalo...,
3,200086,May 14 squall line on radar. (Radarscope) Mon...,
4,200113,A pre-emptive attack by the United States agai...,


In [0]:
dev = process_df(dev)

In [0]:
dev.head()

Unnamed: 0,id,text,target,len
0,200017,"[447, 7, 2582, 960, 1, 0, 193, 1147, 4819, 136...",,844
1,200036,"[83, 81, 911, 7, 489, 8889, 3, 197, 5083, 4859...",,243
2,200038,"[2485, 6596, 11689, 13147, 71, 137, 13257, 25,...",,1075
3,200086,"[107, 657, 55053, 331, 13, 5366, 2, 23, 400000...",,1141
4,200113,"[7, 100893, 436, 21, 0, 104, 112, 98, 193, 574...",,497


In [0]:
with open('/content/drive/My Drive/hackthenews/task1_dev.pickle', 'wb') as handle:
    pickle.dump(dev.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# or reload preprocessed dict from saved pickle
with open('/content/drive/My Drive/hackthenews/task1_dev.pickle', 'rb') as handle:
    dev = pd.DataFrame(pickle.load(handle))

In [0]:
X_dev = get_X(dev, SEQUENCE_LENGTH, word_to_idx['UNK'])

In [0]:
yhat_dev = model.predict(X_dev)

In [0]:
yhat_dev_b = (yhat_dev>=0.5).astype(int)
print(yhat_dev_b.sum())

578


In [0]:
labels = {1:'propaganda', 0:'non-propaganda'}
with open("/content/drive/My Drive/hackthenews/task1-dev-predictions.txt", "w") as fout:
    for i, idx in enumerate(dev['id']):
        fout.write("%s\t%s\n" % (idx, labels[yhat_dev_b[i][0]]))
