In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
import numpy as np
import pandas as pd
import time

In [0]:
from nltk import word_tokenize

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dropout, Bidirectional, BatchNormalization, Flatten, Dense, CuDNNLSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint, TensorBoard

Using TensorFlow backend.


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [0]:
import pickle

In [0]:
RANDOM_STATE = 5

## Load embeddings

Glove embeddings source: https://www.kaggle.com/watts2/glove6b50dtxt

In [0]:
EMBEDDINGS_DIM=50

In [0]:
#generate from glove text file
word_to_idx = {}
idx_to_word = {}
idx_to_vector = {}
f = open('/content/drive/My Drive/Hackathon-jan2019/glove.6B.50d.txt',encoding='utf8')
for i,line in enumerate(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_to_idx[word] = i
    idx_to_word[i] = word
    idx_to_vector[i] = coefs
f.close()

In [0]:
n = len(word_to_idx)
n

400000

In [0]:
word_to_idx['UNK'] = n
idx_to_word[n] = 'UNK'
idx_to_vector[n] = np.zeros(EMBEDDINGS_DIM)

In [0]:
VOCAB_SIZE = n+1

In [0]:
with open('/content/drive/My Drive/hackthenews/word_to_idx.pickle', 'wb') as handle:
    pickle.dump(word_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/content/drive/My Drive/hackthenews/idx_to_word.pickle', 'wb') as handle:
    pickle.dump(idx_to_word, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/content/drive/My Drive/hackthenews/idx_to_vector.pickle', 'wb') as handle:
    pickle.dump(idx_to_vector, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
#or load from saved pickles
with open('/content/drive/My Drive/hackthenews/word_to_idx.pickle', 'rb') as handle:
    word_to_idx = pickle.load(handle)
    
with open('/content/drive/My Drive/hackthenews/idx_to_word.pickle', 'rb') as handle:
    idx_to_word = pickle.load(handle)
    
with open('/content/drive/My Drive/hackthenews/idx_to_vector.pickle', 'rb') as handle:
    idx_to_vector = pickle.load(handle)
    
VOCAB_SIZE = len(word_to_idx)

In [0]:
VOCAB_SIZE

400001

In [0]:
#explore embeddings
for i,j in enumerate(word_to_idx.items()):
    print(j[0])
    if i>100:
        print(len(j[1]))
        break

the
,
.
of
to
and
in
a
"
's
for
-
that
on
is
was
said
with
he
as
it
by
at
(
)
from
his
''
``
an
be
has
are
have
but
were
not
this
who
they
had
i
which
will
their
:
or
its
one
after
new
been
also
we
would
two
more
'
first
about
up
when
year
there
all
--
out
she
other
people
n't
her
percent
than
over
into
last
some
government
time
$
you
years
if
no
world
can
three
do
;
president
only
state
million
could
us
most
_
against
u.s.
so
them


TypeError: ignored

## Loading and Processing Training Data

In [0]:
train_set_file_name = '/content/drive/My Drive/hackthenews/datasets/task1/task1.train.txt'

In [0]:
articles_id, articles_content, gold_labels = ([], [], [])
with open(train_set_file_name, "r") as f:
        for line in f.readlines():
            article_content, article_id, gold_label = line.rstrip().split("\t")
            articles_id.append(article_id)
            articles_content.append(article_content)
            gold_labels.append(gold_label)
print("Number of documents in the training set: %d"%(len(articles_content)))

Number of documents in the training set: 35993


In [0]:
train = pd.DataFrame({'id':articles_id, 'text': articles_content, 'target': gold_labels})

In [0]:
train.shape

(35993, 3)

In [0]:
train.head()

Unnamed: 0,id,target,text
0,727600136,non-propaganda,"Et tu, Rhody? A recent editorial in the Provi..."
1,731714618,non-propaganda,A recent post in The Farmington Mirror — our t...
2,731714635,non-propaganda,"President Donald Trump, as he often does while..."
3,728627182,non-propaganda,"February is Black History Month, and nothing l..."
4,728627443,non-propaganda,"The snow was so heavy, whipped up by gusting w..."


In [0]:
def process_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    #tokens = np.vectorize(lambda x: word_to_idx.get(x, word_to_idx['UNK']))(tokens) #returns np array
    tokens = list(map(lambda x: word_to_idx.get(x, word_to_idx['UNK']), tokens)) #returns list
    return tokens

In [0]:
def process_df(df):
    df['target'] = df['target'].map({'propaganda':1,'non-propaganda':0})
    df['text']=df['text'].apply(process_text)
    df['len']= df['text'].apply(lambda x: len(x))
    return df

In [0]:
train = process_df(train)

In [0]:
with open('/content/drive/My Drive/hackthenews/task1_train.pickle', 'wb') as handle:
    pickle.dump(train.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# or reload preprocessed dict from saved pickle
with open('/content/drive/My Drive/hackthenews/task1_train.pickle', 'rb') as handle:
    train = pd.DataFrame(pickle.load(handle))

In [0]:
train.tail(15)

Unnamed: 0,id,target,text,len
35978,784555056,1,"[438, 82, 49, 0, 1084, 3, 0, 136, 98, 2035, 1,...",983
35979,784555422,1,"[6, 396, 709, 1, 7, 400000, 358, 3, 868, 119, ...",854
35980,784556485,1,"[109, 3, 95, 6, 0, 139, 33, 3128, 162, 2714, 6...",858
35981,784855236,1,"[6, 29, 962, 17, 6431, 172, 21063, 1984, 78307...",299
35982,784855886,1,"[2855, 64, 1145, 21, 924, 20487, 76, 147, 1, 4...",1181
35983,785266460,1,"[123042, 45, 400000, 3126, 799, 23024, 31, 29,...",209
35984,785149287,1,"[1773, 827, 1, 3832, 346, 45, 41, 913, 521, 89...",869
35985,785429194,1,"[0, 341, 14, 36, 767, 89, 20, 14, 69, 38, 32, ...",3593
35986,785429583,1,"[0, 50, 196, 246, 1827, 4, 50150, 1003, 6, 0, ...",1461
35987,785430430,1,"[1408, 20, 3071, 1534, 191, 19320, 2, 81, 303,...",666


In [0]:
print(train['len'].sum(),"training tokens")

24078978 training tokens


In [0]:
print(train['target'].sum()/len(train))

0.11171616703247854


In [0]:
train['len'].describe()

count    35993.000000
mean       668.990582
std        585.766131
min          9.000000
25%        314.000000
50%        526.000000
75%        869.000000
max      19093.000000
Name: len, dtype: float64

## MODEL

### Get X,y

In [0]:
SEQUENCE_LENGTH = 1000

In [0]:
def get_X(df, seq_len, unk_idx):
  X = pad_sequences(df['text'].values, maxlen=seq_len, padding='post', truncating='post', value=unk_idx)
  return X

In [0]:
def get_Xy(df, seq_len, unk_idx):
    X = get_X(df, seq_len, unk_idx)
    y = df['target'].values
    return X,y

In [0]:
X, y = get_Xy(train, SEQUENCE_LENGTH, word_to_idx['UNK'])

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

In [0]:
print(y_train.sum())
print(y_valid.sum())

3206
815


### Baseline Model

In [0]:
embedding_matrix = np.zeros((VOCAB_SIZE,EMBEDDINGS_DIM))
for i,vector in idx_to_vector.items():
    embedding_matrix[i]=vector

In [0]:
embedding_layer = Embedding(VOCAB_SIZE,
                            EMBEDDINGS_DIM,
                            weights=[embedding_matrix],
                            input_length=SEQUENCE_LENGTH,
                            trainable=True)

In [0]:
inp = Input(shape=(SEQUENCE_LENGTH,))
x = embedded_sequences = embedding_layer(inp)
x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x) 
x = BatchNormalization()(x)
x = Dropout(0.2)(x) 
x = Bidirectional(CuDNNLSTM(2, return_sequences=True))(x) 
x = BatchNormalization()(x)
x = Flatten()(x)
#x = Dropout(0.2)(x) 
#x = Bidirectional(CuDNNLSTM(32))(x) 
#x = BatchNormalization()(x)
#x = Dropout(0.2)(x) 
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 50)          20000050  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 128)         59392     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000, 128)         512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000, 128)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000, 4)           2112      
_________________________________________________________________
batch_normalization_2 (Batch (None, 1000, 4)           16        
__________

In [0]:
y_train = y_train.reshape((len(y_train),1))
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [0]:
y_valid = y_valid.reshape((len(y_valid),1))

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [0]:
curr_time = str(int(time.time()))
check_point = ModelCheckpoint('/content/drive/My Drive/hackthenews/checkpoints/model'+curr_time+'.hdf5',verbose=True, save_best_only=True)
tensorboard_cb = TensorBoard(log_dir='/Graph', histogram_freq=0, write_graph=True, write_images=True)
model.fit(np.vstack([X_valid,X_train]), np.vstack([y_valid,y_train]), epochs=2, batch_size=32, validation_data=(X_valid,y_valid), verbose=1, callbacks=[tensorboard_cb, check_point])

Train on 35993 samples, validate on 7199 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.02948, saving model to /content/drive/My Drive/hackthenews/checkpoints/model1548597419.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.02948 to 0.02088, saving model to /content/drive/My Drive/hackthenews/checkpoints/model1548597419.hdf5


<keras.callbacks.History at 0x7fddf7d01278>

In [0]:
!tensorboard --logdir path_to_current_dir/Graph 

In [0]:
model.load_weights('/content/drive/My Drive/hackthenews/checkpoints/model1548528103.hdf5')

In [0]:
yhat_train = model.predict(X_train)
yhat_valid = model.predict(X_valid)

In [0]:
yhat_train_b = (yhat_train >= 0.5).astype(int)
print(yhat_train_b.sum())
yhat_valid_b = (yhat_valid >= 0.5).astype(int)
print(yhat_valid_b.sum())

3401
871


In [0]:
def model_scorer(y, yhat, scores=None, prefix="", metrics_dict=None):
  if metrics_dict is None:
      metrics_dict = {'acc': accuracy_score,
                     'precision': precision_score,
                     'recall': recall_score,
                     'f1': f1_score}
  scores_dict = {}
      
  for metric, fn in metrics_dict.items():
      scores_dict[prefix+metric] = fn(y, yhat)
      print(metric,': ',scores_dict[prefix+metric])
      
  if scores is None:
      scores = scores_dict
  else:
      scores.update(scores_dict)

  return scores

In [0]:
scores = {'desc': '64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 5 epochs'}
scores['model_checkpoint'] = curr_time
print('Training..')
scores = model_scorer(y_train, yhat_train_b, scores, prefix='train_')
print()
print('Validation..')
scores = model_scorer(y_valid, yhat_valid_b, scores, prefix='valid_')

Training..
acc :  0.99781204417587
precision :  0.9866831836481883
recall :  0.9937616968184654
f1 :  0.9902097902097902

Validation..
acc :  0.9969440200027782
precision :  0.9771359807460891
recall :  0.996319018404908
f1 :  0.9866342648845687


In [0]:
yhat_b = np.vstack([yhat_valid_b,yhat_train_b])

In [0]:
scores = {'desc': '64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 7 epochs'}
scores['model_checkpoint'] = curr_time
print('Training..')
scores = model_scorer(np.vstack([y_valid,y_train]), yhat_b, scores, prefix='train_')

Training..
acc :  0.9930264218042397
precision :  0.9412453183520599
recall :  1.0
f1 :  0.9697335101893163


In [0]:
scores

{'desc': '64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batchnorm, flatten, 1 dense. Pos Threshold=0.5. Trainable embeddings. 7 epochs',
 'model_checkpoint': '1548597419',
 'train_acc': 0.9930264218042397,
 'train_f1': 0.9697335101893163,
 'train_precision': 0.9412453183520599,
 'train_recall': 1.0}

In [0]:
scores_df = pd.read_csv('/content/drive/My Drive/hackthenews/scores.csv', index_col=0)
scores_df = scores_df.append(pd.Series(scores).rename(len(scores_df)))
scores_df.to_csv('/content/drive/My Drive/hackthenews/scores.csv')

In [0]:
scores_df

Unnamed: 0,desc,train_acc,train_f1,train_precision,train_recall,valid_acc,valid_f1,valid_precision,valid_recall,model_checkpoint
0,"baseline model, 64u LSTM layer, 0.2 dropout, 1...",0.887592,0.010398,0.809524,0.005232,0.89247,0.002577,0.25,0.001295,
1,"baseline model, 64u LSTM layer, 0.2 dropout, 1...",0.858066,0.330822,0.353518,0.310865,0.863434,0.334462,0.350355,0.319948,
2,"baseline model, 64u Bidirection LSTM layer, no...",0.918438,0.547417,0.732336,0.437058,0.91581,0.512862,0.675847,0.413212,
3,"baseline model, 64u Bidirection LSTM layer, no...",0.934452,0.69677,0.728985,0.667282,0.925813,0.63772,0.669516,0.608808,
4,"64u BLSTM layer, batchnorm, 0.2 dropout, 32u B...",0.89176,0.63194,0.512747,0.82333,0.87955,0.576039,0.462687,0.762953,
5,same as id 3,0.994338,0.974787,0.979789,0.969837,0.955126,0.78072,0.820257,0.744819,1548510000.0
6,"64u BLSTM layer, batchnorm, 0.2 dropout, 64u B...",0.925559,0.698381,0.643413,0.76362,0.912754,0.636994,0.575157,0.713731,1548520000.0
7,"64u BLSTM layer, batchnorm, 0.2 dropout, 64u B...",0.946922,0.752511,0.794188,0.714989,0.941789,0.713993,0.75469,0.677461,1548520000.0
8,"64u BLSTM layer, batchnorm, 0.2 dropout, 64u B...",0.964916,0.847846,0.830333,0.866113,0.95082,0.775949,0.758663,0.794041,1548520000.0
9,"64u BLSTM layer, batchnorm, 0.2 dropout, 2u BL...",0.971967,0.883835,0.830178,0.944906,0.948597,0.784633,0.712474,0.873057,1548530000.0


In [0]:
for i in range(len(scores_df)):
  print(scores_df['desc'].iloc[i])

baseline model, 64u LSTM layer, 0.2 dropout, 1 dense. Pos Threshold=0.5
baseline model, 64u LSTM layer, 0.2 dropout, 1 dense. Pos Threshold=0.3
baseline model, 64u Bidirection LSTM layer, no dropout, 1 dense. Pos Threshold=0.5
baseline model, 64u Bidirection LSTM layer, no dropout, 1 dense. Pos Threshold=0.5, Trainiable embeddings
64u BLSTM layer, batchnorm, 0.2 dropout, 32u BLSTM, batchnorm, 1 dense. Pos Threshold=0.5. Trainable embeddings
same as id 3
64u BLSTM layer, batchnorm, 0.2 dropout, 64u BLSTM, batchnorm, 0.2 dropout, 1 dense. Pos Threshold=0.5. Untrainable embeddings
64u BLSTM layer, batchnorm, 0.2 dropout, 64u BLSTM, batchnorm, 0.2 dropout, 32u BLSTM, batchnorm, no dropout, 1 dense. Pos Threshold=0.5. Untrainable embeddings. 3 epochs
64u BLSTM layer, batchnorm, 0.2 dropout, 64u BLSTM, batchnorm, 0.2 dropout, 32u BLSTM, batchnorm, no dropout, 1 dense. Pos Threshold=0.5. Untrainable embeddings. 5 epochs
64u BLSTM layer, batchnorm, 0.2 dropout, 2u BLSTM layer return seq, batch

## Using Model on Task 2

In [0]:
with open('/content/drive/My Drive/hackthenews/task2_train.pickle', 'rb') as handle:
  articles_id, sentence_id_list, sentence_list, gold_labels = pickle.load(handle)

In [0]:
processed_sentences = [process_text(sen) for sen in sentence_list[:1000]]

In [0]:
processed_sentences = pad_sequences(processed_sentences, maxlen=SEQUENCE_LENGTH, padding='post', truncating='post', value=word_to_idx['UNK'])

In [0]:
processed_sentences

array([[  7617, 293490,      9, ..., 400000, 400000, 400000],
       [400000, 400000, 400000, ..., 400000, 400000, 400000],
       [     0,   2661,  17544, ..., 400000, 400000, 400000],
       ...,
       [   102,   1579,    105, ..., 400000, 400000, 400000],
       [   738,    119,  11663, ..., 400000, 400000, 400000],
       [ 37262,     40,    411, ..., 400000, 400000, 400000]], dtype=int32)

In [0]:
yhat_sentences = model.predict(processed_sentences)

In [0]:
y_sentences = list(map({'propaganda':1,'non-propaganda':0}.get, gold_labels[:1000]))

In [0]:
sum(y_sentences)

171

In [0]:
yhat_sentences_b = (yhat_sentences>=0.1).astype(int)
print(yhat_sentences_b.sum())

139


In [0]:
s = model_scorer(y_sentences, yhat_sentences_b)

acc :  0.76
precision :  0.2517985611510791
recall :  0.2046783625730994
f1 :  0.22580645161290322


## Dev Set

In [0]:
dev_set_file_name = '/content/drive/My Drive/hackthenews/datasets/dev/task1/task1.dev.txt'

In [0]:
dev_articles_content, dev_articles_id = ([], [])
with open(dev_set_file_name) as f:
    for line in f.readlines():
            article_content, article_id, gold_label = line.rstrip().split("\t")
            dev_articles_content.append(article_content)
            dev_articles_id.append(article_id)

In [0]:
dev = pd.DataFrame({'id': dev_articles_id, 'text':dev_articles_content})

In [0]:
dev['target'] = [np.NaN]*len(dev)

In [0]:
dev.shape

(5116, 3)

In [0]:
dev.head()

Unnamed: 0,id,text,target
0,200017,"Building a quick beat, the North Chicago poet ...",
1,200036,If you needed a further reminder of how Metro ...,
2,200038,Taylor Swift famously withheld her back catalo...,
3,200086,May 14 squall line on radar. (Radarscope) Mon...,
4,200113,A pre-emptive attack by the United States agai...,


In [0]:
dev = process_df(dev)

In [0]:
dev.head()

Unnamed: 0,id,text,target,len
0,200017,"[447, 7, 2582, 960, 1, 0, 193, 1147, 4819, 136...",,844
1,200036,"[83, 81, 911, 7, 489, 8889, 3, 197, 5083, 4859...",,243
2,200038,"[2485, 6596, 11689, 13147, 71, 137, 13257, 25,...",,1075
3,200086,"[107, 657, 55053, 331, 13, 5366, 2, 23, 400000...",,1141
4,200113,"[7, 100893, 436, 21, 0, 104, 112, 98, 193, 574...",,497


In [0]:
with open('/content/drive/My Drive/hackthenews/task1_dev.pickle', 'wb') as handle:
    pickle.dump(dev.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# or reload preprocessed dict from saved pickle
with open('/content/drive/My Drive/hackthenews/task1_dev.pickle', 'rb') as handle:
    dev = pd.DataFrame(pickle.load(handle))

In [0]:
X_dev = get_X(dev, SEQUENCE_LENGTH, word_to_idx['UNK'])

In [0]:
yhat_dev = model.predict(X_dev)

In [0]:
yhat_dev_b = (yhat_dev>=0.5).astype(int)
print(yhat_dev_b.sum())

578


In [0]:
labels = {1:'propaganda', 0:'non-propaganda'}
with open("/content/drive/My Drive/hackthenews/task1-dev-predictions.txt", "w") as fout:
  for i, idx in enumerate(dev['id']):
    fout.write("%s\t%s\n" % (idx, labels[yhat_dev_b[i][0]]))


In [0]:
labels = {1:'propaganda', 0:'non-propaganda'}
with open("/content/drive/My Drive/task1-dev-predictions.txt", "w") as fout:
  for i, idx in enumerate(dev['id']):
    fout.write("%s\t%s\n" % (idx, labels[yhat_dev_b[i][0]]))


## Test Set

In [0]:
test_set_file_name = '/content/drive/My Drive/hackthenews/datasets/test/task1/task1.test.txt'

In [0]:
test_articles_content, test_articles_id = ([], [])
with open(test_set_file_name) as f:
    for line in f.readlines():
            article_content, article_id, gold_label = line.rstrip().split("\t")
            test_articles_content.append(article_content)
            test_articles_id.append(article_id)

In [0]:
test = pd.DataFrame({'id': test_articles_id, 'text':test_articles_content})

In [0]:
test['target'] = [np.NaN]*len(test)

In [0]:
test.shape

(10158, 3)

In [0]:
test.head()

Unnamed: 0,id,text,target
0,100013,Chicago police are seeking the public’s help i...,
1,100015,Moment of silence at Palestine Pavilion during...,
2,100024,The Organization of Islamic Cooperation (OIC) ...,
3,100031,The chairman of the House Oversight and Govern...,
4,100040,A judge in the United Kingdom has sentenced a ...,


In [0]:
test = process_df(test)

In [0]:
test.head()

Unnamed: 0,id,text,target,len
0,100013,"[1147, 142, 32, 1309, 0, 198, 3071, 1534, 275,...",,86
1,100015,"[1600, 3, 5236, 22, 5469, 10605, 105, 10707, 3...",,248
2,100024,"[0, 734, 3, 959, 969, 23, 17847, 24, 43, 802, ...",,153
3,100031,"[0, 663, 3, 0, 166, 6751, 5, 78, 1296, 446, 68...",,530
4,100040,"[7, 924, 6, 0, 104, 1859, 31, 2579, 7, 4756, 5...",,351


In [0]:
with open('/content/drive/My Drive/hackthenews/task1_test.pickle', 'wb') as handle:
    pickle.dump(test.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# or reload preprocessed dict from saved pickle
with open('/content/drive/My Drive/hackthenews/task1_test.pickle', 'rb') as handle:
    test = pd.DataFrame(pickle.load(handle))

In [0]:
X_test = get_X(test, SEQUENCE_LENGTH, word_to_idx['UNK'])

In [0]:
yhat_test = model.predict(X_test)

In [0]:
yhat_test_b = (yhat_test>=0.5).astype(int)
print(yhat_test_b.sum())

1475


In [0]:
labels = {1:'propaganda', 0:'non-propaganda'}
with open("/content/drive/My Drive/hackthenews/task1-test-predictions.txt", "w") as fout:
  for i, idx in enumerate(test['id']):
    fout.write("%s\t%s\n" % (idx, labels[yhat_test_b[i][0]]))


In [0]:
labels = {1:'propaganda', 0:'non-propaganda'}
with open("/content/drive/My Drive/task1-test-predictions.txt", "w") as fout:
  for i, idx in enumerate(test['id']):
    fout.write("%s\t%s\n" % (idx, labels[yhat_test_b[i][0]]))


# Ideas

Further exploration;
* NER and POS tags as input

In [0]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

In [0]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']