In [43]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          13022        3346        5546         265        4129       10932
Swap:             0           0           0


In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
import re
import keras.layers as layers

from collections import Counter
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.layers import Input, Embedding, BatchNormalization, LSTM, Dense, Concatenate
from keras.models import Model

from keras.utils import plot_model

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
reliable = pd.read_csv('drive/My Drive/CSV/reliable_mini.csv').drop(columns = ['title', 'authors'])
fake = pd.read_csv('drive/My Drive/CSV/fake_mini.csv').drop(columns = ['title','authors'])

In [0]:
type_dic = {'fake': 0, 'reliable':1}
f = lambda x:type_dic[x]

In [0]:
reliable['type'] = reliable['type'].apply(f)
fake['type'] = fake['type'].apply(f)

In [0]:
train_df = reliable[:6000]
train_df = pd.concat([train_df,fake[:6000]])
train_df.reset_index(drop = True,inplace = True)
                      

In [0]:
test_df = reliable[6000:]
test_df = pd.concat([test_df,fake[6000:]])
test_df.reset_index(drop = True,inplace = True)

In [18]:

test_df['type'].value_counts()

1    4000
0    4000
Name: type, dtype: int64

In [20]:
train_df['type'].value_counts()

1    6000
0    6000
Name: type, dtype: int64

In [0]:
# parameter of max word length
time_steps = 100


# building vocabulary from dataset
def build_vocabulary(sentence_list):
    unique_words = " ".join(sentence_list).strip().split()
    word_count = Counter(unique_words).most_common()
    vocabulary = {}
    for word, _ in word_count:
        vocabulary[word] = len(vocabulary)        

    return vocabulary


# Get vocabulary vectors from document list
# Vocabulary vector, Unknown word is 1 and padding is 0
# INPUT: raw sentence list
# OUTPUT: vocabulary vectors list
def get_voc_vec(document_list, vocabulary):    
    voc_ind_sentence_list = []
    for document in document_list:
        voc_idx_sentence = []
        word_list = document.split()
        
        for w in range(time_steps):
            if w < len(word_list):
                # pickup vocabulary id and convert unknown word into 1
                voc_idx_sentence.append(vocabulary.get(word_list[w], -1) + 2)
            else:
                # padding with 0
                voc_idx_sentence.append(0)
            
        voc_ind_sentence_list.append(voc_idx_sentence)
        
    return np.array(voc_ind_sentence_list)


vocabulary = build_vocabulary(train_df['content'])

In [0]:


# Reduce TensorFlow logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Instantiate the elmo model
elmo_module = hub.Module("https://tfhub.dev/google/elmo/1", trainable=False)

# Initialize session
sess = tf.Session()
K.set_session(sess)

K.set_learning_phase(1)

sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [0]:
# mini-batch generator
def batch_iter(data, labels, batch_size, shuffle=True):
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    print("batch_size", batch_size)
    print("num_batches_per_epoch", num_batches_per_epoch)

    def data_generator():
        data_size = len(data)

        while True:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
                shuffled_labels = labels[shuffle_indices]
            else:
                shuffled_data = data
                shuffled_labels = labels

            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                                
                X_voc = get_voc_vec(shuffled_data[start_index: end_index], vocabulary)
                                
                sentence_split_list = []
                sentence_split_length_list = []
            
                for sentence in shuffled_data[start_index: end_index]:    
                    sentence_split = sentence.split()
                    sentence_split_length = len(sentence_split)
                    sentence_split += ["NaN"] * (time_steps - sentence_split_length)
                    
                    sentence_split_list.append((" ").join(sentence_split))
                    sentence_split_length_list.append(sentence_split_length)
        
                X_elmo = np.array(sentence_split_list)

                X = [X_voc, X_elmo]
                y = shuffled_labels[start_index: end_index]
                
                yield X, y

    return num_batches_per_epoch, data_generator()

In [0]:


# embed elmo method
def make_elmo_embedding(x):
    embeddings = elmo_module(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["elmo"]
    
    return embeddings

In [37]:
# elmo embedding dimension
elmo_dim = 1024

# Input Layers
word_input = Input(shape=(None, ), dtype='int32')  # (batch_size, sent_length)
elmo_input = Input(shape=(None, ), dtype="string") #we change 'tf.string)'  # (batch_size, sent_length, elmo_size)

# Hidden Layers
word_embedding = Embedding(input_dim=len(vocabulary), output_dim=128, mask_zero=True)(word_input)
elmo_embedding = layers.Lambda(make_elmo_embedding, output_shape=(None, elmo_dim))(elmo_input)
word_embedding = Concatenate()([word_embedding, elmo_embedding])
word_embedding = BatchNormalization()(word_embedding)
x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(word_embedding)

# Output Layer
predict = Dense(units=1, activation='sigmoid')(x)


model = Model(inputs=[word_input, elmo_input], outputs=predict)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

#plot_model(model, to_file="model.png", show_shapes=True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 128)    42909056    input_5[0][0]                    
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, None, 1024)   0           input_6[0][0]                    
__________________________________________________________________________________________________
concatenat

In [0]:

# Create datasets (Only take up to time_steps words for memory)
train_text = train_df['content'].tolist()
train_text = [' '.join(t.split()[0:time_steps]) for t in train_text]
train_text = np.array(train_text)
train_label = np.array(train_df['type'].tolist())

test_text = test_df['content'].tolist()
test_text = [' '.join(t.split()[0:time_steps]) for t in test_text]
test_text = np.array(test_text)
test_label = np.array(test_df['type'].tolist())

In [39]:

batch_size = 32

train_steps, train_batches = batch_iter(train_text,
                                        np.array(train_df['type']),
                                        batch_size)
valid_steps, valid_batches = batch_iter(test_text,
                                        np.array(test_df['type']),
                                        batch_size)

batch_size 32
num_batches_per_epoch 375
batch_size 32
num_batches_per_epoch 250


In [0]:



logfile_path = './log'

In [41]:
tb_cb = TensorBoard(log_dir=logfile_path, histogram_freq=0)

history = model.fit_generator(train_batches, train_steps,
                              epochs=5, 
                              validation_data=valid_batches,
                              validation_steps=valid_steps,
                              callbacks=[tb_cb])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
score = model.evaluate_generator(valid_batches, valid_steps)
print('Test score:', score[0])
print('Test accuracy;', score[1])

Test score: 0.12514827745070214
Test accuracy; 0.967


In [42]:
model_json = model.to_json()
with open("drive/My Drive/model96.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("drive/My Drive/model96.h5")
print("Saved model to disk")

Saved model to disk


In [0]:
model.save('drive/My Drive/eLMO.h5')

In [47]:
!ls -lh drive/My\ Drive/

total 666M
-rw------- 1 root root 290K Feb 27  2017 'algo assignment.rar'
drwx------ 2 root root 4.0K Jul 30 06:11  CGA
drwx------ 2 root root 4.0K Feb  7  2018  Classroom
drwx------ 2 root root 4.0K Oct 25 17:42 'Colab Notebooks'
-rw------- 1 root root    1 Apr  9  2018 'Cover and Indexpage - SE.gdoc'
drwx------ 2 root root 4.0K Oct 25 17:37  CSV
-rw------- 1 root root    1 Apr  9  2018  EITlab_file.gdoc
-rw------- 1 root root 499M Oct 26 14:27  eLMO.h5
-rw------- 1 root root    1 Apr  9  2018  index.gdoc
-rw------- 1 root root  25K Aug  5 12:19  lab1.docx
-rw------- 1 root root  275 Aug  5 12:19  LAB_1_GE.C
-rw------- 1 root root 3.6K Aug  5 12:19  Lab_1_px.PNG
drwx------ 2 root root 4.0K Sep  5  2017  LabWoRk
-rw------- 1 root root 167M Oct 26 13:28  model96.h5
-rw------- 1 root root 6.0K Oct 26 11:55  model96.hdf5
-rw------- 1 root root 3.9K Oct 26 13:28  model96.json
-rw------- 1 root root    1 Apr 18  2018 'Personal_Information_Proforma - Copy.gdoc'
-rw------- 1 root root 1.9K Oc

In [50]:
from keras.models import load_model

model = load_model('drive/My Drive/eLMO.h5')

NameError: ignored