In [1]:
import random
import tensorflow as tf
import numpy as np
import os
import re
import tensorflow_hub as hub


from keras.layers import Layer
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.models import Sequential
from keras.models import Model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import spacy

Using TensorFlow backend.


#### Initialize Session

In [2]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

sess = tf.compat.v1.Session(config=config)
sess.run(tf.global_variables_initializer())
K.set_session(sess)

#### Initialize Language Model

In [5]:
nlp = spacy.load('en')

### Prepare Data

In [3]:
data = pd.read_csv('dataset/data/articles_dataset.csv')
data = data[~data['content'].isnull()]
data.head()

Unnamed: 0,title,content,link,source,class
0,Tibit Communications Raises $20M in Series B F...,"Tibit Communications, Inc., a Petaluma, CA-bas...",http://www.finsmes.com/2019/04/tibit-communica...,FinsmesUSA,Funding
1,Twitter blames human error after blocking a Ne...,"Over the holiday weekend, The New York Times f...",https://techcrunch.com/2017/11/27/twitter-blam...,techcrunch,Other
2,SimplyCook Raises £4.5M in Series A Funding\n,"SimplyCook, a London, UK-based recipe kit serv...",http://www.finsmes.com/2019/01/simplycook-rais...,FinsmesUK,Funding
3,Moogsoft Secures $40M in Series D Funding\n,"Moogsoft, a San Francisco, CA-based provider o...",http://www.finsmes.com/2018/03/moogsoft-secure...,FinsmesUSA,Funding
4,Zeta Global acquires commenting service†Disqus,A source close to the two companies tells us t...,https://techcrunch.com/2017/12/05/zeta-global-...,techcrunch,Other


In [18]:
def replace_entities(text):
    doc = nlp(text)
    organizations = [ent.text for ent in doc.ents if ent.label_=='ORG']
    moneys = [ent.text for ent in doc.ents if ent.label_=='MONEY']
    people = [ent.text for ent in doc.ents if ent.label_=='PERSON']
    
    for org in organizations:
        text = text.replace(org, 'organization')
        
    for money in moneys:
        text = text.replace(money, 'money')
        
    for person in people:
        text = text.replace(person, 'person')
    
    return text

In [20]:
data['modified_content'] = data['content'].map(replace_entities)
data.head()

Unnamed: 0,title,content,link,source,class,modified_content
0,Tibit Communications Raises $20M in Series B F...,"Tibit Communications, Inc., a Petaluma, CA-bas...",http://www.finsmes.com/2019/04/tibit-communica...,FinsmesUSA,Funding,"organization, a Petaluma, organization-based s..."
1,Twitter blames human error after blocking a Ne...,"Over the holiday weekend, The New York Times f...",https://techcrunch.com/2017/11/27/twitter-blam...,techcrunch,Other,"Over the holiday weekend, organization found t..."
2,SimplyCook Raises £4.5M in Series A Funding\n,"SimplyCook, a London, UK-based recipe kit serv...",http://www.finsmes.com/2019/01/simplycook-rais...,FinsmesUK,Funding,"organization, a London, UK-based recipe kit se..."
3,Moogsoft Secures $40M in Series D Funding\n,"Moogsoft, a San Francisco, CA-based provider o...",http://www.finsmes.com/2018/03/moogsoft-secure...,FinsmesUSA,Funding,"organization, a San Francisco, organization-ba..."
4,Zeta Global acquires commenting service†Disqus,A source close to the two companies tells us t...,https://techcrunch.com/2017/12/05/zeta-global-...,techcrunch,Other,A source close to the two companies tells us t...


In [29]:
data.to_csv('dataset/data/ner_articles_dataset.csv')

In [30]:
articles = data['modified_content'].values
titles = data['title'].values
labels = data['class'].values

In [31]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)


def encode_labels(labels):
    encoded = label_encoder.transform(labels)
    return encoded

def decode_labels(encoded_labels):
    labels_type = type(encoded_labels)
    
    if labels_type == int:
        try:
            label = label_encoder.inverse_transform([encoded_labels])
        except ValueError:
            print('Unknown value')
            return np.nan
        
        return label
    
    elif hasattr(encoded_labels, '__iter__') and labels_type != str:
        try:
            labels = label_encoder.inverse_transform(encoded_labels)
            
        except ValueError:
            print('Unknown value')
            raise
            
        return labels
    else:
        raise TypeError

#### Reduce articles to 150 words

In [32]:
articles = np.array([' '.join(article.split()[:150]) for article in articles])

#### Create training, dev and test data

In [33]:
data_size = len(articles)

train_size = round(0.8 * data_size)
dev_size = round(0.1 * data_size)

dev_end = train_size + dev_size

print(train_size, dev_size, dev_end)

35225 4403 39628


In [34]:
shuffle = np.random.permutation(len(articles))
articles = articles[shuffle]
titles = titles[shuffle]
labels = labels[shuffle]

articles_train, labels_train, titles_train = articles[:train_size], labels[:train_size], titles[:train_size]
articles_dev, labels_dev, titles_dev = articles[train_size:dev_end], labels[train_size:dev_end], titles[train_size:dev_end]
articles_test, labels_test, titles_test = articles[dev_end:], labels[dev_end:], titles[dev_end:]

### Model Building
#### Create Elmo Layer

In [35]:
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['elmo']
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 48, self.dimensions)

In [36]:
def build_model(): 
    input_text = layers.Input(shape=(1,), dtype="string")
    embedding = ElmoEmbeddingLayer()(input_text)
    lstm = layers.Bidirectional(layers.LSTM(512))(embedding)
    dense = layers.Dense(128, activation='relu')(lstm)
    pred = layers.Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[input_text], outputs=pred)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [37]:
model = build_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_2 (Elmo (None, 48, 1024)          4         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              6295552   
_________________________________________________________________
dense_3 (Dense)              (None, 128)               131200    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 6,426,885
Trainable params: 6,426,885
Non-trainable params: 0
_________________________________________________________________


In [38]:
### Early Stopping
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

In [39]:
# Build and fit
model.reset_states()
history = model.fit(articles_train, labels_train,
                  validation_data=(articles_dev, labels_dev),
                    epochs=10, batch_size=128, callbacks=[early_stop])

W0803 20:54:11.576054 140196501894976 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 35225 samples, validate on 4403 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping


In [40]:
loss, accuracy = model.evaluate(articles_test, labels_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.9155


### LSTM

In [3]:
data = pd.read_csv('dataset/data/ner_articles_dataset.csv')

In [4]:
articles = data['modified_content'].values
titles = data['title'].values
labels = data['class'].values

In [5]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [6]:
articles = np.array([' '.join(article.split()[:150]) for article in articles])

In [7]:
data_size = len(articles)

train_size = round(0.8 * data_size)
dev_size = round(0.1 * data_size)

dev_end = train_size + dev_size

print(train_size, dev_size, dev_end)

35225 4403 39628


In [8]:
shuffle = np.random.permutation(len(articles))
articles = articles[shuffle]
titles = titles[shuffle]
labels = labels[shuffle]

articles_train, labels_train, titles_train = articles[:train_size], labels[:train_size], titles[:train_size]
articles_dev, labels_dev, titles_dev = articles[train_size:dev_end], labels[train_size:dev_end], titles[train_size:dev_end]
articles_test, labels_test, titles_test = articles[dev_end:], labels[dev_end:], titles[dev_end:]

In [9]:
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['elmo']
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 48, self.dimensions)

In [10]:
def build_model(): 
    input_text = layers.Input(shape=(1,), dtype="string")
    embedding = ElmoEmbeddingLayer()(input_text)
    lstm = layers.LSTM(512)(embedding)
    dense = layers.Dense(30, activation='relu')(lstm)
    pred = layers.Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[input_text], outputs=pred)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [12]:
model = build_model()

W0803 23:04:44.124125 139833800501056 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0803 23:04:44.127584 139833800501056 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0803 23:04:45.351426 139833800501056 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0803 23:04:46.483433 139833800501056 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimize

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 48, 1024)          4         
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               3147776   
_________________________________________________________________
dense_1 (Dense)              (None, 30)                15390     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 3,163,201
Trainable params: 3,163,201
Non-trainable params: 0
_________________________________________________________________


In [13]:
### Early Stopping
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

In [14]:
# Build and fit
model.reset_states()
history = model.fit(articles_train, labels_train,
                  validation_data=(articles_dev, labels_dev),
                    epochs=10, batch_size=128, callbacks=[early_stop])

W0803 23:05:17.745397 139833800501056 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 35225 samples, validate on 4403 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping


In [15]:
loss, accuracy = model.evaluate(articles_test, labels_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.9121
