In [1]:
import re
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, LSTM, Embedding, Reshape
from keras.models import load_model, model_from_json
import pickle


Using TensorFlow backend.


In [3]:
from keras.engine.topology import Layer
import keras.backend as K
from keras import initializers
import numpy as np

class Embedding2(Layer):
    """Embedding layer whose lookup table is part frozen, part trainable.

    The first ``len(fixed_weights)`` rows of the table are copied from
    ``fixed_weights`` and registered as non-trainable.  The remaining
    ``input_dim - len(fixed_weights)`` rows are randomly initialised and
    registered as trainable.  Because the two parts are concatenated in that
    order, token indices must be assigned so that tokens with a fixed vector
    come first.

    # Arguments
        input_dim: total number of rows in the lookup table.
        output_dim: length of each embedding vector.
        fixed_weights: 2-D array-like of shape ``(n_fixed, output_dim)``
            holding the frozen vectors, with ``n_fixed <= input_dim``.
        embeddings_initializer: stored on the instance but not used; the
            trainable rows are always initialised as described in
            ``__init__``.
        input_length: optional fixed sequence length.
        **kwargs: forwarded to ``Layer.__init__`` (``dtype`` is forced to
            ``'int32'``).

    # Raises
        ValueError: if ``fixed_weights`` is not 2-D, its row length differs
            from ``output_dim``, or it has more rows than ``input_dim``.
    """

    def __init__(self, input_dim, output_dim, fixed_weights, embeddings_initializer='uniform',
                 input_length=None, **kwargs):
        kwargs['dtype'] = 'int32'
        if 'input_shape' not in kwargs:
            kwargs['input_shape'] = (input_length,) if input_length else (None,)
        super(Embedding2, self).__init__(**kwargs)

        # Accept any array-like (e.g. a list of vectors) and fail early with a
        # readable message instead of an opaque broadcasting/concat error later.
        fixed_weights = np.asarray(fixed_weights)
        if fixed_weights.ndim != 2 or fixed_weights.shape[1] != output_dim:
            raise ValueError(
                'fixed_weights must have shape (n_fixed, %s); got %s'
                % (output_dim, fixed_weights.shape))
        if len(fixed_weights) > input_dim:
            raise ValueError(
                'fixed_weights has %d rows but input_dim is only %d'
                % (len(fixed_weights), input_dim))

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embeddings_initializer = embeddings_initializer
        self.fixed_weights = fixed_weights
        self.num_trainable = input_dim - len(fixed_weights)
        self.input_length = input_length

        # Initialise the trainable rows from a normal distribution that matches
        # the per-dimension mean/std of the fixed vectors.  The draw uses the
        # global NumPy RNG, so seed it beforehand for reproducible results.
        w_mean = fixed_weights.mean(axis=0)
        w_std = fixed_weights.std(axis=0)
        self.variable_weights = w_mean + w_std * np.random.randn(self.num_trainable, output_dim)

    def build(self, input_shape, name='embeddings'):
        """Create the backend variables and assemble the lookup table.

        ``name`` is the prefix for the two backend variable names
        (``<name>_fixed`` and ``<name>_var``).
        """
        fixed_weight = K.variable(self.fixed_weights, name=name + '_fixed')
        variable_weight = K.variable(self.variable_weights, name=name + '_var')

        # Only the randomly initialised rows are exposed as trainable weights.
        self._trainable_weights.append(variable_weight)
        self._non_trainable_weights.append(fixed_weight)

        # Fixed rows first: index i < len(fixed_weights) selects a frozen vector.
        self.embeddings = K.concatenate([fixed_weight, variable_weight], axis=0)

        self.built = True

    def call(self, inputs):
        """Look up the embedding vector for every integer index in ``inputs``."""
        if K.dtype(inputs) != 'int32':
            inputs = K.cast(inputs, 'int32')
        return K.gather(self.embeddings, inputs)

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], length, output_dim)``.

        ``length`` is ``self.input_length`` when set, otherwise
        ``input_shape[1]``.
        """
        input_length = self.input_length or input_shape[1]
        return (input_shape[0], input_length, self.output_dim)


In [4]:
def clean_text_series(series):
    """Normalise a pandas Series of raw strings for whitespace tokenisation.

    Steps, in order: lowercase; replace URLs with the literal ``<URL>``;
    drop every character that is not a period, word character or whitespace;
    collapse runs of periods into one; surround each period with spaces so it
    splits off as its own token; collapse runs of whitespace; strip both ends.

    ``regex=True`` is passed explicitly: from pandas 2.0 the default for
    ``Series.str.replace`` is literal matching, under which none of these
    patterns would match anything.

    Returns a new Series; the input is not modified.
    """
    return (
        series.str.lower()
        .str.replace(r'http[\w:/\.]+', '<URL>', regex=True)
        # This also removes the '<' and '>' inserted by the previous step, so
        # a URL ends up as the bare token 'URL'.
        .str.replace(r'[^\.\w\s]', '', regex=True)
        .str.replace(r'\.\.+', '.', regex=True)
        .str.replace(r'\.', ' . ', regex=True)
        .str.replace(r'\s\s+', ' ', regex=True)
        .str.strip()
    )


# The original (disabled) version of this cell applied the same steps to a
# DataFrame loaded from disk:
#     df = pd.read_csv('data/clean_actual_data_all_news.csv')
#     df.columns = ['title', 'text', 'label']
#     df.title = clean_text_series(df.title)
#     df.text = clean_text_series(df.text)
#     print(df.shape)
#     df.head()

# NOTE(review): `title` and `text` are not defined in any visible cell; they
# must already exist as pandas Series of strings before this cell runs.
title = clean_text_series(title)
text = clean_text_series(text)



(24319, 3)


Unnamed: 0,title,text,label
0,english businesses should be forced to show hy...,all premises in england that sell food should ...,REAL
1,tips should go to workers not employers says g...,tips in restaurants hotels and bars should go ...,REAL
2,sturgeon accused of breaking independence prom...,election debates are more common in holyrood p...,REAL
3,second scottish independence vote not yet on t...,nicola sturgeon has said a second independence...,REAL
4,bernie sanders vows a contested convention des...,bernie sanders acknowledged an uphill climb ah...,REAL


In [14]:

# Binary classifier: part-frozen embedding -> LSTM -> single sigmoid unit.
# The frozen rows of the embedding are the GloVe vectors of `words_in_glove`,
# in that order; the remaining rows of the table are trainable.
model = Sequential([
    Embedding2(
        input_dim=len(word2num),
        output_dim=50,
        fixed_weights=np.array([word2glove[token] for token in words_in_glove]),
    ),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding2_1 (Embedding2)    (None, None, 50)          5486900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 5,516,405
Trainable params: 420,205
Non-trainable params: 5,096,200
_________________________________________________________________


In [16]:
# Restore the trained parameters into `model`.
model.load_weights("rmsprop_guardian_kaggle_data_trained_model.h5")

# Pickle streams are bytes: open in binary mode (text mode fails on Python 3)
# and use a context manager so the file handle is always closed.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a vocabulary file from a trusted source.
# NOTE(review): the model-building cell already reads `word2num`, so on a
# fresh kernel the vocabulary has to be available before that cell runs.
with open("word2num_processed.pkl", "rb") as fp:
    word2num = pickle.load(fp)

In [None]:
# NOTE(review): `.lower()` is a str method, so `text` is presumably a single
# plain string here rather than the Series produced by the cleaning cell -
# confirm where it is rebound.
sentence = text.lower()

# Map each whitespace-separated token to its vocabulary index; tokens missing
# from the vocabulary share the '<Other>' index.
sentence_num = [
    word2num['<Other>'] if token not in word2num else word2num[token]
    for token in sentence.split()
]

# Prefix with zero copies of the '<PAD>' index: this adds nothing to the
# sequence, but the '<PAD>' key is still looked up.
sentence_num = [word2num['<PAD>']] * 0 + sentence_num
sentence_num = np.array(sentence_num)

# Add a leading axis of size 1 so the model receives a batch with one sequence.
model.predict(sentence_num[np.newaxis, :])