In [1]:
import numpy as np
import pandas as pd
import re
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical


In [2]:
def remove_special_chars(tweets):
    for remove in map(lambda r: re.compile(re.escape(r)), [',', ':', '\'', '=', '&', ';', '%', '$',
                                                            '@', '%', '^', '*', '(',')', '{','}',
                                                            '[',' ]', '|', '/', '\\', '>', '<', '-',
                                                            '!', '?', '.', "'", ' — ', ' — -', '#']):
         tweets.replace(remove, '', inplace=True)
    return tweets
                                                           
def remove_tags(text):
     return re.compile(r'<[^>]+>').sub('', text)

def remove_num(text):
     return ''.join(re.sub(r'([0–9]+)','',text))
                                                           


In [3]:
data = pd.read_csv('uw.csv', encoding='mac_roman')

In [4]:
data.Input=data.Input.apply(lambda x : remove_tags(x))

In [5]:
data.Input=data.Input.apply(lambda x : remove_num(x))

In [6]:
remove_special_chars(data.Input)


0                       screams in 25 different languages
1       Families to sue over Legionnaires More than 4 ...
2       Pandemonium In Aba As Woman Delivers Baby With...
3       My emotions are a train wreck My body is a tra...
4       Alton brown just did a livestream and he burne...
                              ...                        
1859    Trollkrattos Juan Carlos Salvador The Secret T...
1860    devon_breneman hopefully it doesnt electrocute...
1861    Businesses are deluged with invokces Make your...
1862    BREAKING411 4 police officers arrested for abu...
1863    News Refugio oil spill may have been costlier ...
Name: Input, Length: 1864, dtype: object

In [7]:
data

Unnamed: 0,Validation,Input,Unnamed: 2,Unnamed: 3
0,0,screams in 25 different languages,,
1,1,Families to sue over Legionnaires More than 4 ...,,
2,1,Pandemonium In Aba As Woman Delivers Baby With...,,
3,0,My emotions are a train wreck My body is a tra...,,
4,0,Alton brown just did a livestream and he burne...,,
...,...,...,...,...
1859,0,Trollkrattos Juan Carlos Salvador The Secret T...,,
1860,0,devon_breneman hopefully it doesnt electrocute...,,
1861,0,Businesses are deluged with invokces Make your...,,
1862,1,BREAKING411 4 police officers arrested for abu...,,


In [8]:
tokenizer = Tokenizer(num_words=5000,lower=True,split=' ')

In [9]:
tokenizer.fit_on_texts(data['Input'])

In [10]:
X = tokenizer.texts_to_sequences(data['Input'])
X = pad_sequences(X,maxlen=500)
Y = data['Validation']
vocab_size = len(tokenizer.word_index) + 1

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 24)

In [12]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    le.fit(y_test)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    
    return y_train_enc, y_test_enc

In [14]:
y_train,y_test = prepare_targets(Y_train,Y_test)

In [15]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=500))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           428800    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 612,353
Trainable params: 612,353
Non-trainable params: 0
_________________________________________________________________


In [20]:
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
history=model.fit(X_train, y_train, batch_size=128, epochs=6, validation_data=[X_test, y_test])
#, callbacks=[es]

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [23]:
model.save('news_sub.h5')

In [21]:
news = 'ten killed and hundreds wounded in the earthquake'
x_1=tokenizer.texts_to_sequences([news])
x_1 = pad_sequences(x_1,maxlen=500)
model.predict(x_1)
#the output is closer to 1 for news
#and closer to zero for not news

array([[0.90072036]], dtype=float32)

In [22]:
news2 = 'going to the beach this weekend'
x_2=tokenizer.texts_to_sequences([news2])
x_2 = pad_sequences(x_2,maxlen=500)
model.predict(x_2)
#the output is closer to 1 for news
#and closer to zero for not news

array([[0.07486376]], dtype=float32)