In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical


In [2]:
def remove_html(text):
    """Return the text content of ``text`` with markup removed.

    The input is parsed with BeautifulSoup using the 'lxml' parser and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()

def remove_punctuation(text):
    """Return ``text`` with ASCII punctuation and decimal digits deleted.

    Despite the name, characters in ``string.digits`` are removed as well as
    those in ``string.punctuation``; everything else (including whitespace)
    is kept in its original order.
    """
    # A translation table that maps every unwanted character to "delete".
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    return text.translate(strip_table)

def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (e.g. the list produced by a tokenizer).
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop-word list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The tokens of ``text`` not found in the stop-word collection,
        in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the original re-evaluated
    # stopwords.words('english') for every single token and then did a
    # linear scan of the resulting list.
    stop_set = frozenset(stop_words)
    return [w for w in text if w not in stop_set]

In [3]:
# Read the dataset from 'uw.csv' (path relative to the working directory).
df = pd.read_csv(filepath_or_buffer='uw.csv')

In [4]:
# Strip HTML *before* punctuation. remove_punctuation deletes '<', '>', '&'
# and ';', so running it first leaves nothing for the HTML parser to
# recognise (tag names and entity names such as "amp" would survive as
# ordinary words). The later remove_html pass over 'clean' is then a
# harmless no-op.
df['clean'] = df['Input'].apply(lambda x: remove_punctuation(remove_html(x)))
df['clean']

0                         screams in  different languages
1       Families to sue over Legionnaires More than  f...
2       Pandemonium In Aba As Woman Delivers Baby With...
3       My emotions are a train wreck My body is a tra...
4       Alton brown just did a livestream and he burne...
                              ...                        
1859    Trollkrattos Juan Carlos Salvador The Secret T...
1860    devonbreneman hopefully it doesnt electrocute ...
1861    Businesses are deluged with invokces Make your...
1862    BREAKING  police officers arrested for abusing...
1863    News Refugio oil spill may have been costlier ...
Name: clean, Length: 1864, dtype: object

In [5]:
# Run every entry of the 'clean' column through remove_html.
df['clean'] = df['clean'].apply(remove_html)

In [6]:
# Regex-based word tokenizer built on the pattern \w+ (runs of word characters).
tokenizer1 = RegexpTokenizer(pattern=r'\w+')

In [7]:
# Lower-case each entry, then split it into a list of word tokens.
df['clean'] = df['clean'].str.lower().apply(tokenizer1.tokenize)
df['clean']

0                     [screams, in, different, languages]
1       [families, to, sue, over, legionnaires, more, ...
2       [pandemonium, in, aba, as, woman, delivers, ba...
3       [my, emotions, are, a, train, wreck, my, body,...
4       [alton, brown, just, did, a, livestream, and, ...
                              ...                        
1859    [trollkrattos, juan, carlos, salvador, the, se...
1860    [devonbreneman, hopefully, it, doesnt, electro...
1861    [businesses, are, deluged, with, invokces, mak...
1862    [breaking, police, officers, arrested, for, ab...
1863    [news, refugio, oil, spill, may, have, been, c...
Name: clean, Length: 1864, dtype: object

In [8]:
# Drop stop words from every token list in the 'clean' column.
df['clean'] = df['clean'].apply(remove_stopwords)
df['clean']

0                         [screams, different, languages]
1       [families, sue, legionnaires, families, affect...
2       [pandemonium, aba, woman, delivers, baby, with...
3       [emotions, train, wreck, body, train, wreck, i...
4       [alton, brown, livestream, burned, butter, tou...
                              ...                        
1859    [trollkrattos, juan, carlos, salvador, secret,...
1860    [devonbreneman, hopefully, doesnt, electrocute...
1861    [businesses, deluged, invokces, make, stand, c...
1862    [breaking, police, officers, arrested, abusing...
1863    [news, refugio, oil, spill, may, costlier, big...
Name: clean, Length: 1864, dtype: object

In [9]:
# Keras tokenizer limited to num_words=5000, lower-casing its input and
# splitting on single spaces.
tokenizer = Tokenizer(
    num_words=5000,
    lower=True,
    split=' ',
)

In [10]:
# Build the tokenizer vocabulary.
# NOTE(review): this is fitted on the raw 'Input' column, not on the 'clean'
# column prepared above, so the manual cleaning steps do not feed the model.
# The inference cells also tokenize raw text, so this is at least consistent;
# confirm whether that is intended.
tokenizer.fit_on_texts(texts=df['Input'])

In [11]:
# Encode the raw 'Input' texts as integer sequences padded/truncated to
# length 500; the targets come from the 'Validation' column.
X = pad_sequences(tokenizer.texts_to_sequences(df['Input']), maxlen=500)
Y = df['Validation']
# One slot per known word plus one for index 0.
# NOTE(review): this counts the full word_index, which may exceed the
# tokenizer's num_words limit; presumably only indices below num_words are
# ever emitted, so the embedding could be smaller. Verify before changing.
vocab_size = 1 + len(tokenizer.word_index)

In [12]:
# Hold out 30% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=24,
)

In [13]:
# Persist the fitted tokenizer to 'tokenizer.pickle' so the same word index
# can be reused later.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(
        tokenizer,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [14]:
def prepare_targets(y_train, y_test):
    """Label-encode the train and test targets with one shared encoder.

    Parameters
    ----------
    y_train, y_test : array-like
        Raw class labels of the training and test splits.

    Returns
    -------
    (y_train_enc, y_test_enc)
        The two inputs transformed by the same fitted ``LabelEncoder``.
    """
    le = LabelEncoder()
    # Fit exactly once, on the labels of both splits. The original called
    # fit() twice, so the second call (on y_test) discarded the first and
    # transform(y_train) raised for any label that occurred only in the
    # training split. When every training label also occurs in y_test the
    # resulting encoding is identical to the original's.
    le.fit(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)

    return y_train_enc, y_test_enc

In [15]:
# Encoded targets (lower-case names) derived from the raw splits (upper-case names).
y_train, y_test = prepare_targets(y_train=Y_train, y_test=Y_test)

In [16]:
# Binary text classifier: embedding -> bidirectional LSTM -> dropout ->
# single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 50, input_length=500),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           431850    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 615,403
Trainable params: 615,403
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Early stopping is available but currently disabled:
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# validation_data is passed as a tuple (x_val, y_val), the form Keras
# documents; a list is not handled the same way by every Keras version.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=6,
    validation_data=(X_test, y_test),
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [22]:
news = 'ten killed and hundreds wounded in the earthquake'
# Encode and pad the sample exactly as the training data was (maxlen=500).
x_1 = pad_sequences(tokenizer.texts_to_sequences([news]), maxlen=500)
# Score interpretation: values near 1 mean "news", values near 0 mean "not news".
model.predict(x_1)

array([[0.94667554]], dtype=float32)

In [23]:
news2 = 'going to the beach this weekend'
# Encode and pad the sample exactly as the training data was (maxlen=500).
x_2 = pad_sequences(tokenizer.texts_to_sequences([news2]), maxlen=500)
# Score interpretation: values near 1 mean "news", values near 0 mean "not news".
model.predict(x_2)

array([[0.01637271]], dtype=float32)

In [24]:
# Write the trained model to 'news.h5' in the working directory.
model.save(filepath='news.h5')