# LSTM - Article Classification

In [1]:
import base64
import os
import re
import string
from collections import Counter
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import clone_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# NLTK's English stop-word list, held as a set.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
STOPWORDS = set(stopwords.words('english'))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Data and Preprocessing

In [12]:
# Folder that holds the input CSV. It defaults to the location the notebook was
# written against; set the NEWS_DATA_DIR environment variable to run it on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get('NEWS_DATA_DIR', '/Users/briankalinowski/Desktop/Data'))

df = pd.read_csv(DATA_DIR / 'news_content_lemma.csv')
print(df.shape)
df.head()

(26227, 6)


Unnamed: 0,title,text,tokenized_headline,tokenized_content,type,valid_score
0,Muslims BUSTED They Stole Millions In Govt Ben...,Print They should pay all the back all the mon...,muslims bust steal millions in govt benefit,print should pay all the back all the money pl...,bias,0
1,Re Why Did Attorney General Loretta Lynch Plea...,Why Did Attorney General Loretta Lynch Plead T...,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,bias,0
2,BREAKING Weiner Cooperating With FBI On Hillar...,Red State Fox News Sunday reported this mornin...,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,bias,0
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,pin drop speech by father of daughter kidnappe...,email kayla mueller be a prisoner and torture ...,bias,0
4,FANTASTIC! TRUMPS 7 POINT PLAN To Reform Healt...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,fantastic trump 7 point plan to reform healthc...,email healthcare reform to make america great ...,bias,0


In [3]:
# Run every text column through str(), element by element, so each cell holds a
# string (a missing value becomes the literal text 'nan').
for text_column in ('title', 'text', 'tokenized_headline', 'tokenized_content'):
    df[text_column] = df[text_column].map(str)

df.head()

Unnamed: 0,title,text,tokenized_headline,tokenized_content,type,valid_score
0,Muslims BUSTED They Stole Millions In Govt Ben...,Print They should pay all the back all the mon...,muslims bust steal millions in govt benefit,print should pay all the back all the money pl...,bias,0
1,Re Why Did Attorney General Loretta Lynch Plea...,Why Did Attorney General Loretta Lynch Plead T...,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,bias,0
2,BREAKING Weiner Cooperating With FBI On Hillar...,Red State Fox News Sunday reported this mornin...,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,bias,0
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,pin drop speech by father of daughter kidnappe...,email kayla mueller be a prisoner and torture ...,bias,0
4,FANTASTIC! TRUMPS 7 POINT PLAN To Reform Healt...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,fantastic trump 7 point plan to reform healthc...,email healthcare reform to make america great ...,bias,0


## Create Train / Test Sets

### Title

In [4]:
# Shared hyper-parameters for the title pipeline.
MAX_NB_WORDS = 50000        # tokenizer `num_words` and the embedding layer's input size
MAX_SEQUENCE_LENGTH = 600   # `maxlen` handed to pad_sequences
EMBEDDING_DIM = 100         # width of each embedding vector

In [5]:
# Fit a word-level tokenizer on the raw titles.
# The backslash in the filter list is written as `\\` — `\]` is not a valid escape
# sequence and triggers a warning on recent Python versions; the resulting string
# is character-for-character the same as before.
# NOTE(review): unlike the Keras default filter string, this one does not contain
# tab or newline — confirm that is intentional.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['title'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 25978 unique tokens.


In [6]:
# Encode each title as a list of integer word ids, then bring every list to
# MAX_SEQUENCE_LENGTH entries with pad_sequences.
title_sequences = tokenizer.texts_to_sequences(df['title'].values)
X = pad_sequences(title_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (26227, 600)


In [7]:
# One-hot encode `valid_score`; the columns follow the sorted label values.
# The dtype is pinned to uint8 because newer pandas releases default to bool
# indicator columns, which would silently change the label array's dtype.
Y = pd.get_dummies(df['valid_score'], dtype=np.uint8).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (26227, 2)


In [8]:
# Hold out 10% of the title samples; the seed is fixed so the split is repeatable.
title_split = train_test_split(X, Y, test_size=0.10, random_state=42)
X_title_train, X_title_test, Y_title_train, Y_title_test = title_split

print(X_title_train.shape, Y_title_train.shape)
print(X_title_test.shape, Y_title_test.shape)

(23604, 600) (23604, 2)
(2623, 600) (2623, 2)


### Content

In [3]:
# Shared hyper-parameters for the content pipeline.
MAX_NB_WORDS = 50000        # tokenizer `num_words` and the embedding layer's input size
MAX_SEQUENCE_LENGTH = 600   # `maxlen` handed to pad_sequences
EMBEDDING_DIM = 100         # width of each embedding vector

In [4]:
# Fit a word-level tokenizer on the raw article text.
# The backslash in the filter list is written as `\\` — `\]` is not a valid escape
# sequence and triggers a warning on recent Python versions; the resulting string
# is character-for-character the same as before.
# NOTE(review): unlike the Keras default filter string, this one does not contain
# tab or newline — confirm that is intentional.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 180034 unique tokens.


In [5]:
# Encode each article body as a list of integer word ids, then bring every list
# to MAX_SEQUENCE_LENGTH entries with pad_sequences.
text_sequences = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (26227, 600)


In [6]:
# One-hot encode `valid_score`; the columns follow the sorted label values.
# The dtype is pinned to uint8 because newer pandas releases default to bool
# indicator columns, which would silently change the label array's dtype.
Y = pd.get_dummies(df['valid_score'], dtype=np.uint8).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (26227, 2)


In [7]:
# Split the content samples in half (train / test); the seed is fixed so the
# split is repeatable.
text_split = train_test_split(X, Y, test_size=0.50, random_state=21)
X_text_train, X_text_test, Y_text_train, Y_text_test = text_split

print(X_text_train.shape, Y_text_train.shape)
print(X_text_test.shape, Y_text_test.shape)

(13113, 600) (13113, 2)
(13114, 600) (13114, 2)


## Model Definition and Training

In [10]:
# Architecture: embedding -> spatial dropout -> single LSTM layer -> 2-way softmax.
model = Sequential()
# The sequence length comes from the padding constant rather than X.shape[1], so
# this cell no longer depends on whichever X happens to be left in the kernel.
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 600, 100)          5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 600, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 2)                 130       
Total params: 5,042,370
Trainable params: 5,042,370
Non-trainable params: 0
_________________________________________________________________
None


### Title

In [15]:
epochs = 10
batch_size = 64

# Stop training once val_loss has gone 3 epochs without improving by at least 1e-4.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

history = model.fit(
    X_title_train,
    Y_title_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_title_test, Y_title_test),
    callbacks=[early_stopping],
)

Train on 23604 samples, validate on 2623 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [16]:
# Softmax outputs of the fitted model for the held-out title sequences.
Y_title_pred = model.predict(x=X_title_test)

### Content

In [11]:
epochs = 10
batch_size = 64

# Start from freshly initialised weights. On a top-to-bottom run `model` has
# already been fitted on the title sequences, and the tokenizer has since been
# refit on the article text, so the learned embedding rows would no longer refer
# to the same words. clone_model rebuilds the same architecture with new weights;
# the clone has to be compiled again before it can be trained.
model = clone_model(model)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_text_train, Y_text_train,
                    epochs=epochs, batch_size=batch_size,
                    validation_data=(X_text_test, Y_text_test),
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on 13113 samples, validate on 13114 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [12]:
# Softmax outputs of the fitted model for the held-out content sequences.
Y_text_pred = model.predict(x=X_text_test)

### Save Score Data Frame

#### Score Functions

In [13]:
def scores_abs(row):
    """Return the absolute gap between the row's REAL and FAKE scores."""
    difference = row.REAL - row.FAKE
    return abs(difference)

def fake_weighted(row):
    """Return the FAKE score as a fraction of the row's combined REAL + FAKE score."""
    combined = row.REAL + row.FAKE
    return row.FAKE / combined

def real_weighted(row):
    """Return the REAL score as a fraction of the row's combined REAL + FAKE score."""
    combined = row.REAL + row.FAKE
    return row.REAL / combined

def assign_valid_class(row):
    """Turn a row's REAL / FAKE scores into a binary class label.

    Parameters
    ----------
    row : object exposing numeric ``REAL`` and ``FAKE`` attributes
        For example one row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int
        1 when the row is classified as REAL, 0 otherwise.
    """
    real_score = row.REAL
    fake_score = row.FAKE

    # sum of the REAL and FAKE scores (they are not probabilities)
    score_sum = real_score + fake_score

    # A zero sum cannot be normalised: with plain floats the division raised
    # ZeroDivisionError, with numpy floats it produced NaN/inf plus a warning.
    # Skip the weighted checks in that case and use the raw comparison below.
    if score_sum != 0:
        # divide each score by their sum for weighted probabilities
        weighted_real = real_score / score_sum
        weighted_fake = fake_score / score_sum

        if (weighted_real > weighted_fake) and (real_score >= 0.5) and (fake_score < 0.5):
            return 1
        if (weighted_real < weighted_fake) and (fake_score >= 0.5) and (real_score < 0.5):
            return 0

    # just default to the raw scores
    return 1 if real_score > fake_score else 0

In [20]:
# True labels next to the two model outputs for the title test set:
# column 1 of the prediction array is stored as REAL, column 0 as FAKE.
lstm_title_df = pd.DataFrame({
    'valid_score': np.argmax(Y_title_test, axis=1),
    'REAL': Y_title_pred[:, 1],
    'FAKE': Y_title_pred[:, 0],
})

# Derived columns are appended one at a time, each computed row by row.
title_score_columns = [
    ('real_weighted_score', real_weighted),
    ('fake_weighted_score', fake_weighted),
    ('score_abs', scores_abs),
    ('valid_prediction', assign_valid_class),
]
for column_name, row_function in title_score_columns:
    lstm_title_df[column_name] = lstm_title_df.apply(row_function, axis=1)

lstm_title_df.to_csv(path_or_buf="lstm_title_df.csv", header=True, index=None)

lstm_title_df

Unnamed: 0,valid_score,REAL,FAKE,real_weighted_score,fake_weighted_score,score_abs,valid_prediction
0,1,0.485007,0.514993,0.485007,0.514993,0.029986,0
1,0,0.016317,0.983683,0.016317,0.983683,0.967365,0
2,1,0.692342,0.307658,0.692342,0.307658,0.384684,1
3,1,0.006205,0.993795,0.006205,0.993795,0.987590,0
4,1,0.980387,0.019613,0.980387,0.019613,0.960775,1
...,...,...,...,...,...,...,...
2618,1,0.999973,0.000027,0.999973,0.000027,0.999946,1
2619,1,0.999781,0.000219,0.999781,0.000219,0.999562,1
2620,1,0.015301,0.984699,0.015301,0.984699,0.969399,0
2621,0,0.438745,0.561255,0.438745,0.561255,0.122511,0


In [15]:
# True labels next to the two model outputs for the content test set:
# column 1 of the prediction array is stored as REAL, column 0 as FAKE.
lstm_text_df = pd.DataFrame()
lstm_text_df['valid_score'] = np.argmax(Y_text_test, axis=1)
lstm_text_df['REAL'] = Y_text_pred[:,1]
lstm_text_df['FAKE'] = Y_text_pred[:,0]

# Derived columns, each computed row by row.
lstm_text_df['real_weighted_score'] = lstm_text_df.apply(real_weighted, axis=1)
lstm_text_df['fake_weighted_score'] = lstm_text_df.apply(fake_weighted, axis=1)
lstm_text_df['score_abs'] = lstm_text_df.apply(scores_abs, axis=1)
lstm_text_df['valid_prediction'] = lstm_text_df.apply(assign_valid_class, axis=1)

# Output folder defaults to the location the notebook was written against; set the
# NEWS_DATA_DIR environment variable to write elsewhere without editing this cell.
text_output_dir = Path(os.environ.get('NEWS_DATA_DIR', '/Users/briankalinowski/Desktop/Data'))
lstm_text_df.to_csv(path_or_buf=text_output_dir / 'lstm_text_large_df.csv', header=True, index=None)
lstm_text_df.head()

Unnamed: 0,valid_score,REAL,FAKE,real_weighted_score,fake_weighted_score,score_abs,valid_prediction
0,0,0.017029,0.982971,0.017029,0.982971,0.965942,0
1,1,0.975015,0.024985,0.975015,0.024985,0.95003,1
2,1,0.94672,0.05328,0.94672,0.05328,0.89344,1
3,0,0.000204,0.999796,0.000204,0.999796,0.999591,0
4,1,0.996636,0.003364,0.996636,0.003364,0.993272,1
