In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import data

In [2]:
data = pd.read_csv('data/dataset_3/train/train_data.csv')
data.shape

(20630, 4)

In [3]:
data.head(5)

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
data['label'].value_counts()

0    10343
1    10287
Name: label, dtype: int64

In [5]:
data['author'].value_counts().head(10)

Pam Key                243
admin                  193
Jerome Hudson          166
Charlie Spiering       141
John Hayward           140
Katherine Rodriguez    124
Warner Todd Huston     122
Ian Hanchett           119
Breitbart News         118
Daniel Nussbaum        112
Name: author, dtype: int64

### Text Preprocessing

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Remove empty (NaN) values with na in dataset
data = data.replace(np.nan, 'na', regex=True)

In [9]:
data.head(5)

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


#### Concatenate feature inputs

In [10]:
frames = [data['title'],data['author'],data['text']]
new_data = pd.Series.to_frame(pd.concat(frames))
new_data['label'] = data['label']
new_data[:1]

Unnamed: 0,0,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1


In [11]:
data['combined_input'] = data['author'].map(str) + data['title'].map(str)+data['text'].map(str)

In [12]:
# Split data into train and test datasets
# We have around 20630 entries, of which 20% will be used for testing and the rest will be used for training and validataion
# We will be using k-flod cross validation, therefore we do not need to split train data further into two parts as our k-flod CV
# will do that for us
seed = 9
np.random.seed(seed)
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = seed)

In [13]:
train_data.shape, test_data.shape

((16504, 5), (4126, 5))

In [14]:
# This will be random every time we split our data
train_data[:5]

Unnamed: 0,title,author,text,label,combined_input
11762,The French Fear Islamization but Do Nothing,Guillaume Durocher,"Posted on November 4, 2016 The French Fear Isl...",1,Guillaume DurocherThe French Fear Islamization...
2179,What We Just Witnessed Has Rarely Occurred In ...,King World News,32 King World News \nOn the heels of yesterd...,1,King World NewsWhat We Just Witnessed Has Rare...
11951,Flight Attendants Fight Human Trafficking With...,Jacey Fortin,"Donna Hubbard, a flight attendant who lives ou...",0,Jacey FortinFlight Attendants Fight Human Traf...
19925,"Rockefeller Foundation Picks Rajiv J. Shah, a ...",David Gelles,"Rajiv J. Shah, a trustee of the Rockefeller Fo...",0,David GellesRockefeller Foundation Picks Rajiv...
10052,Carl Bernstein: Hillary Scandals Not In The “S...,James Fulford,,1,James FulfordCarl Bernstein: Hillary Scandals ...


In [15]:
test_data[:5]

Unnamed: 0,title,author,text,label,combined_input
2277,Fifth Mexican Journalist Murdered in 90 Days,Ildefonso Ortiz and Brandon Darby,Suspected cartel gunmen killed another journ...,0,Ildefonso Ortiz and Brandon DarbyFifth Mexican...
8066,"Suburban Chicago School Teaches ’Blackenomics,...",Warner Todd Huston,A suburban Chicago high school is taking the O...,0,Warner Todd HustonSuburban Chicago School Teac...
16081,John Podesta’s Sister-in-Law Lobbied For Rayth...,Michael Krieger,at 11:08 am 1 Comment \nThe Podesta family see...,1,Michael KriegerJohn Podesta’s Sister-in-Law Lo...
16560,"Review: Graham, Cunningham and Taylor, All Tog...",Alastair Macaulay,"The triple bill of dances by Martha Graham, Me...",0,"Alastair MacaulayReview: Graham, Cunningham an..."
5565,"Ashton Kutcher Rescues 6,000 Sex Trafficking V...",Amando Flavio,Christopher Ashton Kutcher is a well-known fig...,1,"Amando FlavioAshton Kutcher Rescues 6,000 Sex ..."


#### Train features and targets

In [16]:
X_train, y_train = train_data.iloc[:,4].values, train_data.iloc[:,3].values

In [17]:
X_train.shape, y_train.shape

((16504,), (16504,))

In [18]:
X_train[:1]

      dtype=object)

In [19]:
y_train[:1]

array([1], dtype=int64)

#### Test features and targets

In [20]:
X_test, y_test = test_data.iloc[:,4].values, test_data.iloc[:,3].values

In [21]:
X_test.shape, y_test.shape

((4126,), (4126,))

In [22]:
X_test[:1]

array(['Ildefonso Ortiz and Brandon DarbyFifth Mexican Journalist Murdered in 90 DaysSuspected cartel gunmen killed another   journalist. This year, reporters exposing drug cartels and their ties to Mexican politicians have become targets with five murders taking place in 2017. [Mexico’s Rio Doce confirmed the murder of its founder, Javier Valdez, an   investigator and author who had been reporting on Mexico’s organized crime. Valdez was driving a red Toyota Corolla along a city street in Culiacan, Sinaloa, when unidentified gunmen shot him, Rio Doce reported. The local print weekly and online publication is one of the few news outlets that continues to carry out   investigations in Mexico exposing the deep ties between Mexican politicians and drug cartels.   Valdez’s murder comes just weeks after cartel gunmen murdered respected journalist Maximino Rodriguez Palacios in Baja California Sur as he drove with his wife to a shopping center, Breitbart Texas reported. The murder remains uns

In [23]:
y_test[:1]

array([0], dtype=int64)

In [24]:
import re
# from bs4 import BeautifulSoup

In [25]:
"""
REMOVE_STOPWORDS truncates stopwrds from the string and returns modified string
    INPUT:
        >> string
    OUTPUT:
        >> Modified string without stopwords
"""
def remove_stopwords(text):
    text = [word for word in text.split() if word not in STOPWORDS]
    text = ' '.join(text)
    return text

In [26]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
     # convert all characters in a string to lowercase
    text = text.lower()
    
    #replace Symbols with a space in string
    text = re.sub(REPLACE_BY_SPACE_RE, " ",text)
    
    # delete unwanted synbols from string
    text = re.sub(BAD_SYMBOLS_RE,"", text)
    
    # delete stopwords from text
    text = remove_stopwords(text)
    
    return text

In [27]:
X_train = [text_prepare(x) for x in X_train]

In [28]:
X_test = [text_prepare(x) for x in X_test]

In [29]:
X_train[:1]



### BOW Approach

In [30]:
# Word Frequency calculator
def word_frequency(data):
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    words = {}
    for line in data:
        tokens = tokenizer.tokenize(line)
        for token in tokens:
            if token in words:
                count = words[token] + 1
                words[token] = count
            else:
                words[token] =  1
                
    return words

In [31]:
# Dictionary of all words from train corpus with their counts.
words_counts = word_frequency(X_train)

In [32]:
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
most_common_words

[('said', 62271),
 ('mr', 52541),
 ('trump', 35559),
 ('new', 28645),
 ('one', 28260),
 ('would', 27800),
 ('people', 26458),
 ('us', 19749),
 ('like', 19598),
 ('also', 19490)]

In [33]:
DICT_SIZE = 5000
WORDS_TO_INDEX = {b[0]:a for a,b in enumerate(sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:5000])}
INDEX_TO_WORDS = {b:a for a,b in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
      # create a zero vector equaling the size of words list
    result_vector = np.zeros(dict_size)
#     tokenizer = nltk.tokenize.WhitespaceTokenizer()
#     tokens = tokenizer.tokenize(text)
    for word in set(text.split()):
        if word in words_to_index:
            result_vector[words_to_index[word]] = 1
            
    return result_vector

In [34]:
from scipy import sparse as sp_sparse

In [35]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (16504, 5000)
X_test shape  (4126, 5000)


In [36]:
row = X_train_mybag[10].toarray()[0]
non_zero_elements_count = len(np.nonzero(row)[0])

In [37]:
non_zero_elements_count

97

In [38]:
X_train_mybag.toarray()

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

### Simple ANN model

### Model 1

In [39]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Activation, Dropout, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train)

max_len = 500
X_train_ = pad_sequences(list_tokenized_train, maxlen=max_len)
y = y_train

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

batch_size = 100
epochs = 3
history = model.fit(X_train_,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 13203 samples, validate on 3301 samples
Epoch 1/3


InternalError: Blas GEMM launch failed : a.shape=(100, 32), b.shape=(32, 32), m=100, n=32, k=32
	 [[{{node bidirectional_1/while/MatMul_6}}]]
	 [[{{node metrics/acc/Mean_1}}]]

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

#### K-Fold Cross Validation

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Fixed random seed for reproducibility
seed = 7
np.random.seed(seed)

In [None]:
# define 10-fold cross validation test harness
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []

max_features = 6000
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train)

max_len = 500
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
y = y_train


In [None]:
embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    
for train, test in kfold.split(X_t, y):
#     print(len(train),len(test))
    batch_size = 100
    epochs = 3
    model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)
    # evaluate the model
    scores = model.evaluate(X_t[train], y[train], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

In [None]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

### Model 2

In [None]:
from keras.layers import MaxPool1D

In [None]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
max_features = 50000
maxlen = 500

X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
y = y_train

model_2 = Sequential()
model_2.add(Embedding(max_features, 8, input_length=maxlen))
model_2.add(MaxPool1D(pool_size=7, strides=None, padding='valid', data_format='channels_last'))
# model_2.add(MaxPool1D(pool_size=7, strides=None, padding='valid', data_format='channels_last'))
model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
# model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
model_2.add(Flatten())

model_2.add(Dense(32, activation='relu'))
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model_2.summary()

batch_size_2 = 100
epochs_2 = 10
history_2 = model_2.fit(X_t, y, epochs=epochs_2, batch_size=batch_size_2, validation_split=0.2)

In [None]:
acc = history_2.history['acc']
val_acc = history_2.history['val_acc']
loss = history_2.history['loss']
val_loss = history_2.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

#### K-Fold Cross Validation

In [None]:
# Fixed random seed for reproducibility
seed = 7
np.random.seed(seed)

In [None]:
# define 10-fold cross validation test harness
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []

tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
max_features = 50000
maxlen = 500

X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
y = y_train


In [None]:
model_2 = Sequential()
model_2.add(Embedding(max_features, 8, input_length=maxlen))
model_2.add(MaxPool1D(pool_size=7, strides=None, padding='valid', data_format='channels_last'))
# model_2.add(MaxPool1D(pool_size=7, strides=None, padding='valid', data_format='channels_last'))
model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
# model_2.add(MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'))
model_2.add(Flatten())

model_2.add(Dense(32, activation='relu'))
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model_2.summary()

for train, test in kfold.split(X_t, y):
    batch_size_2 = 100
    epochs_2 = 10
    model_2.fit(X_t, y, epochs=epochs_2, batch_size=batch_size_2, validation_split=0.2)
    # evaluate the model
    scores = model_2.evaluate(X_t[train], y[train], verbose=0)
    print("%s: %.2f%%" % (model_2.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

In [None]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
# tokenizer = Tokenizer(num_words = max_features)
# tokenizer.fit_on_texts(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_train)

X_test = pad_sequences(list_tokenized_test, maxlen=max_len)
model_2_pred_cv = model_2.predict(X_test)

In [None]:
model_2_pred_cv

In [None]:
y_test.shape, X_test.shape

In [None]:
score, acc = model_2.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)