In [157]:
import numpy as np
import pandas as pd
import re
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import chi2

import keras
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Activation, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.utils import pad_sequences

from gensim.models import Word2Vec

import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer() # wnl.lemmatize(words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\haven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\haven\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [158]:
data = pd.read_csv("processed_data.csv") # shld be preprocessed data
data.head()

Unnamed: 0,title,text,class,text_without_stopwords,title_without_stopwords,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,...,polarity,overall_content,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability,polarity_category,polarity_category_Neutral,polarity_category_Positive
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,donald trump wish americans happy new year lea...,donald trump sends out embarrassing new year’s...,516,13,28,1,4.80404,...,0.082132,donald trump sends out embarrassing new year’s...,0.002194,0.747636,0.001007,0.15766,0.091503,Positive,0,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,house intelligence committee chairman devin nu...,drunk bragging trump staffer started russian c...,309,9,11,1,5.213115,...,-0.005004,drunk bragging trump staffer started russian c...,0.064904,0.244962,0.557051,0.00232,0.130763,Neutral,1,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,on friday revealed former milwaukee sheriff da...,sheriff david clarke becomes an internet joke ...,600,16,25,1,5.168966,...,-0.012345,sheriff david clarke becomes an internet joke ...,0.002488,0.433611,0.28146,0.001917,0.280524,Neutral,1,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1,on christmas day donald trump announced would ...,trump is so obsessed he even has obama’s name ...,475,15,15,1,5.18018,...,-0.023118,trump is so obsessed he even has obama’s name ...,0.002963,0.788261,0.204377,0.00229,0.002109,Neutral,1,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,pope francis used annual christmas day message...,pope francis just called out donald trump duri...,434,12,19,1,4.554762,...,-0.011722,pope francis just called out donald trump duri...,0.292172,0.327938,0.001138,0.020911,0.357842,Neutral,1,0


In [159]:
# Observe which data types to use
dt = pd.DataFrame(data.dtypes).reset_index()
data_numerics = dt[dt[0].isin([np.dtype('int64'), np.dtype('float64')])]
data_text = dt[dt[0].isin([np.dtype('object')])]

In [160]:
dt

Unnamed: 0,index,0
0,title,object
1,text,object
2,class,int64
3,text_without_stopwords,object
4,title_without_stopwords,object
5,text_word_count,int64
6,title_word_count,int64
7,text_sentence_count,int64
8,title_sentence_count,int64
9,text_average_word_length,float64


In [161]:
data_numerics

Unnamed: 0,index,0
2,class,int64
5,text_word_count,int64
6,title_word_count,int64
7,text_sentence_count,int64
8,title_sentence_count,int64
9,text_average_word_length,float64
10,title_average_word_length,float64
11,text_punctuation_count,int64
12,title_punctuation_count,int64
13,text_stopwords_count,int64


In [162]:
data_text

Unnamed: 0,index,0
0,title,object
1,text,object
3,text_without_stopwords,object
4,title_without_stopwords,object
19,overall_content,object
25,polarity_category,object


### Logistic Regression

In [163]:
lr = LogisticRegression(random_state = 0)
scaler = MinMaxScaler()

In [164]:
# X, y split of training data
X_train, X_test, y_train, y_test = train_test_split(data[list(data_numerics['index'])[1:]], data['class'], test_size = 0.2, random_state = 0)

In [165]:
scores, pvalues = chi2(abs(X_train), y_train)

In [166]:
com_dic = {'X2':X_train.columns, 'pvalues':pvalues}
result = pd.DataFrame(com_dic).sort_values(['pvalues'])
result[result['pvalues'] > 0]

Unnamed: 0,X2,pvalues
11,flesch_readability,4.701484e-267
17,Topic 4 Probability,4.1245910000000003e-150
7,title_punctuation_count,2.891466e-88
18,Topic 5 Probability,7.830582e-53
2,text_sentence_count,5.6659910000000003e-39
12,subjectivity,4.585605e-35
3,title_sentence_count,1.253078e-26
5,title_average_word_length,2.496743e-12
19,polarity_category_Neutral,2.070811e-11
16,Topic 3 Probbility,2.60293e-07


In [167]:
# fit the model
X_train = scaler.fit_transform(X_train)
lr_fit = lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred)

results = pd.DataFrame({"Model": 'Logistic Regression',
                        "Accuracy (%)": [accuracy], 
                        "Precision (%)": [precision], 
                        "Recall (%)": [recall], 
                        "F1 Score": [f1]})
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy (%),Precision (%),Recall (%),F1 Score
0,Logistic Regression,44.749095,44.749095,100.0,0.618299


In [168]:
print(lr.coef_, lr.intercept_)

[[ 3.50681320e+00  3.02285916e+01 -9.92060388e+00  5.01433581e+00
   2.43653672e+00  1.20290388e+00  1.03369538e+00 -2.90890911e+00
   7.82192573e+00 -2.07183385e+01  1.30133690e+00 -3.67836225e+00
   2.43170470e+00  2.60102591e-02 -3.13620495e+00  2.47826891e+00
   5.33627962e-01 -1.42147622e+00  1.35488863e+00 -3.91578783e-02
  -8.45008722e-02]] [-3.87769963]


### CNN

In [169]:
"""
Split the data into real and fake, apply word2vec each, apply class and combine
"""
fake = data[data['class'] == 1]
real = data[data['class'] == 0]

# Linguistic Processing
fake['title_without_stopwords']  = fake['title_without_stopwords'].apply(lambda t: re.sub('[^a-z0-9]', ' ', t))
fake['title_without_stopwords'] = fake['title_without_stopwords'].apply(lambda t: wnl.lemmatize(t))

real['title_without_stopwords']  = real['title_without_stopwords'].apply(lambda t: re.sub('[^a-z0-9]', ' ', t))
real['title_without_stopwords'] = real['title_without_stopwords'].apply(lambda t: wnl.lemmatize(t))

# Tokenize
fake['title_tokenized'] = data['title_without_stopwords'].apply(lambda t: word_tokenize(t))
real['title_tokenized'] = real['title_without_stopwords'].apply(lambda t: word_tokenize(t))

# Embedding
fake_model = Word2Vec(fake['title_tokenized'], min_count=1)
real_model = Word2Vec(real['title_tokenized'], min_count=1)

# Get Norm Vectors
fake_vect = fake_model.wv.get_normed_vectors()
real_vect = real_model.wv.get_normed_vectors()

# Add class
fake_vect = [np.concatenate((vec,[1])) for vec in fake_vect]
real_vect = [np.concatenate((vec,[0])) for vec in real_vect]

fake_vect.extend(real_vect)
random.shuffle(fake_vect)
vect_pd = pd.DataFrame(fake_vect)
vect_pd.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake['title_without_stopwords']  = fake['title_without_stopwords'].apply(lambda t: re.sub('[^a-z0-9]', ' ', t))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake['title_without_stopwords'] = fake['title_without_stopwords'].apply(lambda t: wnl.lemmatize(t))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.153384,0.157413,-0.022656,0.037219,0.027063,-0.177757,0.01869,0.2263,-0.20162,-0.133207,...,-0.034678,0.022188,0.109311,0.187424,0.105319,0.045469,-0.129377,0.03945,0.047787,0.0
1,-0.006082,0.096959,-0.186203,0.054901,-0.008691,-0.046646,0.127368,0.274909,-0.131535,0.019971,...,0.173057,0.116074,0.161901,0.219141,0.058383,-0.006335,-0.195945,-0.143082,-0.056557,1.0
2,0.132909,0.102688,0.04357,0.130463,0.09526,-0.089076,0.075465,0.194728,-0.070718,-0.136775,...,0.036136,0.120661,0.08922,0.192577,0.068442,0.046079,-0.07068,-0.053058,-0.053955,1.0
3,-0.076484,0.124307,-0.006773,0.06344,-0.00771,-0.183439,0.04577,0.258358,-0.118497,-0.114814,...,-0.107686,0.048167,0.083651,0.132422,0.058325,0.053401,-0.065312,-0.002653,0.005582,0.0
4,-0.115217,0.121326,-0.059588,0.053643,0.055595,-0.176988,0.054526,0.261345,-0.192073,-0.129673,...,-0.082523,0.018045,0.096476,0.170662,0.063639,0.04513,-0.09145,0.072571,0.042612,0.0


In [170]:
# Our dictionary will contain only of the top 7000 words appearing most frequently
top_words = 7000

# Now we split our data-set into training and test data
X_train, X_test, y_train, y_test = train_test_split(vect_pd.iloc[:, :-1], vect_pd.iloc[:, -1], test_size = 0.2, random_state = 0)

# Looking at the nature of training data
print('Shape of training data: ')
print(X_train.shape)
print(y_train.shape)

print('Shape of test data: ')
print(X_test.shape)
print(y_test.shape)

# Padding the data samples to a maximum review length in words
max_words = 450

# Remove stopwords, punctuation, lemmatize, before Word2Vec

#X_train = pad_sequences(X_train, maxlen=max_words)
#X_test = pad_sequences(X_test, maxlen=max_words)

# Building the CNN Model
model = Sequential()      # initilaizing the Sequential nature for CNN model

# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
model.add(Embedding(top_words, 32, input_length=100))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build()
model.summary()

# Fitting the data onto model
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=2)

# Getting score metrics from our model
scores = model.evaluate(X_test, y_test, verbose=0)

# Displays the accuracy of correct sentiment prediction over test data
print("Accuracy: %.2f%%" % (scores[1]*100))

Shape of training data: 
(25918, 100)
(25918,)
Shape of test data: 
(6480, 100)
(6480,)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 32)           224000    
                                                                 
 conv1d_2 (Conv1D)           (None, 100, 32)           3104      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 50, 32)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 1600)              0         
                                                                 
 dense_4 (Dense)             (None, 250)               400250    
                                                                 
 dense_5 (Dense)             (No