# FAKE NEWS Identification (kaggle)

Develop a machine learning program to identify when an article might be fake news.

train.csv: A full training dataset with the following attributes:

* id: unique id for a news article
* title: the title of a news article
* author: author of the news article
* text: the text of the article; could be incomplete
* label: a label that marks the article as potentially unreliable
  * 1: unreliable
  * 0: reliable

test.csv: A testing training dataset with all the same attributes at train.csv without the label.

In [50]:
import pandas as pd 

df = pd.read_csv('/content/sample_data/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [51]:
df.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

In [52]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [53]:
df.shape

(20800, 5)

In [54]:
# dropping null values 
df = df.dropna()

In [55]:
# independent and dependent features 
x = df.drop('label', axis=1)
y = df['label']

x.shape, y.shape

((18285, 4), (18285,))

## one hot representation 

In [56]:
import tensorflow as tf 
from tensorflow.keras.layers import Embedding 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
# pad_sequences is used for keeping the input_length fixed either at pre or post side 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

voc_size = 5000 # vocabulary size

In [57]:
messages = x.copy()

In [58]:
messages['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [59]:
messages.reset_index(inplace=True) 
# resetting coz earlier dropped nan values 

In [60]:
import nltk 
import re
from nltk.corpus import stopwords

In [61]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
# dataset preprocessing 

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

corpus = []

for i in range(0, len(messages)):
  print(i)
  review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
  # substituting every characters with blank spaces on title feature
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440

In [63]:
corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

In [64]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
onehot_repr

[[865, 3005, 3433, 428, 2997, 1789, 346, 1125, 1893, 4150],
 [389, 4894, 178, 4204, 4155, 1761, 2908],
 [2711, 972, 3884, 1756],
 [2610, 1968, 479, 56, 2380, 4519],
 [395, 4155, 3414, 3636, 3266, 209, 4155, 1634, 4859, 3226],
 [2528,
  2676,
  331,
  4889,
  2859,
  242,
  1626,
  4690,
  120,
  1792,
  2085,
  3914,
  4973,
  4615,
  2908],
 [3005, 4390, 4214, 1431, 3230, 64, 212, 1585, 3829, 2608, 2691],
 [3976, 2587, 381, 3184, 1250, 247, 242, 4175, 3829, 2608, 2691],
 [1853, 2592, 4178, 4584, 91, 3040, 1433, 980, 242, 3642],
 [526, 629, 1412, 2456, 2298, 3087, 703, 3268],
 [700, 1415, 404, 3597, 4466, 1927, 4803, 4408, 541, 3229, 3020],
 [56, 3996, 2997, 3040, 242, 1250],
 [2530, 1261, 324, 3534, 1149, 2076, 2183, 3908, 4745],
 [4862, 2852, 2390, 629, 392, 1321, 539, 3829, 2608, 2691],
 [3989, 2960, 1625, 1211, 3537, 3829, 2608, 2691],
 [2820, 2469, 862, 4299, 2986, 4539, 1607, 3796, 747, 1550],
 [309, 61, 4894],
 [4843, 1682, 2303, 3636, 242, 985, 3770, 2908],
 [314, 1583, 178, 13

## Embedding representation

In [65]:
sent_length = 20

embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
embedded_docs

array([[   0,    0,    0, ..., 1125, 1893, 4150],
       [   0,    0,    0, ..., 4155, 1761, 2908],
       [   0,    0,    0, ...,  972, 3884, 1756],
       ...,
       [   0,    0,    0, ..., 3829, 2608, 2691],
       [   0,    0,    0, ...,  948, 3326,  263],
       [   0,    0,    0, ..., 4158, 4396, 2828]], dtype=int32)

In [66]:
embedded_docs[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  389, 4894,  178, 4204, 4155, 1761, 2908], dtype=int32)

## Building LSTM

In [67]:
embedding_vector_features = 40

model = Sequential()

model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))

model.add(LSTM(100)) # one layer with 100 neurons 

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [68]:
len(embedded_docs)

18285

In [69]:
type(embedded_docs)

numpy.ndarray

In [70]:
import numpy as np 

x = np.array(embedded_docs)
y = np.array(y)

In [71]:
type(x), type(y)

(numpy.ndarray, numpy.ndarray)

In [72]:
x.shape, y.shape

((18285, 20), (18285,))

In [73]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=4)

In [74]:
# training the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f6c231de590>

In [75]:
y_pred = np.argmax(model.predict(x_test), axis=-1)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [76]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[3165,    0],
       [2321,    0]])

In [77]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5769230769230769

In [78]:
# adding dropout 

from tensorflow.keras.layers import Dropout

embedding_vector_features = 40 

model = Sequential()

model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))

model.add(Dropout(0.3))

model.add(LSTM(100))

model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f6c22291750>

In [79]:
y_pred = np.argmax(model.predict(x_test), axis=-1)

confusion_matrix(y_test, y_pred)

array([[3165,    0],
       [2321,    0]])

In [80]:
accuracy_score(y_test, y_pred)

0.5769230769230769

In [83]:
# Making Predictions on test data
test = pd.read_csv('/content/sample_data/test.csv')
predictions_test = pd.DataFrame(model.predict_classes(x_test))
test_id = pd.DataFrame(test["id"])
submission = pd.concat([test_id,predictions_test],axis=1)
submission.columns = ["id","label"]
submission.to_csv("Submission.csv",index=False)

