In [5]:
import os 

os.environ['KAGGLE_CONFIG_DIR'] = "/content" 
!chmod 600 /content/kaggle.json 
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 81% 33.0M/41.0M [00:00<00:00, 44.0MB/s]
100% 41.0M/41.0M [00:00<00:00, 45.0MB/s]


In [6]:
!unzip /content/fake-and-real-news-dataset.zip

Archive:  /content/fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [7]:
import pandas as pd
import numpy as np

In [8]:
df1 = pd.read_csv("/content/Fake.csv")
df2 = pd.read_csv("/content/True.csv")


In [9]:
df1['label'] = 1
df2['label'] =0

In [10]:
data = pd.merge(df1,df2, on=['label','title'],how ="outer")

In [11]:
data.drop(columns=['text_x','subject_x','date_x','text_y','subject_y','date_y'], inplace=True)

In [12]:
data.head()

Unnamed: 0,title,label
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1


In [13]:
data.isna().sum()

title    0
label    0
dtype: int64

In [14]:
data.duplicated().sum()

6169

In [15]:
len(data)

44898

In [16]:
data.drop_duplicates(inplace=True,keep='first')

In [17]:
len(data)

38729

In [18]:
import re 

import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 

from nltk.corpus import stopwords 

from nltk.stem import PorterStemmer 


def preprocess_text(text): 

    # Lowercasing 

    text = text.lower() 

    # Removing HTML tags 

    text = re.sub(r'<.*?>', '', text) 

    # Removing special characters and punctuations 

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) 

    # Removing numbers 

    text = re.sub(r'\d+', '', text) 
     

    # Tokenization 

    tokens = word_tokenize(text) 


    # Stopword removal 

    stop_words = set(stopwords.words("english")) 

    filtered_tokens = [token for token in tokens if token not in stop_words] 

    # Stemming 

    stemmer = PorterStemmer() 

    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens] 

    # Final preprocessed text 

    preprocessed_text = " ".join(stemmed_tokens) 

     

    return preprocessed_text 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
data.iloc[:].title = data['title'].apply(preprocess_text)

In [20]:
data.head()

Unnamed: 0,title,label
0,donald trump send embarrass new year eve messa...,1
1,drunk brag trump staffer start russian collus ...,1
2,sheriff david clark becom internet joke threat...,1
3,trump obsess even obama name code websit imag,1
4,pope franci call donald trump christma speech,1


In [21]:
data.label.value_counts()

0    20826
1    17903
Name: label, dtype: int64

In [22]:
x= data.drop('label', axis =1)
y = data['label']

In [23]:
x.shape

(38729, 1)

In [24]:
y.shape

(38729,)

In [25]:
import tensorflow as tf
tf.__version__

'2.9.2'

In [26]:
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential

In [27]:
voc_size =5000

## One hot representation


In [28]:
message = x.title.values.tolist()

In [29]:
message[0]

'donald trump send embarrass new year eve messag disturb'

In [30]:
one_hot_rep = [one_hot(text,voc_size) for text in message]

In [31]:
len(one_hot_rep[0])

9

In [32]:
# Embedding Representation

sent_len =40

embedded_doc = pad_sequences(one_hot_rep,padding ='pre',maxlen =sent_len)
print(embedded_doc)

[[   0    0    0 ... 3872 3878 1956]
 [   0    0    0 ... 2491 2482 1414]
 [   0    0    0 ... 4645   58 3543]
 ...
 [   0    0    0 ...  872 1822 2445]
 [   0    0    0 ... 2217 4918 4804]
 [   0    0    0 ...  576 2491 3341]]


In [33]:
len(embedded_doc[0])

200

In [34]:
#creating model
embedding_vector_features =40
model =Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length =sent_len))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss ='binary_crossentropy',optimizer ='adam',metrics =['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 80)           400000    
                                                                 
 dropout (Dropout)           (None, 200, 80)           0         
                                                                 
 lstm (LSTM)                 (None, 100)               72400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 472,501
Trainable params: 472,501
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
x_final =np.array(embedded_doc)
y_final =np.array(y)

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x_final,y_final, test_size =0.33, random_state = 34)

In [36]:
#Model Training

model.fit(x_train,y_train,validation_data =(x_test,y_test),epochs =3, batch_size =64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbce4ba0af0>

In [40]:
#Performance Metrics
threshold =0.6
ypred = model.predict(x_test)

ypred =np.where(ypred > threshold, 1,0)



from sklearn.metrics import confusion_matrix,classification_report

confusion_matrix(y_test,ypred)



array([[6613,  270],
       [ 576, 5322]])

In [42]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6883
           1       0.95      0.90      0.93      5898

    accuracy                           0.93     12781
   macro avg       0.94      0.93      0.93     12781
weighted avg       0.93      0.93      0.93     12781

