# **Fake News Detection**

In [1]:
# Import Data
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/fake-news-detection/data.h5
/kaggle/input/fake-news-detection/data.csv


## **Data Exploration**

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('/kaggle/input/fake-news-detection/data.csv')

In [4]:
data.head(10)

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
5,http://beforeitsnews.com/sports/2017/09/jetnat...,JetNation FanDuel League; Week 4,JetNation FanDuel League; Week 4\n% of readers...,0
6,https://www.nytimes.com/2017/10/10/us/politics...,Kansas Tried a Tax Plan Similar to Trump’s. It...,"In 2012, Kansas lawmakers, led by Gov. Sam Bro...",1
7,https://www.reuters.com/article/us-india-cenba...,"India RBI chief: growth important, but not at ...",The Reserve Bank of India (RBI) Governor Urjit...,1
8,https://www.reuters.com/article/us-climatechan...,EPA chief to sign rule on Clean Power Plan exi...,"Scott Pruitt, Administrator of the U.S. Enviro...",1
9,https://www.reuters.com/article/us-air-berlin-...,Talks on sale of Air Berlin planes to easyJet ...,FILE PHOTO - An Air Berlin sign is seen at an ...,1


In [5]:
data.tail()

Unnamed: 0,URLs,Headline,Body,Label
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4006,https://www.activistpost.com/2017/09/ron-paul-...,"Ron Paul on Trump, Anarchism & the AltRight",,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1
4008,http://beforeitsnews.com/u-s-politics/2017/10/...,Vice President Mike Pence Leaves NFL Game Beca...,Vice President Mike Pence Leaves NFL Game Beca...,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB


In [7]:
data.shape

(4009, 4)

In [8]:
data.isnull().sum() 

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

## **Data Cleaning**

In [9]:
data= data.dropna() # remove null values
data.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

In [10]:
data.shape

(3988, 4)

In [11]:
# combine headline and body in text variable
data.loc[:, 'text'] = data['Headline'].fillna('') + ' ' + data['Body'].fillna('')

In [12]:
data['text']

0       Four ways Bob Corker skewered Donald Trump Ima...
1       Linklater's war veteran comedy speaks to moder...
2       Trump’s Fight With Corker Jeopardizes His Legi...
3       Egypt's Cheiron wins tie-up with Pemex for Mex...
4       Jason Aldean opens 'SNL' with Vegas tribute Co...
                              ...                        
4003    CNN and Globalist Exposed - Steve Quayle and A...
4004    Trends to Watch Trends to Watch\n% of readers ...
4005    Trump Jr. Is Soon To Give A 30-Minute Speech F...
4007    China to accept overseas trial data in bid to ...
4008    Vice President Mike Pence Leaves NFL Game Beca...
Name: text, Length: 3988, dtype: object

In [13]:
data.dropna()
data.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
text        0
dtype: int64

In [14]:
data.shape

(3988, 5)

In [15]:
# remove uncessary columns
data = data.drop(columns=['Headline', 'Body', 'URLs'])
data.head()

Unnamed: 0,Label,text
0,1,Four ways Bob Corker skewered Donald Trump Ima...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [16]:
X = data['text']
y = data['Label']

In [17]:
# Train Test Split
from sklearn.model_selection import train_test_split

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [19]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(3190,)
(798,)
(3190,)
(798,)


## **Text Preprocessing**

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [21]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [22]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet') 

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# Initialize stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [24]:
def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    words = [stemmer.stem(word) for word in words]  # Stemming
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return " ".join(words)

In [25]:
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
vectorizer = TfidfVectorizer()
X_train_V = vectorizer.fit_transform(X_train)
X_test_V = vectorizer.transform(X_test)

## **Model Training**

### **Naive Bayes**

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [29]:
nb_model = MultinomialNB()
nb_model = nb_model.fit(X_train_V, y_train)

In [30]:
nb_prediction = nb_model.predict(X_test_V)

In [31]:
nb_accuracy = accuracy_score(y_test, nb_prediction)
print(f'Naïve Bayes Accuracy: {nb_accuracy:.4f}')

Naïve Bayes Accuracy: 0.9148


### **Random Forest**

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model = rf_model.fit(X_train_V, y_train)

In [34]:
rf_prediction = rf_model.predict(X_test_V)

In [35]:
rf_accuracy = accuracy_score(y_test, rf_prediction)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

Random Forest Accuracy: 0.9674


### **LSTM**

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [37]:
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [38]:
max_length = max(len(seq) for seq in X_train_seq)

In [39]:
X_train_pad = pad_sequences(X_train_seq,maxlen= max_length, padding= 'post')
X_test_pad = pad_sequences(X_test_seq, maxlen= max_length, padding= 'post')

In [40]:
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

In [41]:
lstm_model = Sequential([
    Embedding(input_dim=5000, output_dim= 128),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1,activation='sigmoid')
])

In [42]:
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train_np, epochs=5, batch_size=128, validation_data=(X_test_pad,y_test_np))

Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 2s/step - accuracy: 0.5222 - loss: 0.6928 - val_accuracy: 0.4361 - val_loss: 0.6946
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2s/step - accuracy: 0.4890 - loss: 0.6937 - val_accuracy: 0.5639 - val_loss: 0.6890
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2s/step - accuracy: 0.5160 - loss: 0.6928 - val_accuracy: 0.5639 - val_loss: 0.6884
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2s/step - accuracy: 0.5371 - loss: 0.6912 - val_accuracy: 0.5639 - val_loss: 0.6893
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2s/step - accuracy: 0.5282 - loss: 0.6921 - val_accuracy: 0.5639 - val_loss: 0.6893


<keras.src.callbacks.history.History at 0x7e2b8d535d20>

In [43]:
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test_np, verbose=0)
print(f'LSTM Accuracy: {lstm_accuracy:.4f}')

LSTM Accuracy: 0.5639


### **Accuracy**

In [44]:
def sort_model_accuracies(model_accuracies):
    sorted_accuracies = sorted(model_accuracies.items(), key=lambda x: x[1], reverse=True)
    print("\n Model Accuracies:")
    for model, accuracy in sorted_accuracies:
        print(f"{model}: {accuracy:.4f}")

model_accuracies = {
    'Naïve Bayes': nb_accuracy,
    'Random Forest': rf_accuracy,
    'LSTM': lstm_accuracy
}

sort_model_accuracies(model_accuracies)


 Model Accuracies:
Random Forest: 0.9674
Naïve Bayes: 0.9148
LSTM: 0.5639


In [45]:
# save random forest model
import pickle
import joblib

In [46]:
joblib.dump(rf_model, 'rf_model.pkl')

['rf_model.pkl']

In [47]:
# save vectorize file
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']