In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Fake_news.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [7]:
df.isna().sum()

title    558
text      39
label      0
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [10]:
df.label.value_counts()

label
1    36509
0    35028
Name: count, dtype: int64

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [12]:
def remove_numeric(text):
  pattern = re.compile(r'[^a-zA-Z\s]')
  cleaned = pattern.sub('', text)
  return cleaned

In [13]:
df['title'] = df['title'].apply(remove_numeric)

In [14]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,UNBELIEVABLE OBAMAS ATTORNEY GENERAL SAYS MOST...,"Now, most of the demonstrators gathered last ...",1
3,Bobby Jindal raised Hindu uses story of Christ...,A dozen politically active pastors came here f...,0
4,SATAN Russia unvelis an image of its terrifyi...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,About Time Christian Group Sues Amazon and SPL...,All we can say on this one is it s about time ...,1


In [15]:
df.drop('text', axis = 1, inplace = True)

In [16]:
df.head()

Unnamed: 0,title,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
2,UNBELIEVABLE OBAMAS ATTORNEY GENERAL SAYS MOST...,1
3,Bobby Jindal raised Hindu uses story of Christ...,0
4,SATAN Russia unvelis an image of its terrifyi...,1
5,About Time Christian Group Sues Amazon and SPL...,1


In [17]:
df['title'] = df['title'].str.lower()

In [18]:
df.head()

Unnamed: 0,title,label
0,law enforcement on high alert following threat...,1
2,unbelievable obamas attorney general says most...,1
3,bobby jindal raised hindu uses story of christ...,0
4,satan russia unvelis an image of its terrifyi...,1
5,about time christian group sues amazon and spl...,1


In [19]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [20]:
corpus = df['title']
stop_words = set(stopwords.words('english'))

In [21]:
def nlp(text):
    words = word_tokenize(text)
    review = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    review = ' '.join(review)
    return review

In [22]:
df['review'] = df['title'].apply(nlp)

In [23]:
df.head()

Unnamed: 0,title,label,review
0,law enforcement on high alert following threat...,1,law enforcement high alert following threat co...
2,unbelievable obamas attorney general says most...,1,unbelievable obamas attorney general say charl...
3,bobby jindal raised hindu uses story of christ...,0,bobby jindal raised hindu us story christian c...
4,satan russia unvelis an image of its terrifyi...,1,satan russia unvelis image terrifying new supe...
5,about time christian group sues amazon and spl...,1,time christian group sue amazon splc designati...


In [30]:
corpus = df['review']

In [31]:
corpus

0        law enforcement high alert following threat co...
2        unbelievable obamas attorney general say charl...
3        bobby jindal raised hindu us story christian c...
4        satan russia unvelis image terrifying new supe...
5        time christian group sue amazon splc designati...
                               ...                        
72129    russian steal research trump hack u democratic...
72130    watch giuliani demand democrat apologize trump...
72131      migrant refuse leave train refugee camp hungary
72132    trump tussle give unpopular mexican leader muc...
72133     goldman sachs endorses hillary clinton president
Name: review, Length: 71537, dtype: object

In [45]:
import tensorflow
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

In [28]:
vocab_size = 5000

In [32]:
onehot = [one_hot(word, vocab_size) for word in corpus]

In [36]:
onehot[45]

[1675, 2507, 2888, 166, 3979, 3896, 2061, 1175]

In [37]:
sent_length = 20

In [39]:
embedded_doc = pad_sequences(onehot, padding = 'pre', maxlen = sent_length)

In [40]:
embedded_doc

array([[   0,    0,    0, ..., 2524, 3846, 1305],
       [   0,    0,    0, ...,  874, 2261, 1305],
       [   0,    0,    0, ..., 4341, 3755, 1431],
       ...,
       [   0,    0,    0, ..., 4550,   84, 3040],
       [   0,    0,    0, ...,   70,  128,  982],
       [   0,    0,    0, ..., 2349, 3964, 2556]])

In [66]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [68]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [48]:
model.summary()

In [50]:
len(embedded_doc)

71537

In [51]:
y = df.label

In [54]:
import numpy as np

In [58]:
X_final = np.array(embedded_doc)

In [59]:
y_final = np.array(y)

In [60]:
X_final.shape

(71537, 20)

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.2, random_state = 2022)

In [65]:
X_train.shape

(57229, 20)

In [69]:
model.fit(X_train, y_train, epochs = 5)

Epoch 1/5
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 23ms/step - accuracy: 0.8145 - loss: 0.3893
Epoch 2/5
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 24ms/step - accuracy: 0.9135 - loss: 0.2177
Epoch 3/5
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9323 - loss: 0.1727
Epoch 4/5
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 23ms/step - accuracy: 0.9457 - loss: 0.1413
Epoch 5/5
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 26ms/step - accuracy: 0.9580 - loss: 0.1080


<keras.src.callbacks.history.History at 0x271074fb950>

In [70]:
from sklearn.metrics import classification_report

In [81]:
y_pred_prob = model.predict(X_test)

[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step


In [78]:
y_pred_1 = model.predict(X_train)

[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step


In [76]:
y_test.min()

0

In [80]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [82]:
threshold = 0.5
y_pred = (y_pred_prob >= threshold).astype(int)

# Now you can use the classification metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Accuracy: 0.888663684651943
Precision: 0.881170991350632
Recall: 0.904397705544933


In [84]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      6986
           1       0.88      0.90      0.89      7322

    accuracy                           0.89     14308
   macro avg       0.89      0.89      0.89     14308
weighted avg       0.89      0.89      0.89     14308



In [87]:
from keras.models import load_model

# Save model
model.save("Fake_news.h5")

