In [2]:
import pandas as pd
import numpy as np
df=pd.read_csv("news.csv")

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
cls_dist=df['label'].value_counts()
cls_dist

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [7]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

max_words = 1000  # Max vocabulary size
max_len = 10      # Max sequence length
embedding_dim = 50  # Dimension of the embedding layer
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.25, random_state=42)


In [8]:
# Tokenize the texts
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Pad the sequences to the same length
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')
print("Padded Training Data:\n", X_train_padded)


Padded Training Data:
 [[ 68   1  81 ...   1   5 193]
 [ 34 906   8 ...   1 870   1]
 [ 77   2  54 ... 211   1   1]
 ...
 [129  35 200 ...   1  24   1]
 [  1   2   1 ...   1   1   1]
 [  1   5   1 ...   5   1   1]]


In [9]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(64, return_sequences=False))  # You can increase the number of units if needed
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None,max_len))




In [10]:
model.summary()

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)


In [12]:
# Train the model
epochs = 10
history = model.fit(X_train_padded,y_train_encoded, epochs=epochs, validation_data=(X_test_padded,y_test_encoded), verbose=2)


Epoch 1/10
149/149 - 7s - 44ms/step - accuracy: 0.6695 - loss: 0.5929 - val_accuracy: 0.7412 - val_loss: 0.5071
Epoch 2/10
149/149 - 1s - 10ms/step - accuracy: 0.7800 - loss: 0.4541 - val_accuracy: 0.7424 - val_loss: 0.4972
Epoch 3/10
149/149 - 1s - 10ms/step - accuracy: 0.8085 - loss: 0.4011 - val_accuracy: 0.7393 - val_loss: 0.5106
Epoch 4/10
149/149 - 2s - 11ms/step - accuracy: 0.8375 - loss: 0.3640 - val_accuracy: 0.7399 - val_loss: 0.5446
Epoch 5/10
149/149 - 1s - 10ms/step - accuracy: 0.8501 - loss: 0.3385 - val_accuracy: 0.7355 - val_loss: 0.5749
Epoch 6/10
149/149 - 2s - 10ms/step - accuracy: 0.8649 - loss: 0.3122 - val_accuracy: 0.7361 - val_loss: 0.6043
Epoch 7/10
149/149 - 2s - 12ms/step - accuracy: 0.8733 - loss: 0.2907 - val_accuracy: 0.7418 - val_loss: 0.6409
Epoch 8/10
149/149 - 1s - 10ms/step - accuracy: 0.8880 - loss: 0.2682 - val_accuracy: 0.7285 - val_loss: 0.6616
Epoch 9/10
149/149 - 2s - 11ms/step - accuracy: 0.9002 - loss: 0.2495 - val_accuracy: 0.7317 - val_loss:

In [13]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded,y_test_encoded, verbose=2)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


50/50 - 0s - 6ms/step - accuracy: 0.7279 - loss: 0.7969
Test Accuracy: 72.79%


In [14]:
model.save('text_classification.h5')

