In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/



In [None]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/final-fakenews-dataset/final.csv")

In [None]:
# Preview the first rows.
df.head()


In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Column labels.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Show the rows where both `title` and `text` are missing.
df[df['title'].isnull() & (df['text'].isnull())]

In [None]:
# Drop every row that has no title (rows with a title but no text are kept).
df=df.dropna(subset=['title'])

In [None]:
# Re-check the per-column missing-value counts after the drop.
df.isnull().sum()

In [None]:
# Shape after removing rows without a title.
df.shape

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the duplicate rows, keeping the first occurrence (pandas default).
df=df.drop_duplicates()

In [None]:
# Shape after de-duplication.
df.shape

In [None]:
# Number of rows per value of the `label` column.
df['label'].value_counts()

In [None]:
# Bar chart of how many rows carry each label value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=df, ax=ax)
ax.set_title('Distribution of Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Replace missing values with empty strings *before* casting to str.
# astype(str) on its own turns NaN into the literal text 'nan', which would be
# measured as a 3-character entry and end up inside the text fed to the models.
df['title'] = df['title'].fillna('').astype(str)
df['text'] = df['text'].fillna('').astype(str)

# Character length of each title and each article body.
df['Title_Length'] = df['title'].apply(len)
df['Article_Length'] = df['text'].apply(len)


In [None]:
# Histogram (with KDE overlay) of title lengths in characters.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Title_Length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Title Length')
ax.set_xlabel('Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram (with KDE overlay) of article-body lengths in characters.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Article_Length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Article Length')
ax.set_xlabel('Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Normalise whitespace in both text columns.
# Newlines are replaced with a space (not deleted) so that words on adjacent
# lines are not glued together into one token; strip() then removes any
# leading/trailing whitespace. regex=False makes the literal match explicit.
for column in ('title', 'text'):
    df[column] = df[column].str.replace('\n', ' ', regex=False).str.strip()

In [None]:
# Defensive fill of missing article bodies with the empty string.
# NOTE(review): `text` has already been cast to str earlier in the notebook, so
# no NaN should remain at this point and this line is expected to be a no-op.
df['text'] = df['text'].fillna('')

In [None]:
# Combine title and article body into one field, separated by a single space.
title_col, body_col = df['title'], df['text']
df['Text'] = title_col + " " + body_col
df['Text']

In [None]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace; a token is discarded when its
    lower-cased form is in ``stop_words``. Surviving tokens keep their
    original casing and are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
special_chars_pattern = r'[^a-zA-Z0-9\s]'
def remove_special_chars(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    matcher = re.compile(special_chars_pattern)
    return matcher.sub('', text)

In [None]:
def remove_numbers(input_string):
    """Return ``input_string`` with every run of digits removed."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', input_string)

In [None]:
def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to ``lemmatizer.lemmatize`` with its default
    arguments and the results are re-joined with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

In [None]:
# Run the cleaning steps over the combined text column, in this exact order:
# stop-word removal, special-character removal, digit removal, lower-casing,
# lemmatization. Each step consumes the output of the previous one.
cleaning_steps = (
    remove_stopwords,
    remove_special_chars,
    remove_numbers,
    convert_lower,
    lemmatize_words,
)
for step in cleaning_steps:
    df['Text'] = df['Text'].apply(step)


In [None]:
# Model input (cleaned text) and target (label column).
x, y = df['Text'], df['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# TF-IDF representation capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training split only and then applied,
# unchanged, to the test split.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [None]:
# Settings for the sequence models: tokenizer word limit and padded sequence length.
max_words, max_len = 5000, 250

# Build the word index from the training split only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

In [None]:
# Map each document to a list of integer word ids using the fitted tokenizer.
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)

# Bring every sequence to exactly max_len entries.
x_train_pad, x_test_pad = (
    pad_sequences(seqs, maxlen=max_len)
    for seqs in (sequences_train, sequences_test)
)

In [None]:
# Plain LSTM classifier: embedding -> dropout -> LSTM -> dropout -> sigmoid output.
embedding_vector_features = 128
model = Sequential([
    Embedding(max_words, embedding_vector_features, input_length=max_len),
    Dropout(0.3),
    LSTM(100),  # 100 LSTM units
    Dropout(0.3),
    Dense(1, activation='sigmoid'),  # single probability for the binary label
])

# Compile for binary classification.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
model.summary()

In [None]:
# Train the plain LSTM for 10 epochs; the test split is passed as validation
# data so its loss/accuracy are reported after every epoch.
model.fit(
    x_train_pad,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test_pad, y_test),
)


In [None]:
# Score the plain LSTM on the test split: probabilities above 0.5 become class 1.
predicted_probabilities = model.predict(x_test_pad)
y_pred = (predicted_probabilities > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

In [None]:
# Bidirectional LSTM classifier: embedding -> BiLSTM -> dropout -> sigmoid output.
embedding_vector_features = 128
model1 = Sequential([
    Embedding(max_words, embedding_vector_features, input_length=max_len),
    Bidirectional(LSTM(100)),  # LSTM with 100 units wrapped in a Bidirectional layer
    Dropout(0.3),
    Dense(1, activation='sigmoid'),  # single probability for the binary label
])
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
model1.summary()

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
model.fit(x_train_pad, y_train, epochs=10, batch_size=32, validation_data=(x_test_pad, y_test))


In [None]:
# Evaluate the model
y_pred = (model.predict(x_test_pad) > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))