# Importing Libraries

In [57]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk import wsd
from nltk.corpus import wordnet as wn
from spacy.cli import download
from spacy import load
import warnings
import numpy as np

# Creating Dataframe

In [58]:
# Load the fake-news training CSV into a DataFrame.
# NOTE(review): the absolute /kaggle/input path presumably only resolves inside a Kaggle kernel with this dataset attached — confirm before running elsewhere.
df = pd.read_csv('/kaggle/input/fake-news/train.csv')

In [59]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [60]:
# Remove the 'id' column before any further processing.
# errors='ignore' keeps this cell re-runnable: `df` is rebound to the result, so
# a second execution would otherwise raise KeyError because 'id' is already gone.
df = df.drop(columns='id', errors='ignore')

In [61]:
# Discard every row with a missing value in any remaining column.
# The index is not reset, so the surviving rows keep their original labels.
df = df.dropna()

In [62]:
# Sanity check: per-column count of null values left after dropna().
df.isnull().sum()

title     0
author    0
text      0
label     0
dtype: int64

# Preprocessing of data

In [63]:
# Build the modelling frame: everything in `df` except the headline and byline.
df2 = df.drop(columns=['title', 'author'])

In [64]:
# Display the reduced frame (pandas truncates the middle rows).
df2

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1
...,...,...
20795,Rapper T. I. unloaded on black celebrities who...,0
20796,When the Green Bay Packers lost to the Washing...,0
20797,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [65]:
# Column dtypes and non-null counts for the modelling frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18285 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    18285 non-null  object
 1   label   18285 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 428.6+ KB


In [66]:
# Drop rows with missing values from the modelling frame.
# NOTE(review): `df` already went through dropna() before df2 was derived, and the
# info() outputs before and after this cell are identical — this looks redundant.
df2 = df2.dropna()

In [67]:
# Re-check row count and dtypes after the dropna() above.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18285 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    18285 non-null  object
 1   label   18285 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 428.6+ KB


# Performing pre-processing on text

In [68]:
# Download the NLTK data packages this notebook was set up with.
# The return value of the last call is what the cell displays.
# NOTE(review): of these, only the stopword list is used by the code visible in
# this notebook — confirm whether the tokenizer/WordNet packages are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('wordnet2022')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to /usr/share/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [69]:
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any later cell visible here — confirm whether this load is still required.
nlp = load('en_core_web_sm')

In [70]:
# Shell escape: copy the wordnet2022 corpus directory to the `wordnet` corpus path.
# NOTE(review): if the destination directory already exists, `cp -r` typically nests
# the copy inside it rather than replacing it — confirm the resulting layout.
! cp -rf /usr/share/nltk_data/corpora/wordnet2022 /usr/share/nltk_data/corpora/wordnet # temp fix for lookup error.

In [71]:
# Show the English stopword list that preprocess_corpus filters tokens against.
print(set(stopwords.words('english')))

{'as', 're', 'what', 'during', 'itself', 'is', "that'll", 'an', "couldn't", 'mightn', 'some', 'more', 'ain', 'their', 'had', 'after', "should've", 'too', 'so', "isn't", 'you', 'shouldn', "you've", 'such', 'mustn', 'no', 'are', 'aren', 'further', 'those', 'my', 'didn', 'the', 'did', 'when', 'whom', 't', 'how', 'am', 'then', 'to', 'does', 'this', 'yourself', 'other', 'here', 'd', 'in', 'on', "didn't", 'its', 'with', 'hers', 'any', "you'll", 'have', 'having', "haven't", 'there', 'hasn', 'against', 'they', 'than', 'his', 'it', 'few', 'won', "hasn't", 'weren', 'between', 'will', 'o', 'him', 'herself', 'needn', 'ma', "she's", 's', 'or', 'yourselves', 'i', 'most', 'by', 'for', 'he', 'if', 'down', 'once', 'should', 'theirs', 'very', 'y', 'and', 'but', 'haven', 'has', 'them', 've', 'only', 'couldn', 'not', 'don', 'all', "weren't", 'at', 'she', 'myself', 'your', "mightn't", 'a', 'yours', 'himself', 'isn', 'ourselves', 'both', 'll', "mustn't", 'just', 'do', 'these', 'until', 'been', 'being', "sho

In [72]:
import nltk
from nltk.corpus import stopwords
import re

def preprocess_corpus(messages, stop_words=None):
    """Normalise raw documents into lowercase, stopword-free strings.

    Each item is coerced to ``str``, every character outside ``a-z``/``A-Z``
    is replaced by a space, the text is lowercased and split on whitespace,
    tokens found in the stopword set are dropped, and the remaining tokens
    are joined back together with single spaces.

    Parameters
    ----------
    messages : iterable
        Documents to clean. Non-string items are stringified first.
    stop_words : iterable of str, optional
        Tokens to remove. When omitted, NLTK's English stopword list is used,
        which is the behaviour callers got before this parameter existed.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. An empty
        input yields an empty list.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once: re-reading the stopword list for every token and
    # scanning it linearly dominated the runtime on a corpus of this size.
    stop_words = frozenset(stop_words)
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    for text in messages:
        tokens = non_letters.sub(' ', str(text)).lower().split()
        corpus.append(' '.join(token for token in tokens if token not in stop_words))

    return corpus

In [73]:
# Clean every article body in df2['text']; the result is a list of strings in the same order as df2's rows.
messages = df2['text']
preprocessed_corpus = preprocess_corpus(messages)

In [74]:
## TF-IDF vectorizer
# Configured for at most 5000 features drawn from unigrams, bigrams and trigrams.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v=TfidfVectorizer(max_features=5000,ngram_range=(1,3))

In [75]:
# Fit the vectorizer on the cleaned corpus and materialise the TF-IDF matrix as a dense array.
# NOTE(review): the fit sees every document, including the rows later held out by
# train_test_split, so vocabulary/IDF statistics leak from the test split. Consider
# splitting first and fitting on the training documents only.
X=tfidf_v.fit_transform(preprocessed_corpus).toarray()

In [76]:
# Target vector: the label column of df2, in the same row order as X.
y=df2['label']

# Splitting into test and train

In [77]:
## Divide the dataset into Train and Test
# Hold out 33% of the rows for testing; random_state=0 pins the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [78]:
# (rows, features) of the training matrix; the feature count is the network's input width.
X_train.shape

(12250, 5000)

# Creating Sequential Model

In [79]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow's global RNG so weight initialisation, dropout masks and batch
# shuffling start from the same state on every run. Results may still vary
# slightly across hardware or TensorFlow builds.
tf.random.set_seed(0)

# Define the architecture of the DNN model: two ReLU hidden layers with dropout,
# followed by a single sigmoid unit that outputs the probability of label 1.
# The input width is read from the training matrix instead of being hard-coded,
# so the model stays consistent if the vectorizer's feature count changes.
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid')
])


# Compile the model for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model to the training data; 20% of it is set aside for per-epoch validation.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.2)

# Evaluate the model on the held-out test data.
loss, accuracy = model.evaluate(X_test, y_test)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluating Accuracy of model

In [80]:
from sklearn.metrics import accuracy_score

# Threshold the network's sigmoid outputs at 0.5 to get hard 0/1 labels for the test split.
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9501242750621375


In [81]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Score the held-out split, keeping the raw sigmoid probabilities available.
y_pred_probs = model.predict(X_test)

# Round each probability to the nearest class and drop the singleton axis.
y_pred = np.squeeze(np.round(y_pred_probs).astype(int))
# Squeeze the true labels too, so both arguments are one-dimensional.
y_true = np.squeeze(y_test)

# Tabulate true labels against predicted labels.
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[3264  125]
 [ 176 2470]]


# Evaluating model with input text

In [82]:
test_text = """India’s Manipur state is on the boil, with social media flooded with visuals of people killed and injured in armed attacks. Violent protests broke out after an attack on June 29, while a church was looted and burned down two days prior.
Over the past two months, Manipur’s largely Hindu Meitei community, which constitute a little over half of the state’s population, and the Christian-majority Kuki tribal group, which makes up about 16 percent of the population, have violently attacked each other in an outpouring of recrimination and revenge. Over 100 people have been killed and nearly 40,000 displaced. Angry mobs and armed vigilantes have burned down homes, churches, and offices.
Manipur has long faced secessionist insurgencies in which both military and state security forces have committed serious human rights abuses. Longstanding ethnic disputes have also  erupted into violence. However, instead of adopting measures that would ensure the security of all communities, the Bharatiya Janata Party government of N. Biren Singh in Manipur state has replicated the national party’s politically motivated divisive policies that promote Hindu majoritarianism.
Many Meitei seek the same affirmative action privileges that are provided to the Kuki under their protected tribal status. Tribal groups, particularly the Kuki, have argued that this would expand Meitei economic dominance and allow them to take over land in tribal areas.
To address this explosive issue, the government needs to be trusted by all sides to play an impartial role as mediator. Instead, the Singh government has stoked ethnic divides with policy decisions that impact Kuki forest rights, and with unfair allusions to illegal immigrants, drug trade, deforestation, and militancy that fuelled anxiety among the Meitei.
The authorities are asking for calm and restraint on all sides, but as long as there is distrust of the government, the threat of violence will persist. Survivors and families of victims need redress and accountability. The government should ensure unhindered access to humanitarian aid and the internet, take steps to demobilize and disarm abusive groups, and order an independent investigation. Mediation efforts should include all stakeholders and should be centered around ending violence and ensuring that all communities are protected."""

In [83]:
# Flatten the multi-line sample onto a single line before cleaning.
test_text = test_text.replace("\n", " ")

In [84]:
# Apply the same cleaning used for the training corpus; the sample is wrapped in a
# list because preprocess_corpus iterates over documents.
# NOTE(review): this rebinds test_text from a str to a one-element list, so the cell
# is not idempotent — re-running it cleans the list's string representation.
test_text = preprocess_corpus([test_text])

In [85]:
# ## TFidf Vectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_v=TfidfVectorizer(max_features=5000,ngram_range=(1,3))
X_test_text=tfidf_v.fit_transform(test_text).toarray()

In [86]:
X_test_text.shape

(1, 592)

In [87]:
import numpy as np

# Reshape the vector
reshaped_vector = np.pad(X_test_text, ((0, 0), (0, 5000-592)), mode='constant')

# Make predictions
predictions = model.predict(reshaped_vector)



In [88]:
# Confirm the vector has the (1, n_features) shape the network expects.
reshaped_vector.shape

(1, 5000)

In [89]:
# Sigmoid output for the sample article.
# NOTE(review): this repeats the predict call whose result was already stored in `predictions`.
y_test_text = model.predict(reshaped_vector)



In [90]:
# Display the raw model output for the sample.
y_test_text

array([[8.502871e-11]], dtype=float32)

In [91]:
# Turn the model output into a verdict using the same 0.5 cut-off as the evaluation cells.
# Presumably label 1 marks unreliable articles, hence 'False' above the threshold —
# TODO confirm against the dataset description.
print('False' if y_test_text > 0.5 else 'TRUE')

TRUE


# Saving the model

In [92]:
# Persist the trained network to ./model_3.h5 in the working directory.
# NOTE(review): the fitted TfidfVectorizer is not saved with it; a reloaded model
# cannot score new text unless the same vectorizer is persisted too.
model.save('./model_3.h5')