# Fake news Detection

### Importing required library
Here I am importing some of the required library, if extra library is required to install It will be install later on.

In [452]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import re
import string

### Read all the available datasets

In [453]:
df_true = pd.read_csv("datasets/True.csv")
df_fake = pd.read_csv("datasets/Fake.csv")
df_combined = pd.read_csv("datasets/Combined.csv")
df_api =pd.read_csv("datasets/ApiNews.csv")

Inserting a column called "label" for fake and real news dataset to categories fake and true news. 

In [454]:
df_fake["label"] = 0
df_true["label"] = 1

In [455]:
df_true.head(5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [456]:
df_fake.head(5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [457]:
df_combined.head(5)

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


In [458]:
# Merge the title and text of the api news because api news has small text
df_api['text']=df_api['title'].fillna('') +" " + df_api['text'].fillna('')
df_api.head(5)

Unnamed: 0,title,text,subject,date,label
0,"Averse to certain foods? Beware, you could be ...","Averse to certain foods? Beware, you could be ...",general,2023-02-12,1
1,Interplanetary space station “Luna-25” is read...,Interplanetary space station “Luna-25” is read...,general,2023-02-12,1
2,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,general,2023-02-12,1
3,"US Shuts Airspace Over Lake Michigan, Cites ""N...","US Shuts Airspace Over Lake Michigan, Cites ""N...",general,2023-02-12,1
4,Google search chief warns against `hallucinati...,Google search chief warns against `hallucinati...,general,2023-02-12,1


In [459]:
df_true.shape, df_fake.shape, df_combined.shape, df_api.shape

((21417, 5), (23481, 5), (20133, 3), (8009, 5))

#### Merging All the Datasets

In [460]:
df_merge = pd.concat([df_true, df_fake, df_combined,df_api], axis =0 )

In [461]:
print(df_merge.columns)
df_merge.shape

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


(73040, 5)

### Data Preprocessing

##### Drop rows with null values

In [462]:
df_merge['text'].isna().sum()

0

In [463]:
df_merge = df_merge.dropna(subset=['text'],axis=0)
df_merge.shape

(73040, 5)

##### Drop duplicates row

In [464]:
df_merge.duplicated(['text']).sum()

6530

In [465]:
df_merge.drop_duplicates(['text'],inplace=True)
df_merge.shape

(66510, 5)

#### Randomly shuffling the dataframe 

In [466]:
df_merge = df_merge.sample(frac = 1)
df_merge.head()

Unnamed: 0,title,text,subject,date,label
509,"Villarreal vs Barcelona, La Liga: Match Thread...","Villarreal vs Barcelona, La Liga: Match Thread...",sports,2023-02-12,1
1706,Family of Toronto man allegedly killed by teen...,Family of Toronto man allegedly killed by teen...,attack,2023-01-20,1
11250,TOO FUNNY! CHUCK SCHUMER Tries To Lead Liberal...,"In recent weeks, Senate Minority Leader Chuck ...",politics,"Mar 30, 2017",0
15967,"Skiing the Alps, Making It My Own - The New Yo...",One more run. We had to take one more run from...,,,0
320,Some Lawmakers Now Look to Bipartisanship on H...,WASHINGTON — The sudden death of legislatio...,,,0


In [467]:
df_merge.reset_index(inplace = True)
df_merge.drop(["index"], axis = 1, inplace = True)
df_merge.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [468]:
df_merge.head()

Unnamed: 0,title,text,subject,date,label
0,"Villarreal vs Barcelona, La Liga: Match Thread...","Villarreal vs Barcelona, La Liga: Match Thread...",sports,2023-02-12,1
1,Family of Toronto man allegedly killed by teen...,Family of Toronto man allegedly killed by teen...,attack,2023-01-20,1
2,TOO FUNNY! CHUCK SCHUMER Tries To Lead Liberal...,"In recent weeks, Senate Minority Leader Chuck ...",politics,"Mar 30, 2017",0
3,"Skiing the Alps, Making It My Own - The New Yo...",One more run. We had to take one more run from...,,,0
4,Some Lawmakers Now Look to Bipartisanship on H...,WASHINGTON — The sudden death of legislatio...,,,0


#### Exporting 2000 news from the dataframe for manual testing
##### These news will not be used in training the model it will be dropped from the dataframe on exported

In [469]:
#Adding the first 2000 data to the testData.json file for manual testing
testData = df_merge.head(2000)
#Removing the first 5000 data from the datasets
df_merge = df_merge.iloc[2000:,]
#Exporting data as a json file
testData.reset_index(drop=True,inplace=True)
testData.sort_index(inplace=True)
testData.to_json("client/src/api/testData.json",orient ='records')
print("testData.json exported to client/src/api folder")
print(df_merge.shape)

testData.json exported to client/src/api folder
(64510, 5)


#### "title",  "subject" and "date" columns is not required for detecting the fake news, so I am going to drop the columns.
## Final Dataset is: df

In [470]:

df = df_merge.drop(["title", "subject","date"], axis = 1)
df.columns

Index(['text', 'label'], dtype='object')

In [471]:
df.head()

Unnamed: 0,text,label
2000,Islamic State jihadis fighting to establish a ...,0
2001,It was first noticed as a bit of meteorologica...,0
2002,After Trump s latest Twitter rant over the per...,0
2003,It s no secret that Russia was behind the elec...,0
2004,WASHINGTON (Reuters) - The U.S. Senate passed ...,1


#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [472]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [473]:
df["text"] = df["text"].apply(wordopt)

#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [474]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download necessary resources for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a single word
def lemmatize_word(word):
    return lemmatizer.lemmatize(word)
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    # lemmatized_words = [lemmatize_word(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(lemmatized_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [475]:
# Apply the lemmatization function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lemmatize_text)

#### Defining dependent and independent variable as x and y

In [476]:
X = df["text"]
Y = df["label"]

#### Convert text to vectors to train the models

In [477]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
X = vectorization.fit_transform(X)

#### Splitting the dataset into training set and testing set. 

In [478]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

### 1. Logistic Regression
##### Logistic Regression is a simple linear model that is commonly used for binary classification problems, including fake news detection. It models the relationship between the independent variables and the probability of the binary outcome.

In [479]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train,y_train)
pred_lr=LR.predict(x_test)
score = LR.score(x_test, y_test)
# score=accuracy_score(y_test,pred_lr)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_lr))
confusion_matrix(y_test,pred_lr, labels=[0,1])

Accuracy: 91.94%
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      5261
           1       0.92      0.94      0.93      7641

    accuracy                           0.92     12902
   macro avg       0.92      0.91      0.92     12902
weighted avg       0.92      0.92      0.92     12902



array([[4642,  619],
       [ 421, 7220]], dtype=int64)

### 2. Passive Aggressive Classifier
##### PAC is an online learning algorithm, which means it can adapt to changes in the distribution of fake news as new data is received. It works by iteratively updating a linear classifier based on the features of the news articles.

In [480]:
from sklearn.linear_model import PassiveAggressiveClassifier
PAC = PassiveAggressiveClassifier(max_iter=1000)
PAC.fit(x_train, y_train)
pred_pac = PAC.predict(x_test)
score = PAC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_pac))

Accuracy: 91.84%
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      5261
           1       0.93      0.93      0.93      7641

    accuracy                           0.92     12902
   macro avg       0.92      0.92      0.92     12902
weighted avg       0.92      0.92      0.92     12902



### 3. k-Nearest Neighbors (KNN)
##### The k-Nearest Neighbors (KNN) algorithm can be used for fake news detection by treating each news article as a vector of features and labels, and finding the k nearest neighbors to a new article based on its feature vector. The label of the new article is then predicted based on the labels of its k nearest neighbors.

In [481]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(x_train, y_train)
pred_knn = KNN.predict(x_test)
score = KNN.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_knn))

Accuracy: 62.77%
              precision    recall  f1-score   support

           0       0.54      0.56      0.55      5261
           1       0.69      0.67      0.68      7641

    accuracy                           0.63     12902
   macro avg       0.62      0.62      0.62     12902
weighted avg       0.63      0.63      0.63     12902



### 4. Naive Bayes Classifier
##### Naive Bayes is a simple but effective model for text classification problems, including fake news detection. It assumes independence between features and can handle large amounts of data efficiently.

In [482]:
from sklearn.naive_bayes import MultinomialNB
NBC = MultinomialNB()
NBC.fit(x_train, y_train)
pred_nbc = NBC.predict(x_test)
score = NBC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_nbc))

Accuracy: 82.62%
              precision    recall  f1-score   support

           0       0.79      0.77      0.78      5261
           1       0.85      0.86      0.85      7641

    accuracy                           0.83     12902
   macro avg       0.82      0.82      0.82     12902
weighted avg       0.83      0.83      0.83     12902



### 5. Decision Tree Classifier
##### Decision Trees are a simple but powerful model for classification problems. They work by recursively splitting the data based on the most significant feature until all of the instances in a leaf node belong to the same class.

In [483]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier()
DTC.fit(x_train, y_train)
pred_dt = DTC.predict(x_test)
score = DTC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_dt))

Accuracy: 90.1%
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      5261
           1       0.92      0.91      0.92      7641

    accuracy                           0.90     12902
   macro avg       0.90      0.90      0.90     12902
weighted avg       0.90      0.90      0.90     12902



### 6. Random Forest Classifier
##### Random Forest is an ensemble model that uses multiple decision trees to make predictions. It is robust to overfitting and can handle large amounts of data effectively.

In [484]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=0)
RFC.fit(x_train, y_train)
pred_rfc = RFC.predict(x_test)
score = RFC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_rfc))

Accuracy: 88.36%
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      5261
           1       0.91      0.89      0.90      7641

    accuracy                           0.88     12902
   macro avg       0.88      0.88      0.88     12902
weighted avg       0.88      0.88      0.88     12902



### 7. Gradient Boosting Classifier
##### Gradient Boosting is another ensemble model that uses decision trees as weak learners. It can handle complex data distributions and is often used for text classification problems, including fake news detection.

In [485]:
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(x_train, y_train)
pred_gbc = GBC.predict(x_test)
score=GBC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_gbc))

Accuracy: 91.59%
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      5261
           1       0.93      0.93      0.93      7641

    accuracy                           0.92     12902
   macro avg       0.91      0.91      0.91     12902
weighted avg       0.92      0.92      0.92     12902



### 8. XGBoost
##### XGBoost is an optimized version of the gradient boosting algorithm that has been shown to achieve state-of-the-art results on many machine learning tasks, including fake news detection. It uses gradient boosting with decision trees as weak learners to make predictions.

In [486]:
import xgboost as xgb
XGB = xgb.XGBClassifier()
XGB.fit(x_train, y_train)
pred_xgb = XGB.predict(x_test)
score = XGB.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_xgb))

Accuracy: 94.93%
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      5261
           1       0.95      0.96      0.96      7641

    accuracy                           0.95     12902
   macro avg       0.95      0.95      0.95     12902
weighted avg       0.95      0.95      0.95     12902



### 9. LightGBM
##### LightGBM is another gradient boosting algorithm that uses decision trees as weak learners. It is designed to handle large amounts of data and has been shown to achieve fast training times and good performance on a variety of tasks, including fake news detection.

In [487]:
import lightgbm as lgb
LGB = lgb.LGBMClassifier()
LGB.fit(x_train, y_train)
pred_lgb = LGB.predict(x_test)
score = XGB.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_lgb))

Accuracy: 94.93%
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      5261
           1       0.96      0.96      0.96      7641

    accuracy                           0.95     12902
   macro avg       0.95      0.95      0.95     12902
weighted avg       0.95      0.95      0.95     12902



### 10.Support Vector Machine (SVM)
##### SVM is a powerful model for text classification problems. It works well with high-dimensional data and can handle non-linear relationships between features and labels.

In [488]:
from sklearn.svm import SVC
SVM = SVC(kernel='linear', C=1, random_state=0)
SVM.fit(x_train, y_train)
pred_svm = SVM.predict(x_test)
score = SVM.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_svm))

Accuracy: 93.29%
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      5261
           1       0.94      0.95      0.94      7641

    accuracy                           0.93     12902
   macro avg       0.93      0.93      0.93     12902
weighted avg       0.93      0.93      0.93     12902



### 11. Multilayer Perceptron (MLP)
##### MLP is a type of artificial neural network that can be used for binary or multiclass classification problems, including fake news detection. It consists of multiple hidden layers of artificial neurons that process the input data and make predictions.

In [489]:
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, alpha=1e-4, solver='sgd', verbose=False, tol=1e-4, random_state=1, learning_rate_init=.1)
MLP.fit(x_train, y_train)
pred_mlp = MLP.predict(x_test)
score = MLP.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_mlp))

Accuracy: 92.66%
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      5261
           1       0.94      0.93      0.94      7641

    accuracy                           0.93     12902
   macro avg       0.92      0.93      0.92     12902
weighted avg       0.93      0.93      0.93     12902



### Export the Models

In [490]:
import joblib
joblib.dump(vectorization, "models/vectorizer.pkl")
joblib.dump(LR, "models/LR.pkl")
joblib.dump(PAC, "models/PAC.pkl")
joblib.dump(KNN, "models/KNN.pkl")
joblib.dump(NBC, "models/NBC.pkl")
joblib.dump(DTC, "models/DTC.pkl")
joblib.dump(RFC, "models/RFC.pkl")
joblib.dump(GBC, "models/GBC.pkl")
joblib.dump(XGB, "models/XGB.pkl")
joblib.dump(LGB, "models/LGB.pkl")
joblib.dump(SVM, "models/SVM.pkl")
joblib.dump(MLP, "models/MLP.pkl")

['models/MLP.pkl']

# Model Testing With Manual Entry

### News

In [491]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_def_test["text"] = new_def_test["text"].apply(lemmatize_text) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_PAC = PAC.predict(new_xv_test)
    pred_KNN = KNN.predict(new_xv_test)
    pred_NBC = NBC.predict(new_xv_test)
    pred_DTC = DTC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_XGB = XGB.predict(new_xv_test)
    pred_LGB = LGB.predict(new_xv_test)
    pred_SVM = SVM.predict(new_xv_test)
    pred_MLP = MLP.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nPAC Prediction: {} \nKNN Prediction: {} \nNBC Prediction: {}\nDT Prediction: {} \nRFC Prediction: {} \nGBC Prediction: {} \nXGB Prediction: {} \nLGB Prediction: {} \nSVM Prediction: {} \nMLP Prediction: {}"
    .format(output_lable(pred_LR[0]),
            output_lable(pred_PAC[0]),
            output_lable(pred_KNN[0]),
            output_lable(pred_NBC[0]),
            output_lable(pred_DTC[0]), 
            output_lable(pred_RFC[0]),
            output_lable(pred_GBC[0]), 
            output_lable(pred_XGB[0]),
            output_lable(pred_LGB[0]),
            output_lable(pred_SVM[0]), 
            output_lable(pred_MLP[0]),
            ))

In [492]:
news = str(input())
manual_testing(news)



LR Prediction: Not A Fake News 
PAC Prediction: Not A Fake News 
KNN Prediction: Fake News 
NBC Prediction: Not A Fake News
DT Prediction: Fake News 
RFC Prediction: Fake News 
GBC Prediction: Not A Fake News 
XGB Prediction: Not A Fake News 
LGB Prediction: Not A Fake News 
SVM Prediction: Not A Fake News 
MLP Prediction: Not A Fake News
