# Fake news Detection

### Importing required library
Here I am importing some of the required library, if extra library is required to install It will be install later on.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import re
import string

### Read all the available datasets

In [2]:
df_true = pd.read_csv("datasets/True.csv")
df_fake = pd.read_csv("datasets/Fake.csv")
df_combined = pd.read_csv("datasets/Combined.csv")
df_api =pd.read_csv("datasets/ApiNews.csv")

Inserting a column called "label" for fake and real news dataset to categories fake and true news. 

In [3]:
df_fake["label"] = 0
df_true["label"] = 1

In [4]:
df_true.head(5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [5]:
df_fake.head(5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [6]:
df_combined.head(5)

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# Merge the title and text of the api news because api news has small text
df_api['text']=df_api['title'].fillna('') +" " + df_api['text'].fillna('')
df_api.head(5)

Unnamed: 0,title,text,subject,date,label
0,"Averse to certain foods? Beware, you could be ...","Averse to certain foods? Beware, you could be ...",general,2023-02-12,1
1,Interplanetary space station “Luna-25” is read...,Interplanetary space station “Luna-25” is read...,general,2023-02-12,1
2,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,general,2023-02-12,1
3,"US Shuts Airspace Over Lake Michigan, Cites ""N...","US Shuts Airspace Over Lake Michigan, Cites ""N...",general,2023-02-12,1
4,Google search chief warns against `hallucinati...,Google search chief warns against `hallucinati...,general,2023-02-12,1


In [8]:
df_true.shape, df_fake.shape, df_combined.shape, df_api.shape

((21417, 5), (23481, 5), (20133, 3), (8009, 5))

#### Merging All the Datasets

In [9]:
df_marge = pd.concat([df_true, df_fake, df_combined,df_api], axis =0 )

In [10]:
df_marge.shape

(73040, 5)

#### "title",  "subject" and "date" columns is not required for detecting the fake news, so I am going to drop the columns.
## Final Dataset is: df

In [11]:

df = df_marge.drop(["title", "subject","date"], axis = 1)
df.columns

Index(['text', 'label'], dtype='object')

### Data Preprocessing

##### Drop duplicates row

In [12]:
df.duplicated().sum()

6528

In [13]:
df.drop_duplicates(inplace=True)
df.shape

(66512, 2)

##### Drop rows with null values

In [14]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [15]:
df = df.dropna(axis=0)
df.shape

(66512, 2)

In [16]:
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


#### Randomly shuffling the dataframe 

In [17]:
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,label
15586,ANKARA (Reuters) - Turkish President Tayyip Er...,1
2823,"On the campaign trail, Donald Trump said he wa...",0
3019,"The City of Biloxi, Mississippi tweeted out a ...",0
16192,"REYKJAVIK (Reuters) - Icelanders, angry over a...",1
15646,DUBAI (Reuters) - Saudi Arabian investment fir...,1


In [18]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)
df.columns


Index(['text', 'label'], dtype='object')

In [19]:
df.head(10)

Unnamed: 0,text,label
0,ANKARA (Reuters) - Turkish President Tayyip Er...,1
1,"On the campaign trail, Donald Trump said he wa...",0
2,"The City of Biloxi, Mississippi tweeted out a ...",0
3,"REYKJAVIK (Reuters) - Icelanders, angry over a...",1
4,DUBAI (Reuters) - Saudi Arabian investment fir...,1
5,"Trump rape accuser skips press conference, cit...",1
6,Russia strikes Ukraine's cities hours after We...,1
7,President Donald Trump said Wednesday leftwing...,0
8,WASHINGTON (Reuters) - There is “simply no pla...,1
9,There definitely seems to be a new sense of en...,0


#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [20]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [21]:
df["text"] = df["text"].apply(wordopt)

#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [22]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download necessary resources for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a single word
def lemmatize_word(word):
    return lemmatizer.lemmatize(word)
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    # lemmatized_words = [lemmatize_word(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(lemmatized_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Apply the lemmatization function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lemmatize_text)

#### Defining dependent and independent variable as x and y

In [24]:
X = df["text"]
Y = df["label"]

#### Convert text to vectors to train the models

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
X = vectorization.fit_transform(X)

#### Splitting the dataset into training set and testing set. 

In [26]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

### 1. Logistic Regression
##### Logistic Regression is a simple linear model that is commonly used for binary classification problems, including fake news detection. It models the relationship between the independent variables and the probability of the binary outcome.

In [27]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train,y_train)
pred_lr=LR.predict(x_test)
score = LR.score(x_test, y_test)
# score=accuracy_score(y_test,pred_lr)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_lr))
confusion_matrix(y_test,pred_lr, labels=[0,1])

Accuracy: 92.58%
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      5590
           1       0.92      0.95      0.94      7713

    accuracy                           0.93     13303
   macro avg       0.93      0.92      0.92     13303
weighted avg       0.93      0.93      0.93     13303



array([[4977,  613],
       [ 374, 7339]], dtype=int64)

### 2. Passive Aggressive Classifier
##### PAC is an online learning algorithm, which means it can adapt to changes in the distribution of fake news as new data is received. It works by iteratively updating a linear classifier based on the features of the news articles.

In [28]:
from sklearn.linear_model import PassiveAggressiveClassifier
PAC = PassiveAggressiveClassifier(max_iter=1000)
PAC.fit(x_train, y_train)
pred_pac = PAC.predict(x_test)
score = PAC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_pac))

Accuracy: 92.14%
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      5590
           1       0.93      0.93      0.93      7713

    accuracy                           0.92     13303
   macro avg       0.92      0.92      0.92     13303
weighted avg       0.92      0.92      0.92     13303



### 3. k-Nearest Neighbors (KNN)
##### The k-Nearest Neighbors (KNN) algorithm can be used for fake news detection by treating each news article as a vector of features and labels, and finding the k nearest neighbors to a new article based on its feature vector. The label of the new article is then predicted based on the labels of its k nearest neighbors.

In [29]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(x_train, y_train)
pred_knn = KNN.predict(x_test)
score = KNN.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_knn))

Accuracy: 62.02%
              precision    recall  f1-score   support

           0       0.54      0.73      0.62      5590
           1       0.74      0.54      0.62      7713

    accuracy                           0.62     13303
   macro avg       0.64      0.64      0.62     13303
weighted avg       0.65      0.62      0.62     13303



### 4. Naive Bayes Classifier
##### Naive Bayes is a simple but effective model for text classification problems, including fake news detection. It assumes independence between features and can handle large amounts of data efficiently.

In [30]:
from sklearn.naive_bayes import MultinomialNB
NBC = MultinomialNB()
NBC.fit(x_train, y_train)
pred_nbc = NBC.predict(x_test)
score = NBC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_nbc))

Accuracy: 83.06%
              precision    recall  f1-score   support

           0       0.82      0.77      0.79      5590
           1       0.84      0.87      0.86      7713

    accuracy                           0.83     13303
   macro avg       0.83      0.82      0.82     13303
weighted avg       0.83      0.83      0.83     13303



### 5. Decision Tree Classifier
##### Decision Trees are a simple but powerful model for classification problems. They work by recursively splitting the data based on the most significant feature until all of the instances in a leaf node belong to the same class.

In [31]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier()
DTC.fit(x_train, y_train)
pred_dt = DTC.predict(x_test)
score = DTC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_dt))

Accuracy: 89.63%
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      5590
           1       0.91      0.91      0.91      7713

    accuracy                           0.90     13303
   macro avg       0.89      0.89      0.89     13303
weighted avg       0.90      0.90      0.90     13303



### 6. Random Forest Classifier
##### Random Forest is an ensemble model that uses multiple decision trees to make predictions. It is robust to overfitting and can handle large amounts of data effectively.

In [32]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=0)
RFC.fit(x_train, y_train)
pred_rfc = RFC.predict(x_test)
score = RFC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_rfc))

Accuracy: 88.12%
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      5590
           1       0.90      0.89      0.90      7713

    accuracy                           0.88     13303
   macro avg       0.88      0.88      0.88     13303
weighted avg       0.88      0.88      0.88     13303



### 7. Gradient Boosting Classifier
##### Gradient Boosting is another ensemble model that uses decision trees as weak learners. It can handle complex data distributions and is often used for text classification problems, including fake news detection.

In [33]:
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(x_train, y_train)
pred_gbc = GBC.predict(x_test)
score=GBC.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_gbc))

Accuracy: 91.94%
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5590
           1       0.92      0.94      0.93      7713

    accuracy                           0.92     13303
   macro avg       0.92      0.92      0.92     13303
weighted avg       0.92      0.92      0.92     13303



### 8. XGBoost
##### XGBoost is an optimized version of the gradient boosting algorithm that has been shown to achieve state-of-the-art results on many machine learning tasks, including fake news detection. It uses gradient boosting with decision trees as weak learners to make predictions.

In [34]:
import xgboost as xgb
XGB = xgb.XGBClassifier()
XGB.fit(x_train, y_train)
pred_xgb = XGB.predict(x_test)
score = XGB.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_xgb))

Accuracy: 94.77%
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      5590
           1       0.95      0.96      0.95      7713

    accuracy                           0.95     13303
   macro avg       0.95      0.95      0.95     13303
weighted avg       0.95      0.95      0.95     13303



### 9. LightGBM
##### LightGBM is another gradient boosting algorithm that uses decision trees as weak learners. It is designed to handle large amounts of data and has been shown to achieve fast training times and good performance on a variety of tasks, including fake news detection.

In [35]:
import lightgbm as lgb
LGB = lgb.LGBMClassifier()
LGB.fit(x_train, y_train)
pred_lgb = LGB.predict(x_test)
score = XGB.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_lgb))

Accuracy: 94.77%
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      5590
           1       0.96      0.96      0.96      7713

    accuracy                           0.95     13303
   macro avg       0.95      0.95      0.95     13303
weighted avg       0.95      0.95      0.95     13303



### 10.Support Vector Machine (SVM)
##### SVM is a powerful model for text classification problems. It works well with high-dimensional data and can handle non-linear relationships between features and labels.

In [36]:
from sklearn.svm import SVC
SVM = SVC(kernel='linear', C=1, random_state=0)
SVM.fit(x_train, y_train)
pred_svm = SVM.predict(x_test)
score = SVM.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_svm))

Accuracy: 93.77%
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      5590
           1       0.94      0.95      0.95      7713

    accuracy                           0.94     13303
   macro avg       0.94      0.93      0.94     13303
weighted avg       0.94      0.94      0.94     13303



### 11. Multilayer Perceptron (MLP)
##### MLP is a type of artificial neural network that can be used for binary or multiclass classification problems, including fake news detection. It consists of multiple hidden layers of artificial neurons that process the input data and make predictions.

In [37]:
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, alpha=1e-4, solver='sgd', verbose=False, tol=1e-4, random_state=1, learning_rate_init=.1)
MLP.fit(x_train, y_train)
pred_mlp = MLP.predict(x_test)
score = MLP.score(x_test, y_test)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_mlp))

Accuracy: 93.17%
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      5590
           1       0.94      0.94      0.94      7713

    accuracy                           0.93     13303
   macro avg       0.93      0.93      0.93     13303
weighted avg       0.93      0.93      0.93     13303



### Export the Models

In [38]:
import joblib
joblib.dump(vectorization, "models/vectorizer.pkl")
joblib.dump(LR, "models/LR.pkl")
joblib.dump(PAC, "models/PAC.pkl")
joblib.dump(KNN, "models/KNN.pkl")
joblib.dump(NBC, "models/NBC.pkl")
joblib.dump(DTC, "models/DTC.pkl")
joblib.dump(RFC, "models/RFC.pkl")
joblib.dump(GBC, "models/GBC.pkl")
joblib.dump(XGB, "models/XGB.pkl")
joblib.dump(LGB, "models/LGB.pkl")
joblib.dump(SVM, "models/SVM.pkl")
joblib.dump(MLP, "models/MLP.pkl")

['models/MLP.pkl']

# Model Testing With Manual Entry

### News

In [39]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_def_test["text"] = new_def_test["text"].apply(lemmatize_text) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_PAC = PAC.predict(new_xv_test)
    pred_KNN = KNN.predict(new_xv_test)
    pred_NBC = NBC.predict(new_xv_test)
    pred_DTC = DTC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_XGB = XGB.predict(new_xv_test)
    pred_LGB = LGB.predict(new_xv_test)
    pred_SVM = SVM.predict(new_xv_test)
    pred_MLP = MLP.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nPAC Prediction: {} \nKNN Prediction: {} \nNBC Prediction: {}\nDT Prediction: {} \nRFC Prediction: {} \nGBC Prediction: {} \nXGB Prediction: {} \nLGB Prediction: {} \nSVM Prediction: {} \nMLP Prediction: {}"
    .format(output_lable(pred_LR[0]),
            output_lable(pred_PAC[0]),
            output_lable(pred_KNN[0]),
            output_lable(pred_NBC[0]),
            output_lable(pred_DTC[0]), 
            output_lable(pred_RFC[0]),
            output_lable(pred_GBC[0]), 
            output_lable(pred_XGB[0]),
            output_lable(pred_LGB[0]),
            output_lable(pred_SVM[0]), 
            output_lable(pred_MLP[0]),
            ))

In [40]:
news = str(input())
manual_testing(news)



LR Prediction: Not A Fake News 
PAC Prediction: Not A Fake News 
KNN Prediction: Not A Fake News 
NBC Prediction: Not A Fake News
DT Prediction: Fake News 
RFC Prediction: Fake News 
GBC Prediction: Not A Fake News 
XGB Prediction: Not A Fake News 
LGB Prediction: Not A Fake News 
SVM Prediction: Not A Fake News 
MLP Prediction: Not A Fake News
