# Fake news Detection

### Importing required libraries
Here I import the main required libraries; if any extra library is needed, it will be installed and imported later on.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import re
import string

### Loading the fake and real datasets

In [None]:
# Read the three source CSV files into separate dataframes.
df_fake, df_true, df_true2 = (
    pd.read_csv(csv_path)
    for csv_path in ("datasets/Fake.csv", "datasets/True.csv", "datasets/News.csv")
)

In [None]:
# Preview the first five rows of the fake-news dataframe.
df_fake.head(n=5)

In [None]:
# Preview the first five rows of the true-news dataframe.
df_true.head(n=5)

Adding a column called "label" to the fake and real news datasets to categorize fake (0) and true (1) news.

In [None]:
# Tag every row with its class id: 0 marks fake news, 1 marks true news.
# NOTE(review): df_true2 is not labelled here; presumably News.csv already
# ships with its own "label" column — confirm before training.
for frame, class_id in ((df_fake, 0), (df_true, 1)):
    frame["label"] = class_id
df_true.head()

In [None]:
# (rows, columns) of each source dataframe, in load order.
tuple(frame.shape for frame in (df_fake, df_true, df_true2))

Merging the manual testing dataframe in single dataset and save it in a csv file

Merging the main fake and true dataframe

In [None]:
# Stack the three dataframes row-wise (pd.concat's default axis) into one frame.
df_marge = pd.concat(objs=[df_fake, df_true, df_true2])
df_marge.head(n=10)

In [None]:
# List the column names of the merged dataframe.
df_marge.columns

#### The "title", "subject" and "date" columns are not required for detecting fake news, so I am going to drop them.
## Final Dataset is: df

In [None]:

# Keep only the columns used for modelling; df_marge itself is left untouched.
df = df_marge.drop(columns=["title", "subject", "date"])

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Total number of cells (rows x columns) before rows with missing text are dropped.
df.size

In [None]:
# Discard every row whose "text" value is missing, then show the remaining
# number of cells (rows x columns).
df = df.dropna(subset=["text"])
df.size

#### Randomly shuffling the dataframe 

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# The seed is fixed, using the same value as the train/test split, so that
# re-running the notebook reproduces the same row order and the same results.
df = df.sample(frac=1, random_state=7)
df.head()

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index. drop=True discards the
# old index directly instead of first materialising it as an "index" column
# and then deleting that column again.
df.reset_index(drop=True, inplace=True)
df.columns


In [None]:
# Preview the first five rows after shuffling and re-indexing.
df.head(n=5)

#### Creating a function to convert the text to lowercase and remove extra spaces, special characters, URLs and links.

In [None]:
def wordopt(text):
    """Normalise a raw news string before it is vectorised.

    Steps, in order:
      1. lowercase the text;
      2. remove square-bracketed fragments such as ``[video]``;
      3. remove URLs (``http://``, ``https://`` and ``www.`` forms);
      4. remove HTML-like tags;
      5. replace every remaining non-word character with a space;
      6. remove underscores (the only punctuation left after step 5);
      7. remove every token that contains a digit.

    Structured patterns (brackets, URLs, tags) must be removed *before* the
    generic non-word substitution: once ``:``, ``/``, ``.``, ``<`` and ``>``
    have been turned into spaces those patterns can no longer match.

    Args:
        text: The raw text (a ``str``).

    Returns:
        The cleaned, lowercased text. Whitespace runs are not collapsed.
    """
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here and
    # no separate newline-removal step is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run the text-cleaning function over every article in the "text" column.
df["text"] = df["text"].map(wordopt)

#### Defining the independent and dependent variables as X and Y

In [None]:
# X holds the cleaned article text (features), Y the class labels (target).
X, Y = df["text"], df["label"]

#### Convert text to vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build a TF-IDF vectorizer with default settings and replace X (the cleaned
# text column) with its TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so the vocabulary and IDF weights also reflect the
# test rows — consider fitting on the training split only.
vectorization = TfidfVectorizer()
X = vectorization.fit_transform(X)

#### Splitting the dataset into training set and testing set. 

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=7,
)

### 1. Logistic Regression
##### Logistic Regression is a simple linear model that is commonly used for binary classification problems, including fake news detection. It models the relationship between the independent variables and the probability of the binary outcome.

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training split and evaluate on the test split.
LR = LogisticRegression()
LR.fit(x_train, y_train)
pred_lr = LR.predict(x_test)
# Accuracy is derived from the predictions already computed; LR.score() would
# run a second, redundant prediction pass over x_test for the same number.
score = accuracy_score(y_test, pred_lr)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_lr))
# Rows are the true classes, columns the predicted classes, ordered [0, 1].
confusion_matrix(y_test, pred_lr, labels=[0, 1])

### 2. Passive Aggressive Classifier
##### PAC is an online learning algorithm, which means it can adapt to changes in the distribution of fake news as new data is received. It works by iteratively updating a linear classifier based on the features of the news articles.

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Fit the passive-aggressive classifier and evaluate it on the test split.
PAC = PassiveAggressiveClassifier(max_iter=1000)
PAC.fit(x_train, y_train)
pred_pac = PAC.predict(x_test)
# Reuse the predictions for the accuracy; PAC.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_pac)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_pac))

### 3. k-Nearest Neighbors (KNN)
##### The k-Nearest Neighbors (KNN) algorithm can be used for fake news detection by treating each news article as a vector of features and labels, and finding the k nearest neighbors to a new article based on its feature vector. The label of the new article is then predicted based on the labels of its k nearest neighbors.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier and evaluate it on the test split.
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(x_train, y_train)
pred_knn = KNN.predict(x_test)
# Reuse the predictions for the accuracy; KNN.score() would repeat the whole
# neighbour search over x_test to produce the same value.
score = accuracy_score(y_test, pred_knn)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_knn))

### 4. Naive Bayes Classifier
##### Naive Bayes is a simple but effective model for text classification problems, including fake news detection. It assumes independence between features and can handle large amounts of data efficiently.

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial naive Bayes and evaluate it on the test split.
NBC = MultinomialNB()
NBC.fit(x_train, y_train)
pred_nbc = NBC.predict(x_test)
# Reuse the predictions for the accuracy; NBC.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_nbc)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_nbc))

### 5. Decision Tree Classifier
##### Decision Trees are a simple but powerful model for classification problems. They work by recursively splitting the data based on the most significant feature until all of the instances in a leaf node belong to the same class.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default settings and evaluate it on the test split.
DTC = DecisionTreeClassifier()
DTC.fit(x_train, y_train)
pred_dt = DTC.predict(x_test)
# Reuse the predictions for the accuracy; DTC.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_dt)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_dt))

### 6. Random Forest Classifier
##### Random Forest is an ensemble model that uses multiple decision trees to make predictions. It is robust to overfitting and can handle large amounts of data effectively.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest and evaluate it on the test split.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(x_train, y_train)
pred_rfc = RFC.predict(x_test)
# Reuse the predictions for the accuracy; RFC.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_rfc)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_rfc))

### 7. Gradient Boosting Classifier
##### Gradient Boosting is another ensemble model that uses decision trees as weak learners. It can handle complex data distributions and is often used for text classification problems, including fake news detection.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a seeded gradient-boosting classifier and evaluate it on the test split.
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(x_train, y_train)
pred_gbc = GBC.predict(x_test)
# Reuse the predictions for the accuracy; GBC.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_gbc)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_gbc))

### 8. XGBoost
##### XGBoost is an optimized version of the gradient boosting algorithm that has been shown to achieve state-of-the-art results on many machine learning tasks, including fake news detection. It uses gradient boosting with decision trees as weak learners to make predictions.

In [None]:
import xgboost as xgb

# Fit an XGBoost classifier with default settings and evaluate it on the test split.
XGB = xgb.XGBClassifier()
XGB.fit(x_train, y_train)
pred_xgb = XGB.predict(x_test)
# Reuse the predictions for the accuracy; XGB.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_xgb)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_xgb))

### 9. LightGBM
##### LightGBM is another gradient boosting algorithm that uses decision trees as weak learners. It is designed to handle large amounts of data and has been shown to achieve fast training times and good performance on a variety of tasks, including fake news detection.

In [None]:
import lightgbm as lgb

# Fit a LightGBM classifier with default settings and evaluate it on the test split.
LGB = lgb.LGBMClassifier()
LGB.fit(x_train, y_train)
pred_lgb = LGB.predict(x_test)
# The accuracy must come from the LightGBM predictions. Scoring the XGBoost
# model here would print XGBoost's accuracy under the LightGBM heading.
score = accuracy_score(y_test, pred_lgb)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_lgb))

### 10.Support Vector Machine (SVM)
##### SVM is a powerful model for text classification problems. It works well with high-dimensional data and can handle non-linear relationships between features and labels.

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier and evaluate it on the test split.
SVM = SVC(kernel='linear', C=1, random_state=0)
SVM.fit(x_train, y_train)
pred_svm = SVM.predict(x_test)
# Reuse the predictions for the accuracy; SVM.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_svm)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_svm))

### 11. Multilayer Perceptron (MLP)
##### MLP is a type of artificial neural network that can be used for binary or multiclass classification problems, including fake news detection. It consists of multiple hidden layers of artificial neurons that process the input data and make predictions.

In [None]:
from sklearn.neural_network import MLPClassifier

# Fit a single-hidden-layer (100 units) perceptron trained with SGD and
# evaluate it on the test split.
MLP = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, alpha=1e-4, solver='sgd', verbose=False, tol=1e-4, random_state=1, learning_rate_init=.1)
MLP.fit(x_train, y_train)
pred_mlp = MLP.predict(x_test)
# Reuse the predictions for the accuracy; MLP.score() would predict x_test a
# second time to produce the same value.
score = accuracy_score(y_test, pred_mlp)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, pred_mlp))

### Export the Models

In [None]:
import os

import joblib

# joblib.dump does not create missing directories; make sure the target
# folder exists so the export cannot fail after all models have been trained.
os.makedirs("models", exist_ok=True)

# Persist the fitted vectorizer and every trained model as models/<name>.pkl.
_artifacts = {
    "vectorizer": vectorization,
    "LR": LR,
    "PAC": PAC,
    "KNN": KNN,
    "NBC": NBC,
    "DTC": DTC,
    "RFC": RFC,
    "GBC": GBC,
    "XGB": XGB,
    "LGB": LGB,
    "SVM": SVM,
    "MLP": MLP,
}
for _stem, _fitted in _artifacts.items():
    joblib.dump(_fitted, f"models/{_stem}.pkl")

# Model Testing With Manual Entry

### News

In [None]:
def output_lable(n):
    """Translate a predicted class id into a human-readable verdict.

    Returns "Fake News" for 0, "Not A Fake News" for 1 and None for any
    other value.
    """
    if n == 0:
        return "Fake News"
    return "Not A Fake News" if n == 1 else None
    
def manual_testing(news):
    """Classify one news text with every trained model and print the verdicts.

    The text is cleaned with ``wordopt``, vectorised with the fitted
    ``vectorization`` object and passed to each of the eleven classifiers.
    One "<model> Prediction: <verdict>" line is printed per model.

    Args:
        news: The raw news text to classify.

    Returns:
        None (the return value of ``print``).
    """
    cleaned = pd.DataFrame({"text": [news]})["text"].apply(wordopt)
    features = vectorization.transform(cleaned)

    # Order matters: it has to line up with the placeholders in the report.
    classifiers = (LR, PAC, KNN, NBC, DTC, RFC, GBC, XGB, LGB, SVM, MLP)
    verdicts = [output_lable(clf.predict(features)[0]) for clf in classifiers]

    report = (
        "\n\nLR Prediction: {} "
        "\nPAC Prediction: {} "
        "\nKNN Prediction: {} "
        "\nNBC Prediction: {}"
        "\nDT Prediction: {} "
        "\nRFC Prediction: {} "
        "\nGBC Prediction: {} "
        "\nXGB Prediction: {} "
        "\nLGB Prediction: {} "
        "\nSVM Prediction: {} "
        "\nMLP Prediction: {}"
    )
    return print(report.format(*verdicts))

In [None]:
# Read one news text from standard input (input() already returns a str)
# and print every model's verdict for it.
news = input()
manual_testing(news)