<a href="https://colab.research.google.com/github/aaradhya466/AI_Driven_Fake_News_Detection/blob/main/FAKE_NEWS_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Read the fake-news and true-news CSV files into separate DataFrames.
path, path2 = (
    "/content/drive/MyDrive/Dataset/Fake.csv",
    "/content/drive/MyDrive/Dataset/True.csv",
)
df_fake, df_true = (pd.read_csv(csv_path) for csv_path in (path, path2))

In [None]:
# Label the rows (fake news = 0, real news = 1) and strip the Reuters tag from the real articles.
df_fake["target"] = 0
df_true["target"] = 1
# NOTE: with regex=True the unescaped parentheses form a capture group, so this
# removes every occurrence of the word "Reuters" and leaves the literal "()"
# behind; those leftover parentheses are stripped later by wordopt().
df_true["text"] = df_true["text"].replace(to_replace="(Reuters)", value="", regex=True)


In [None]:
# Discard the title, subject and date columns from both frames.
UNUSED_COLUMNS = ["title", "subject", "date"]
df_fake = df_fake.drop(columns=UNUSED_COLUMNS)
df_true = df_true.drop(columns=UNUSED_COLUMNS)

In [None]:
# Stack both datasets into one DataFrame, shuffle the rows and rebuild the index.
# The shuffle is seeded: without it the (seeded) train/test split below would
# still see a different row order on every run, so metrics were not reproducible.
df = (
    pd.concat([df_fake, df_true], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# Text cleaning function
def wordopt(text):
    """Normalise a piece of raw news text for vectorisation.

    Steps, in order: lower-case; drop ``[...]`` spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; drop
    parentheses; turn every remaining non-word character into a space;
    drop underscores; drop any token that contains a digit.

    URL and tag removal must run *before* the non-word replacement: once
    ``:``, ``/``, ``.``, ``<`` and ``>`` have been turned into spaces those
    patterns can no longer match and URL fragments would leak into the text.

    Args:
        text: The raw text. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as empty text.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens
        may leave consecutive spaces behind.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[()]', '', text)
    # Every non-word character (punctuation, newlines, ...) becomes a space.
    text = re.sub(r'\W', ' ', text)
    # After the step above the only punctuation character left is "_",
    # because the regex engine counts it as a word character.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


In [None]:
# Run every article's text through the wordopt() cleaner.
df["text"] = df["text"].map(wordopt)

In [None]:
# Hold out 25% of the rows for testing; the remaining 75% is used for training.
X, Y = df["text"], df["target"]
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)


In [None]:
# TF-IDF features over unigrams and bigrams, capped at 5000 terms.
# The vocabulary is fitted on the training split only and then reused for the test split.
vectorization = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
xv_train = vectorization.fit_transform(X_train)
xv_test = vectorization.transform(X_test)

In [None]:
# The five classifiers to compare, keyed by display name.
# - The stochastic estimators are seeded so repeated runs give the same scores.
# - `use_label_encoder` is no longer passed to XGBClassifier: the installed
#   XGBoost ignores it and emits a "Parameters ... are not used" warning.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}

In [None]:
# Fit each model on the training features, then report its accuracy and
# per-class metrics on the held-out test features.
for name, model in models.items():
    model.fit(xv_train, Y_train)
    # Predict once and reuse the result for both reports; calling model.score()
    # and model.predict() separately runs inference over the test set twice.
    Y_pred = model.predict(xv_test)
    score = accuracy_score(Y_test, Y_pred)
    print(f"The Accuracy of the {name} Model is {score:.4f}") # Model score
    print(classification_report(Y_test, Y_pred)) # Classification Report


The Accuracy of the Logistic Regression Model is 0.9848
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5919
           1       0.98      0.99      0.98      5306

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225

The Accuracy of the Decision Tree Model is 0.9589
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      5919
           1       0.97      0.95      0.96      5306

    accuracy                           0.96     11225
   macro avg       0.96      0.96      0.96     11225
weighted avg       0.96      0.96      0.96     11225

The Accuracy of the Random Forest Model is 0.9889
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5919
           1       0.99      0.99      0.99      5306

    accuracy              

Parameters: { "use_label_encoder" } are not used.



The Accuracy of the XGBoost Model is 0.9920
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5919
           1       0.99      0.99      0.99      5306

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [None]:
# Manual testing function
def output_label(n):
    """Translate a numeric class prediction into its display label.

    Returns "Fake News" when ``n`` equals 0 and "Not A Fake News" for any
    other value.
    """
    if n == 0:
        return "Fake News"
    return "Not A Fake News"

def manual_testing(news):
    """Classify a single piece of news text with every model in ``models``.

    The text is cleaned with ``wordopt`` and vectorised with the already
    fitted ``vectorization`` before being passed to each model.

    Args:
        news: The raw news text to classify.

    Returns:
        A dict mapping each model's display name to the label produced by
        ``output_label`` for that model's prediction.
    """
    cleaned = pd.Series([news], name="text").apply(wordopt)
    features = vectorization.transform(cleaned)

    results = {}
    for name, model in models.items():
        predicted_class = model.predict(features)[0]
        results[name] = output_label(predicted_class)
    return results

In [None]:
# Ask for a piece of news text and print each model's verdict.
news = input("Enter news headline: ")  # input() already returns a str
predictions = manual_testing(news)
# Loop names deliberately avoid `model`, which would rebind the global left
# behind by the training loop to a plain string.
for model_name, verdict in predictions.items():
    print(f"{model_name} Prediction: {verdict}")