In [3]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')
df = pd.read_csv("/content/drive/MyDrive/AI_news_dataset/Fake_Real.csv")
print(df.head())


Mounted at /content/drive
   Unnamed: 0                                               text  \
0           0  new york reuters us environmental group sierra...   
1           1  washington reuters us air force asked industry...   
2           2  saturday paul ryan posted photo instagram phot...   
3           3  america keeps waiting word hillary indicted ob...   
4           4                   religion peace ht weasel zippers   

        subject target  
0  politicsNews   True  
1  politicsNews   True  
2          News   Fake  
3      politics   Fake  
4     left-news   Fake  


In [7]:
print(df['input_text'].isna().sum())


632


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

print("Original dataset shape:", df.shape)
print(df.head())

df['input_text'] = df['subject'].fillna('') + " " + df['text'].fillna('')

df = df[df['input_text'].str.strip() != '']

print("Cleaned dataset shape:", df.shape)

# Features and labels
X = df['input_text']
y = df['target']  # 0 = True, 1 = Fake

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


def train_evaluate(model, X_train_vec, y_train, X_test_vec, y_test, model_name):
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    print(f"===== {model_name} =====")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))
    print("\n")
    return model

# Logistic Regression
lr_model = train_evaluate(LogisticRegression(max_iter=1000), X_train_vec, y_train, X_test_vec, y_test, "Logistic Regression")

# Random Forest
rf_model = train_evaluate(RandomForestClassifier(n_estimators=100, random_state=42), X_train_vec, y_train, X_test_vec, y_test, "Random Forest")

# Naive Bayes
nb_model = train_evaluate(MultinomialNB(), X_train_vec, y_train, X_test_vec, y_test, "Naive Bayes")

import os
os.makedirs("models", exist_ok=True)

joblib.dump(lr_model, "models/logistic_regression.pkl")
joblib.dump(rf_model, "models/random_forest.pkl")
joblib.dump(nb_model, "models/naive_bayes.pkl")
joblib.dump(tfidf, "models/tfidf_vectorizer.pkl")

print("Models and vectorizer saved in 'models/' folder.")


Original dataset shape: (44898, 5)
   Unnamed: 0                                               text  \
0           0  new york reuters us environmental group sierra...   
1           1  washington reuters us air force asked industry...   
2           2  saturday paul ryan posted photo instagram phot...   
3           3  america keeps waiting word hillary indicted ob...   
4           4                   religion peace ht weasel zippers   

        subject target                                         input_text  
0  politicsNews   True  politicsNews new york reuters us environmental...  
1  politicsNews   True  politicsNews washington reuters us air force a...  
2          News   Fake  News saturday paul ryan posted photo instagram...  
3      politics   Fake  politics america keeps waiting word hillary in...  
4     left-news   Fake         left-news religion peace ht weasel zippers  
Cleaned dataset shape: (44898, 5)
===== Logistic Regression =====
Accuracy: 0.9922048997772829
     