In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string
from nltk.stem import PorterStemmer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


In [2]:
df=pd.read_csv("../Data/fake_news_dataset.csv")

df["full_text"] = df["title"].fillna('') + " " + df["text"].fillna('')

df[["label", "full_text"]].head()


Unnamed: 0,label,full_text
0,real,Foreign Democrat final. more tax development b...
1,fake,To offer down resource great point. probably g...
2,fake,Himself church myself carry. them identify for...
3,fake,You unit its should. phone which item yard Rep...
4,fake,Billion believe employee summer how. wonder my...


In [3]:
df['label'] = df['label'].astype(str).str.strip().str.lower()
df["label"] = df["label"].map({"fake": 0, "real": 1})

In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [5]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(word) 
        for word in tokens if word not in stop_words
    ]
    return " ".join(tokens)

documents = df["full_text"].apply(clean_text).tolist()
df["cleaned_text"] = documents


In [6]:
X = df["cleaned_text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
vectorizer_all = CountVectorizer(max_features=5000)
X_all = vectorizer_all.fit_transform(documents)
vocab = vectorizer_all.get_feature_names_out()
X_array = X_all.toarray()
labels = y.tolist()

In [8]:
fake_indices = [i for i, label in enumerate(labels) if label == 0]
real_indices = [i for i, label in enumerate(labels) if label == 1]

In [9]:
fake_mean = np.mean(X_array[fake_indices], axis=0)
real_mean = np.mean(X_array[real_indices], axis=0)

In [10]:
word_scores = fake_mean - real_mean
df_keywords = pd.DataFrame({
    "word": vocab,
    "fake_score": word_scores
}).sort_values(by="fake_score", ascending=False)

In [15]:
threshold = 0.023
important_words = df_keywords[df_keywords["fake_score"] > threshold]["word"].tolist()

In [None]:
vectorizer_selected = CountVectorizer(vocabulary=important_words)
X_selected = vectorizer_selected.fit_transform(documents)

In [38]:
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(df["cleaned_text"]) 


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM (Linear)": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{name}\n Accuracy: {acc:.4f}\n F1 Score: {f1:.4f}\n" + "-"*40)


Logistic Regression
 Accuracy: 0.5012
 F1 Score: 0.4928
----------------------------------------
Naive Bayes
 Accuracy: 0.5100
 F1 Score: 0.4528
----------------------------------------
SVM (Linear)
 Accuracy: 0.5005
 F1 Score: 0.4921
----------------------------------------
Random Forest
 Accuracy: 0.5002
 F1 Score: 0.4752
----------------------------------------
