# Fake and real news dataset

In [3]:
import os
import zipfile
from pathlib import Path

import pandas as pd

# Extract ZIP
# The archive location can be overridden through the FAKE_NEWS_ARCHIVE
# environment variable; the fallback is the path this notebook was written with.
ARCHIVE_PATH = Path(
    os.environ.get("FAKE_NEWS_ARCHIVE", r"C:\Users\jvina\Downloads\archive (1).zip")
)
DATA_DIR = Path("Fake_and_real_news_dataset")

# Skip extraction when both CSVs are already in place, so re-running the
# notebook neither needs the archive nor rewrites the files.
if not all((DATA_DIR / csv_name).is_file() for csv_name in ("Fake.csv", "True.csv")):
    with zipfile.ZipFile(ARCHIVE_PATH, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

# Load datasets
def load_data(filename):
    """Read the CSV file at ``filename`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(filename)
    return frame

# Read the fake-news and real-news CSVs, in that order
fake, true = (
    load_data(csv_path)
    for csv_path in (
        "Fake_and_real_news_dataset/Fake.csv",
        "Fake_and_real_news_dataset/True.csv",
    )
)


# Preview the first five rows of the fake-news frame
fake.head(5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


# Check for missing values

In [4]:
# Per-column count of missing values in the fake-news frame
fake.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [5]:
# Per-column count of missing values in the real-news frame
true.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

# Count duplicates in dataset

In [6]:
# Number of rows that repeat an earlier row in full (first occurrence not counted)
fake.duplicated(keep="first").sum()

3

In [7]:
# Number of rows that repeat an earlier row in full (first occurrence not counted)
true.duplicated(keep="first").sum()

206

In [8]:
# Show duplicate rows
fake.loc[fake.duplicated()]

Unnamed: 0,title,text,subject,date
9942,HILLARY TWEETS MESSAGE In Defense Of DACA…OOPS...,No time to waste we've got to fight with eve...,politics,"Sep 9, 2017"
11446,FORMER DEMOCRAT WARNS Young Americans: “Rioter...,"Who is silencing political speech, physically...",politics,"Mar 10, 2017"
14925,[VIDEO] #BlackLivesMatter Terrorists Storm Dar...,They were probably just looking for a safe sp...,politics,"Nov 16, 2015"


# Remove duplicates

In [9]:
# Keep the first occurrence of every fully repeated row in each frame
fake, true = (
    fake.drop_duplicates(keep="first"),
    true.drop_duplicates(keep="first"),
)

# Preprocess text

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    """Return a copy of ``df`` with ``title`` and ``text`` lower-cased.

    The input frame is left untouched: ``assign`` builds a new DataFrame, so
    the caller's data is not mutated and no chained-assignment warning is
    raised when ``df`` is itself a filtered result of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing string columns ``title`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, where ``title``
        and ``text`` have been lower-cased.
    """
    return df.assign(
        title=df['title'].str.lower(),
        text=df['text'].str.lower(),
    )

fake = preprocess_data(fake)
true = preprocess_data(true)

# Combine datasets
# `assign` returns new frames, so the label column is never written into a
# frame that may still be tied to the pre-deduplication data.
fake = fake.assign(label=0)   # Fake news = 0
true = true.assign(label=1)   # True news = 1

# Both source frames carry their own row labels starting near 0; renumber the
# combined frame so every row has a unique index label.
data = pd.concat([fake, true], ignore_index=True)

# Combine title and text into one column
data['content'] = data['title'] + " " + data['text']

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(data['content'])

# Labels
y = data['label']

# Train-Test Split (Same for All Models)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training features (`fit` returns the estimator),
# then predict labels for the test features
log_reg = LogisticRegression(max_iter=200).fit(X_train, y_train)
log_reg_pred = log_reg.predict(X_test)

# Naive Bayes (MultinomialNB for text)

In [13]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial Naive Bayes on the training features (`fit` returns the
# estimator), then predict labels for the test features
nb_model = MultinomialNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest with a fixed seed (`fit` returns the estimator),
# then predict labels for the test features
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Evaluate and Compare Models

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reuse the estimators already fitted in the cells above instead of building
# and re-training three models with identical settings.
models = {
    "Logistic Regression": log_reg,
    "Naive Bayes": nb_model,
    "Random Forest": rf_model,
}

# Evaluate each fitted model on the held-out test split
for name, model in models.items():
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{name}\nAccuracy: {acc}\nPrecision: {prec}\nRecall: {rec}\nF1 Score: {f1}\n")

Logistic Regression
Accuracy: 0.9878048780487805
Precision: 0.9848305285612704
Recall: 0.9892857142857143
F1 Score: 0.9870530941917093

Naive Bayes
Accuracy: 0.9346609979861267
Precision: 0.9346153846153846
Recall: 0.9257142857142857
F1 Score: 0.9301435406698565

Random Forest
Accuracy: 0.9976504810919669
Precision: 0.9980929678188319
Recall: 0.996904761904762
F1 Score: 0.9974985110184633



In [15]:
import pickle

# Example: persist Logistic Regression as the chosen model.
# The estimator fitted earlier is reused as-is rather than training another
# LogisticRegression with the same settings on the same data.
# NOTE(review): the evaluation output recorded in this notebook shows Random
# Forest scoring higher than Logistic Regression — confirm which model should
# actually be saved.
best_model = log_reg

# Save the trained model
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Save the TF-IDF vectorizer
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
