In [58]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [59]:
from google.colab import drive

# Attach Google Drive to the Colab runtime so the dataset under /content/drive can be read.
drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:

# Read the labelled news CSV from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/archive (13)/fake_and_real_news.csv'
df = pd.read_csv(csv_path)

In [61]:
# Preview the first five rows (5 is the default for head).
df.head(n=5)

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [62]:
# Class balance check; dropna=False makes missing labels show up as their own row.
df.loc[:, 'label'].value_counts(dropna=False)


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4900


In [63]:
# List the distinct raw label values in order of first appearance.
pd.unique(df['label'])


array(['Fake', 'Real'], dtype=object)

In [64]:
# Cast the label column to strings before it is mapped to numeric codes.
# NOTE(review): astype(str) would turn a missing label into the literal text 'nan'.
label_text = df['label'].astype(str)
df['label'] = label_text


In [65]:
# Re-check the distinct labels after the string cast.
label_column = df['label']
label_column.unique()

array(['Fake', 'Real'], dtype=object)

In [66]:
# Encode the string labels as integers: Fake -> 0, Real -> 1.
label_codes = {'Fake': 0, 'Real': 1}

# Series.map turns every value that is missing from the dict into NaN, so running
# this cell a second time (labels already 0/1) would silently wipe the whole
# column. Only map when the column is not already encoded, which keeps the cell
# safe to re-run.
if not df['label'].isin(list(label_codes.values())).all():
    encoded = df['label'].map(label_codes)

    # Fail loudly on unexpected labels instead of passing NaN targets downstream.
    unmapped = df.loc[encoded.isna(), 'label'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected label values: {unmapped.tolist()}")

    df['label'] = encoded.astype('int64')


In [67]:
# Bare expression: renders the DataFrame as the cell output, here after the
# label column has been remapped to 0/1.
df

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0
1,U.S. conservative leader optimistic of common ...,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",1
3,Court Forces Ohio To Allow Millions Of Illega...,0
4,Democrats say Trump agrees to work on immigrat...,1
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,0
9896,Trump consults Republican senators on Fed chie...,1
9897,Trump lawyers say judge lacks jurisdiction for...,1
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,0


In [69]:
# Features are the article text column (a 1D Series); the target is the numeric label column.
X, y = df['Text'], df['label']


In [70]:
# train_test_split is already imported in the notebook's top import cell, so it
# is not re-imported here.
# Hold out 20% of the rows for evaluation. stratify=y keeps the class ratio the
# same in both partitions, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity check: the two partitions together must account for every row.
assert len(X_train) + len(X_test) == len(X)


In [71]:
# TfidfVectorizer is already imported in the notebook's top import cell, so it
# is not re-imported here.
# Cap the vocabulary at 5000 terms and drop scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Fit the vocabulary and IDF weights on the training text only; the test text is
# merely transformed with the already-fitted vectorizer.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


In [72]:
# Show the shapes of the train and test TF-IDF matrices as a pair.
tuple(matrix.shape for matrix in (X_train_vec, X_test_vec))

((7920, 5000), (1980, 5000))

In [73]:
# LogisticRegression is already imported in the notebook's top import cell, so
# it is not re-imported here.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)

# fit returns the estimator itself, so leaving it as the last expression also
# displays the fitted model as the cell output.
model.fit(X_train_vec, y_train)


In [74]:
# accuracy_score and classification_report are already imported in the
# notebook's top import cell, so they are not re-imported here.
# Predict labels for the held-out test matrix.
y_pred = model.predict(X_test_vec)

# Overall accuracy, followed by per-class precision / recall / F1
# (class 0 = Fake, class 1 = Real in this notebook's label encoding).
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.990909090909091
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.99      0.99      0.99       980

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [77]:
# Sample article used for a one-off prediction. The leading and trailing
# newlines are part of the value, exactly as in a triple-quoted block.
news_text = (
    "\n"
    "The government announced a new policy to improve digital education across rural areas."
    "\n"
)


In [78]:
# The sample article is wrapped in a one-element list before being vectorized.
single_article_batch = [news_text]
news_vec = tfidf.transform(single_article_batch)


In [79]:
# Only one article was vectorized, so take the first (and only) prediction.
predicted_labels = model.predict(news_vec)
pred = predicted_labels[0]


In [80]:
# Translate the numeric prediction back into a readable verdict
# (1 = Real, 0 = Fake in this notebook's label encoding).
# The emoji were stored as mojibake (UTF-8 bytes decoded as cp1252); they are
# restored here to the intended green / red circle characters.
if pred == 1:
    print("🟢 REAL NEWS")
else:
    print("🔴 FAKE NEWS")


🟢 REAL NEWS


In [81]:
import pickle

# Persist the trained Logistic Regression model first, then the fitted TF-IDF
# vectorizer, each to its own pickle file in the working directory.
for artifact, filename in (
    (model, "logistic_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)


In [82]:
from google.colab import files

# Request a download of each saved pickle file, model first.
for artifact_name in ("logistic_model.pkl", "tfidf_vectorizer.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>