In [11]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [1]:
import pandas as pd

# Load the two source files. Label convention used throughout the notebook:
# 1 = real article (True.csv), 0 = fake article (Fake.csv).
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

true_df['label'] = 1
fake_df['label'] = 0

# Stack both sets and shuffle the rows. The fixed random_state makes the
# ordering (and everything derived from it) identical after a kernel restart.
df = (
    pd.concat([true_df, fake_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Keep only the columns the model needs; the explicit copy means the column
# assignment below acts on an independent frame.
df = df[['title', 'text', 'label']].copy()

# Combine headline and body into one text field. Missing values are replaced
# with '' first, because NaN + str would make the whole 'content' value NaN
# and the text-cleaning step would then fail on a float.
df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')


In [2]:
# Preview the combined "title + text" column (pandas truncates the display).
df['content']

Unnamed: 0,content
0,WOW! TOP SPONSORS OF NFL Issue Statements Abou...
1,OBAMA UNHINGED AND ANGRY: “He not doin’ nothin...
2,Senate's McConnell: Draft healthcare bill expe...
3,Big banks' relationship with Dodd-Frank: it's ...
4,Trump taps former NASA head Griffin for deputy...
...,...
44893,U.S. top court seeks more information in contr...
44894,FACEPALM: Gary Johnson Has ‘Aleppo Moment’ – ...
44895,U.S. House panel slams former NSA contractor S...
44896,Exclusive: Moscow lawyer who met Trump Jr. had...


In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

# Resources shared by every clean_text call; filled in lazily on first use.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return ``(stop_words, stemmer)``, building them once and reusing them afterwards."""
    if not _CLEAN_TEXT_CACHE:
        # stopwords.words() builds a new list on every call; a frozenset is
        # created once here and gives O(1) membership tests.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _CLEAN_TEXT_CACHE.update(stop_words=stop_words, stemmer=stemmer)
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['stemmer']


def clean_text(text):
    """Normalize raw article text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; replace
    non-word characters with spaces; strip leftover punctuation (only ``_``
    survives the previous step); drop digits; remove English stop words;
    stem each remaining token.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: Cleaned tokens joined by single spaces (empty string if nothing
        is left).
    """
    stop_words, stemmer = _clean_text_resources()

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)

    # Filter and stem in a single pass over the tokens.
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

# Clean every article with clean_text; the result goes into a new column so
# the original 'content' text stays available.
df['clean_content'] = df['content'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# What we have done so far

| Step                                  | What it Does                                        | Why it Matters                                          |
| ------------------------------------- | --------------------------------------------------- | ------------------------------------------------------- |
| **1. `text.lower()`**                 | Converts all text to lowercase                      | So “News” and “news” are treated the same               |
| **2. `re.sub(r'\[.*?\]', '', text)`** | Removes anything in square brackets like `[source]` | These are usually unnecessary notes or references       |
| **3. Remove URLs**                    | `http://...`, `www...`                              | Links don’t help in understanding the article’s meaning |
| **4. Remove non-words (`\W`)**        | Gets rid of things like `@`, `#`, `!`               | They’re not useful for our model                        |
| **5. Remove punctuation**             | Like `.,!?`                                         | Clean, simple words are easier to work with             |
| **6. Remove numbers**                 | `2020`, `123`                                       | Numbers often don’t help us detect fake vs real news    |
| **7. Remove stopwords**               | Words like "the", "is", "on", "at"                  | These are very common words that don’t add value        |
| **8. Stemming**                       | Turns words like "running", "runs" → "run"          | Groups similar words together to simplify input         |


In [4]:
# Inspect the cleaned text column produced by clean_text.
df['clean_content']

Unnamed: 0,clean_content
0,wow top sponsor nfl issu statement nation anth...
1,obama unhing angri doin nothin
2,senat mcconnel draft healthcar bill expect thu...
3,big bank relationship dodd frank complic reute...
4,trump tap former nasa head griffin deputi defe...
...,...
44893,u top court seek inform contracept insur case ...
44894,facepalm gari johnson aleppo moment name singl...
44895,u hous panel slam former nsa contractor snowde...
44896,exclus moscow lawyer met trump jr russian spi ...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned text into TF-IDF features, keeping the 5000 highest-frequency terms.
tfidf = TfidfVectorizer(max_features=5000)

# fit_transform returns a SciPy sparse matrix. It is kept sparse on purpose:
# a dense float64 copy of ~45k documents x 5000 features needs roughly 1.8 GB,
# while train_test_split and the linear classifier below accept sparse input.
X = tfidf.fit_transform(df['clean_content'])

# Target vector: 1 = real, 0 = fake.
y = df['label']


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hold out 20% of the articles for evaluation; the fixed seed keeps the split stable.
# NOTE(review): X was vectorized on the full corpus before this split, so the
# TF-IDF vocabulary/IDF weights have already seen the test articles. The scores
# below may therefore be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state pins the classifier's per-epoch shuffling of the training data,
# so the fitted weights and the metrics below are reproducible across runs.
model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9935412026726058
Confusion Matrix:
 [[4641   24]
 [  34 4281]]
Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4665
           1       0.99      0.99      0.99      4315

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [8]:
import os

# Make sure the output directory for saved artifacts exists
# (exist_ok=True: no error if it is already there).
os.makedirs("models", exist_ok=True)


In [9]:
import pickle

# Persist the trained classifier.
with open("models/model.pkl", "wb") as f:
    pickle.dump(model, f)

# The classifier alone cannot score new text: inference also needs the fitted
# TF-IDF vectorizer (its vocabulary and IDF weights), so persist that as well.
with open("models/vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)


In [12]:
import streamlit as st
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained classifier.
# NOTE: only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open("models/model.pkl", "rb") as f:
    model = pickle.load(f)

st.title("📰 Fake News Classifier")

user_input = st.text_area("Enter the news article text:")

if st.button("Predict"):
    if not user_input.strip():
        # Nothing to classify; ask for input instead of predicting on an empty document.
        st.warning("Please enter some text to classify.")
    else:
        # Reuse the preprocessing (clean_text) and the already-fitted vectorizer
        # (tfidf) from the training cells. Fitting a fresh TfidfVectorizer on the
        # single input would build a different vocabulary, so its columns would
        # not correspond to the features the model was trained on.
        # To run this as a standalone `streamlit run` script, the fitted
        # vectorizer and clean_text must be made available to that script too.
        transformed_input = tfidf.transform([clean_text(user_input)])
        prediction = model.predict(transformed_input)
        label = "Fake News 🟥" if prediction[0] == 0 else "Real News 🟩"
        st.success(f"Prediction: {label}")


2025-05-28 07:17:19.287 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-05-28 07:17:19.302 Session state does not function when running a script without `streamlit run`


In [13]:
text = "The government has announced a new policy to reduce carbon emissions."
# Run the same cleaning used on the training data before vectorizing; the
# TF-IDF vocabulary consists of lowercased, stemmed tokens, so raw text would
# match it only partially.
prediction = model.predict(tfidf.transform([clean_text(text)]))
print("Real" if prediction[0] == 1 else "Fake")


Fake
