In [1]:
# STEP 1: Install NLTK tools
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# STEP 2: Upload files manually before running this code
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# STEP 3: Read your data
real = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")
real["label"] = 1
fake["label"] = 0

# STEP 4: Combine and shuffle
df = pd.concat([real, fake], ignore_index=True).sample(frac=1, random_state=42)

# STEP 5: Clean the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean(text):
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))
    words = text.lower().split()
    return ' '.join(lemmatizer.lemmatize(w) for w in words if w not in stop_words)

df['content'] = (df['title'].fillna('') + ' ' + df['text'].fillna('')).apply(clean)

# STEP 6: Vectorize (give numbers to text)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['content'])
y = df['label']

# STEP 7: Train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# STEP 8: Test it!
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))

# STEP 9: Save model
joblib.dump(model, "fake_news_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Accuracy: 0.987750556792873
Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4669
           1       0.98      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



['tfidf_vectorizer.pkl']