<a href="https://colab.research.google.com/github/adhora7/Fake-News-Detection/blob/main/Fake_News_Detection_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk

In [None]:
# Fetch the NLTK resources this notebook imports from: the stop-word corpus
# (`nltk.corpus.stopwords`) and WordNet (used by `WordNetLemmatizer`).
nltk.download('stopwords')
# Trailing expression: its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Path of the labelled news CSV (located under /content).
NEWS_CSV_PATH = '/content/news.csv'
news_dataset = pd.read_csv(NEWS_CSV_PATH)

Dataset Diagnosis


In [None]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB
None


In [None]:
# Show the column names as a plain Python list.
print(list(news_dataset.columns))

['Unnamed: 0', 'title', 'text', 'label']


In [None]:
# Plain-text preview of the first five rows (the default for head()).
print(news_dataset.head(n=5))

   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


In [None]:
# Class balance of the label column before it is encoded.
raw_label_counts = news_dataset['label'].value_counts()
print(raw_label_counts)

label
REAL    3171
FAKE    3164
Name: count, dtype: int64


In [None]:
# Integer encoding of the string labels: REAL -> 0, FAKE -> 1.
label_mapping = {'REAL': 0, 'FAKE': 1}

# Only encode while the column still holds the raw (non-numeric) labels, so
# re-running this cell leaves already-encoded labels untouched.
if not pd.api.types.is_numeric_dtype(news_dataset['label']):
    # Series.map avoids the deprecated silent downcasting that
    # Series.replace warns about when swapping strings for integers.
    encoded_labels = news_dataset['label'].map(label_mapping)

    # map() yields NaN for anything missing from the mapping; fail loudly
    # rather than letting unknown labels leak into training.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_values = sorted(set(news_dataset.loc[unmapped, 'label'].astype(str)))
        raise ValueError(f"Unexpected label values: {unknown_values}")

    news_dataset['label'] = encoded_labels.astype(int)

  news_dataset['label'] = news_dataset['label'].replace(label_mapping)


In [None]:
# Class balance again, now that the label column has been encoded.
encoded_label_counts = news_dataset['label'].value_counts()
print(encoded_label_counts)

label
0    3171
1    3164
Name: count, dtype: int64


In [None]:
# Replace every missing value in the frame with an empty string.
news_dataset = news_dataset.fillna(value='')

 Checking which columns have actual content

In [None]:
# Free-text columns to inspect for usable article content.
content_cols_to_check = [
    'title',
    'text',
]

In [None]:
# For each candidate column that exists, report how many rows contain
# something other than whitespace.
total_rows = len(news_dataset)
for column in content_cols_to_check:
    if column not in news_dataset.columns:
        continue

    stripped_values = news_dataset[column].astype(str).str.strip()
    non_empty = stripped_values.ne('').sum()

    print(f"{column}:")
    print(f"  Non-empty (after fillnull): {non_empty}/{total_rows}")



title:
  Non-empty (after fillnull): 6335/6335
text:
  Non-empty (after fillnull): 6299/6335


In [None]:

# Pick the column that supplies the article text. Candidates are tried in
# priority order; the first one that exists and is non-blank in more than
# 70% of the rows wins. Each candidate carries the message printed when it
# is selected.
content_candidates = (
    ('text', "\n Using 'text' column for content"),
    ('title', "\n⚠️ No substantial 'text' column found, using 'title'"),
    ('subject', "\n⚠️ No substantial 'text' or 'title' column found, using 'subject'"),
)
min_filled_rows = len(news_dataset) * 0.7

content_column_used = None
for candidate, selection_message in content_candidates:
    if candidate not in news_dataset.columns:
        continue

    candidate_text = news_dataset[candidate].astype(str)
    filled_rows = (candidate_text.str.strip().str.len() > 0).sum()
    if filled_rows > min_filled_rows:
        print(selection_message)
        news_dataset['content'] = candidate_text
        content_column_used = candidate
        break

if content_column_used is None:
    # Previously this path fell through to news_dataset['content'] and died
    # with an opaque KeyError when no such column existed.
    if 'content' not in news_dataset.columns:
        raise ValueError(
            "No usable content column: none of 'text', 'title' or 'subject' "
            "is present with non-empty values in more than 70% of the rows."
        )
    # A pre-existing 'content' column is kept, normalised to strings.
    news_dataset['content'] = news_dataset['content'].astype(str)


 Using 'text' column for content


In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

# Materialise the English stop-word list once. Calling stopwords.words()
# inside the token loop rebuilt the list for every single word and then
# scanned it linearly; a frozenset gives constant-time membership tests.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Raw-string patterns: the previous non-raw '[^a-zA-Z\s]' literal relied on
# the invalid escape sequence '\s', which Python reports with a warning.
_URL_RE = re.compile(r'http\S+')
_EMAIL_RE = re.compile(r'\S+@\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(content):
    """Normalise raw article text into a space-separated string of lemmas.

    Steps, in order: strip URLs, strip e-mail addresses, replace every
    character that is neither an ASCII letter nor whitespace with a space,
    lower-case, split on whitespace, drop English stop words and tokens of
    two characters or fewer, and lemmatise what remains.

    Args:
        content: Raw text. Anything that is not a ``str``, or a string
            shorter than 10 characters, is treated as having no content.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when the input
        is rejected or nothing survives the filtering.
    """
    if not isinstance(content, str) or len(content) < 10:
        return ""

    content = _URL_RE.sub('', content)
    content = _EMAIL_RE.sub('', content)
    content = _NON_LETTER_RE.sub(' ', content)
    tokens = content.lower().split()

    # Filter first, lemmatise second, so stop words are matched on the
    # surface form exactly as before.
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if len(token) > 2 and token not in ENGLISH_STOPWORDS
    )

  content = re.sub('[^a-zA-Z\s]', ' ', content)


In [None]:
# Clean every document in place; 'content' holds the processed text afterwards.
print(f"\nPreprocessing text from '{content_column_used}' column...")
cleaned_content = news_dataset['content'].map(preprocess_text)
news_dataset['content'] = cleaned_content



Preprocessing text from 'text' column...


In [None]:
# Drop documents that are too short to be informative once preprocessed.
min_words = 10  # minimum number of cleaned tokens a document must keep

# Token count per document (the cleaned text is whitespace-separated).
tokens_per_document = news_dataset['content'].str.split().str.len()
news_dataset['word_count'] = tokens_per_document
print("\nWord count statistics (after preprocessing):")
print(news_dataset['word_count'].describe())

has_enough_words = news_dataset['word_count'] >= min_words
news_dataset = news_dataset.loc[has_enough_words]
print(f"\nDataset shape after removing short documents: {news_dataset.shape}")


Word count statistics (after preprocessing):
count    6335.000000
mean      425.133860
std       452.511623
min         0.000000
25%       160.000000
50%       333.000000
75%       560.000000
max      9974.000000
Name: word_count, dtype: float64

Dataset shape after removing short documents: (6187, 6)


In [None]:
# Compare average document length across the two classes.
if news_dataset.empty:
    print("\nNo data remaining after filtering by word count.")
else:
    print("\nAverage word count by label (after filtering):")
    mean_words_by_label = news_dataset.groupby('label')['word_count'].mean()
    print(mean_words_by_label)


# Class balance once the short documents are gone.
print("\nLabel Distribution (after filtering short docs):")
filtered_label_counts = news_dataset['label'].value_counts()
print(filtered_label_counts)


Average word count by label (after filtering):
label
0    494.974401
1    375.683973
Name: word_count, dtype: float64

Label Distribution (after filtering short docs):
label
1    3101
0    3086
Name: count, dtype: int64


In [None]:
# Print a few cleaned articles from each class so the labels can be
# sanity-checked by eye after filtering.

print("CHECKING LABEL QUALITY (After Filtering):")


if not news_dataset.empty:
    # (heading to print, encoded label value, prefix for each sample line)
    sample_groups = (
        ("\nSample REAL news (label=0):", 0, "Real"),
        ("\n\nSample FAKE news (label=1):", 1, "Fake"),
    )

    for heading, label_value, tag in sample_groups:
        print(heading)
        belongs_to_class = news_dataset['label'] == label_value
        class_samples = news_dataset.loc[belongs_to_class, 'content'].head(3)
        for position, article in enumerate(class_samples, start=1):
            # Only the first 200 characters of each article are shown.
            print(f"\n{tag} {position}: {article[:200]}...")



CHECKING LABEL QUALITY (After Filtering):

Sample REAL news (label=0):

Real 1: secretary state john kerry said monday stop paris later week amid criticism top american official attended sunday unity march terrorism kerry said expects arrive paris thursday evening head home week ...

Real 2: primary day new york front runner hillary clinton donald trump leading poll trump vowing win enough delegate clinch republican nomination prevent contested convention sen ted cruz texas bernie sander ...

Real 3: czech stockbroker saved jewish child nazi germany died age dubbed britain schindler nicholas winton arranged transport jewish youngster prague germany annexed czechoslovakia march though child origina...


Sample FAKE news (label=1):

Fake 1: daniel greenfield shillman journalism fellow freedom center new york writer focusing radical islam final stretch election hillary rodham clinton gone war fbi word unprecedented thrown around often ele...

Fake 2: google pinterest digg linkedin reddit 

In [None]:
# Bind every name the later cells read *before* branching. Previously an
# empty dataset left X, Y, feature_names and vectorizer undefined, so the
# following cells failed with NameError instead of taking their
# "is not None" guard paths.
X = None
Y = None
feature_names = np.array([], dtype=object)
vectorizer = None

if news_dataset.shape[0] > 0:

    print("TRAINING MODELS")


    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),           # Use 1-2 grams
        max_features=7000,            # Limit features to a reasonable number
        min_df=5,                      # Must appear in at least 5 docs
        max_df=0.8,                   # Ignore terms in >80% of docs
        sublinear_tf=True,
        use_idf=True
    )

    try:
        X = vectorizer.fit_transform(news_dataset['content'])
        Y = news_dataset['label'].values
        print(f"\nFeature matrix shape: {X.shape}")
        print(f"Label distribution:\n{pd.Series(Y).value_counts()}")

        feature_names = vectorizer.get_feature_names_out()
        print(f"\nSample features: {list(feature_names[:20])}")

    except Exception as e:
        # Best-effort: report the failure and leave the sentinels in place so
        # downstream cells skip training.
        print(f"\nError during TF-IDF vectorization: {e}")
        X = None # Indicate failure
        Y = None # Indicate failure
        feature_names = np.array([], dtype=object)
        # An unfitted vectorizer must not pass the later "is not None" check.
        vectorizer = None
else:
    print("\nNo documents available; skipping TF-IDF vectorization.")



TRAINING MODELS

Feature matrix shape: (6187, 7000)
Label distribution:
1    3101
0    3086
Name: count, dtype: int64

Sample features: ['abandon', 'abandoned', 'abc', 'abc news', 'abdullah', 'abedin', 'ability', 'able', 'aboard', 'abortion', 'abortion right', 'abraham', 'abroad', 'absence', 'absent', 'absentee', 'absolute', 'absolutely', 'absurd', 'abu']


In [None]:
# Split into train/test sets only when vectorisation succeeded and a
# stratified split is possible: at least two rows overall and at least two
# rows in every class.
if (
    X is not None
    and Y is not None
    and X.shape[0] >= 2
    and (pd.Series(Y).value_counts() >= 2).all()
):
    # 80/20 split, stratified on the label, with a fixed seed.
    split_parts = train_test_split(
        X, Y,
        test_size=0.2,
        stratify=Y,
        random_state=42
    )
    X_train, X_test, Y_train, Y_test = split_parts

    n_train_rows = X_train.shape[0]
    n_test_rows = X_test.shape[0]
    print(f"\nTraining samples after split: {n_train_rows}")
    print(f"Test samples after split: {n_test_rows}")


Training samples after split: 4949
Test samples after split: 1238


In [None]:
# Cast both label arrays to int and flatten them to 1-D for scikit-learn.
Y_train_flat, Y_test_flat = (
    labels.astype(int).ravel() for labels in (Y_train, Y_test)
)

In [None]:
# Rank vocabulary terms by the absolute value of the model's first
# coefficient row and list the strongest ones.
# NOTE(review): `lr_model` is not assigned in any cell visible in this
# notebook - confirm the training cell exists and runs before this one.
can_rank_features = hasattr(lr_model, 'coef_') and len(feature_names) > 0

if not can_rank_features:
    print("\nCould not determine feature importance (model fitting may have failed or no features).")
else:
    feature_importance = np.abs(lr_model.coef_[0])
    # Never request more entries than the vocabulary holds.
    num_features_to_show = min(len(feature_names), 20)
    # argsort is ascending: take the tail, then flip it to strongest-first.
    strongest_first = np.argsort(feature_importance)[-num_features_to_show:][::-1]

    print(f"\nTop {num_features_to_show} most important words for classification:")
    for idx in strongest_first:
        print(f"- {feature_names[idx]}: {feature_importance[idx]:.4f}")


Top 20 most important words for classification:
- october: 4.5931
- said: 3.5286
- november: 3.0800
- article: 2.9133
- republican: 2.9121
- hillary: 2.8543
- election: 2.8442
- share: 2.7174
- conservative: 2.6942
- president: 2.3988
- source: 2.1653
- obama: 2.1272
- cruz: 2.0992
- email: 2.0659
- sen: 2.0422
- debate: 2.0290
- say: 2.0154
- via: 2.0147
- attack: 1.9658
- russia: 1.9442


In [None]:
# Score the fitted model on the held-out test split.
Y_pred = lr_model.predict(X_test)

print("\nModel Evaluation:")
test_accuracy = accuracy_score(Y_test_flat, Y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

print("\nClassification Report:")
per_class_report = classification_report(Y_test_flat, Y_pred)
print(per_class_report)


Model Evaluation:
Accuracy: 0.9402

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       617
           1       0.93      0.95      0.94       621

    accuracy                           0.94      1238
   macro avg       0.94      0.94      0.94      1238
weighted avg       0.94      0.94      0.94      1238



IndentationError: unindent does not match any outer indentation level (<tokenize>, line 9)

In [None]:
# Classify a piece of news text typed in by the user.
user_news = input("Enter the news text to check: ")

# Apply the same cleaning that was used on the training documents.
processed_user_news = preprocess_text(user_news)

if 'vectorizer' in locals() and vectorizer is not None:
    user_news_vector = vectorizer.transform([processed_user_news])

    if 'lr_model' in locals() and lr_model is not None:
        if user_news_vector.nnz == 0:
            # Nothing survived preprocessing, or none of the remaining terms
            # is in the fitted vocabulary. The feature vector is all zeros,
            # so a prediction would not reflect the text at all; refuse
            # instead of printing a misleading REAL/FAKE verdict.
            print("Unable to classify: the text contains no terms known to the model "
                  "after preprocessing. Please enter a longer news text.")
        else:
            prediction = lr_model.predict(user_news_vector)

            # Label encoding used throughout the notebook: 0 = REAL, 1 = FAKE.
            predicted_label = "REAL" if prediction[0] == 0 else "FAKE"
            print(f"The news article is predicted to be: {predicted_label}")
    else:
        print("Error: Model not trained. Please run the training cells.")
else:
    print("Error: TF-IDF vectorizer not fitted. Please run the vectorization cell.")

Enter the news text to check: breaking news,bd's ex pm hasina died
The news article is predicted to be: FAKE
